From 51384fd6917dda0f624b14b3deb09b20400436b6 Mon Sep 17 00:00:00 2001 From: comunodi Date: Wed, 16 Jan 2019 01:08:56 +0300 Subject: [PATCH 001/309] Add basic functionality for dict --- ci/jobs/quick-build/run.sh | 2 +- cmake/find_poco.cmake | 15 +- dbms/src/Common/config.h.in | 1 + dbms/src/Dictionaries/CMakeLists.txt | 4 + .../Dictionaries/RedisBlockInputStream.cpp | 252 ++++++++++++++++++ dbms/src/Dictionaries/RedisBlockInputStream.h | 44 +++ .../Dictionaries/RedisDictionarySource.cpp | 194 ++++++++++++++ dbms/src/Dictionaries/RedisDictionarySource.h | 84 ++++++ 8 files changed, 592 insertions(+), 4 deletions(-) create mode 100644 dbms/src/Dictionaries/RedisBlockInputStream.cpp create mode 100644 dbms/src/Dictionaries/RedisBlockInputStream.h create mode 100644 dbms/src/Dictionaries/RedisDictionarySource.cpp create mode 100644 dbms/src/Dictionaries/RedisDictionarySource.h diff --git a/ci/jobs/quick-build/run.sh b/ci/jobs/quick-build/run.sh index 6a948c560ee..eb31802e79b 100755 --- a/ci/jobs/quick-build/run.sh +++ b/ci/jobs/quick-build/run.sh @@ -21,7 +21,7 @@ BUILD_TARGETS=clickhouse BUILD_TYPE=Debug ENABLE_EMBEDDED_COMPILER=0 -CMAKE_FLAGS="-D CMAKE_C_FLAGS_ADD=-g0 -D CMAKE_CXX_FLAGS_ADD=-g0 -D ENABLE_JEMALLOC=0 -D ENABLE_CAPNP=0 -D ENABLE_RDKAFKA=0 -D ENABLE_UNWIND=0 -D ENABLE_ICU=0 -D ENABLE_POCO_MONGODB=0 -D ENABLE_POCO_NETSSL=0 -D ENABLE_POCO_ODBC=0 -D ENABLE_ODBC=0 -D ENABLE_MYSQL=0" +CMAKE_FLAGS="-D CMAKE_C_FLAGS_ADD=-g0 -D CMAKE_CXX_FLAGS_ADD=-g0 -D ENABLE_JEMALLOC=0 -D ENABLE_CAPNP=0 -D ENABLE_RDKAFKA=0 -D ENABLE_UNWIND=0 -D ENABLE_ICU=0 -D ENABLE_POCO_MONGODB=0 -D ENABLE_POCO_REDIS=0 -D ENABLE_POCO_NETSSL=0 -D ENABLE_POCO_ODBC=0 -D ENABLE_ODBC=0 -D ENABLE_MYSQL=0" [[ $(uname) == "FreeBSD" ]] && COMPILER_PACKAGE_VERSION=devel && export COMPILER_PATH=/usr/local/bin diff --git a/cmake/find_poco.cmake b/cmake/find_poco.cmake index 012f269d48d..4c9cb16e729 100644 --- a/cmake/find_poco.cmake +++ b/cmake/find_poco.cmake @@ -15,6 +15,9 @@ endif () if (NOT DEFINED ENABLE_POCO_MONGODB OR ENABLE_POCO_MONGODB) list (APPEND POCO_COMPONENTS MongoDB) endif () +if (NOT DEFINED ENABLE_POCO_REDIS OR ENABLE_POCO_REDIS) + list (APPEND POCO_COMPONENTS Redis) +endif () # TODO: after new poco release with SQL library rename ENABLE_POCO_ODBC -> ENABLE_POCO_SQLODBC if (NOT DEFINED ENABLE_POCO_ODBC OR ENABLE_POCO_ODBC) list (APPEND POCO_COMPONENTS DataODBC) @@ -32,7 +35,6 @@ elseif (NOT MISSING_INTERNAL_POCO_LIBRARY) set (ENABLE_ZIP 0 CACHE BOOL "") set (ENABLE_PAGECOMPILER 0 CACHE BOOL "") set (ENABLE_PAGECOMPILER_FILE2PAGE 0 CACHE BOOL "") - set (ENABLE_REDIS 0 CACHE BOOL "") set (ENABLE_DATA_SQLITE 0 CACHE BOOL "") set (ENABLE_DATA_MYSQL 0 CACHE BOOL "") set (ENABLE_DATA_POSTGRESQL 0 CACHE BOOL "") @@ -40,7 +42,6 @@ elseif (NOT MISSING_INTERNAL_POCO_LIBRARY) set (POCO_ENABLE_ZIP 0 CACHE BOOL "") set (POCO_ENABLE_PAGECOMPILER 0 CACHE BOOL "") set (POCO_ENABLE_PAGECOMPILER_FILE2PAGE 0 CACHE BOOL "") - set (POCO_ENABLE_REDIS 0 CACHE BOOL "") set (POCO_ENABLE_SQL_SQLITE 0 CACHE BOOL "") set (POCO_ENABLE_SQL_MYSQL 0 CACHE BOOL "") set (POCO_ENABLE_SQL_POSTGRESQL 0 CACHE BOOL "") @@ -63,6 +64,11 @@ elseif (NOT MISSING_INTERNAL_POCO_LIBRARY) set (Poco_MongoDB_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/poco/MongoDB/include/") endif () + if (NOT DEFINED ENABLE_POCO_REDIS OR ENABLE_POCO_REDIS) + set (Poco_Redis_LIBRARY PocoRedis) + set (Poco_Redis_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/poco/Redis/include/") + endif () + if (EXISTS "${ClickHouse_SOURCE_DIR}/contrib/poco/SQL/ODBC/include/") set 
(Poco_SQL_FOUND 1) set (Poco_SQL_LIBRARY PocoSQL) @@ -116,6 +122,9 @@ endif () if (Poco_MongoDB_LIBRARY) set (USE_POCO_MONGODB 1) endif () +if (Poco_Redis_LIBRARY) + set (USE_POCO_REDIS 1) +endif () if (Poco_DataODBC_LIBRARY AND ODBC_FOUND) set (USE_POCO_DATAODBC 1) endif () @@ -123,7 +132,7 @@ if (Poco_SQLODBC_LIBRARY AND ODBC_FOUND) set (USE_POCO_SQLODBC 1) endif () -message(STATUS "Using Poco: ${Poco_INCLUDE_DIRS} : ${Poco_Foundation_LIBRARY},${Poco_Util_LIBRARY},${Poco_Net_LIBRARY},${Poco_NetSSL_LIBRARY},${Poco_Crypto_LIBRARY},${Poco_XML_LIBRARY},${Poco_Data_LIBRARY},${Poco_DataODBC_LIBRARY},${Poco_SQL_LIBRARY},${Poco_SQLODBC_LIBRARY},${Poco_MongoDB_LIBRARY}; MongoDB=${USE_POCO_MONGODB}, DataODBC=${USE_POCO_DATAODBC}, NetSSL=${USE_POCO_NETSSL}") +message(STATUS "Using Poco: ${Poco_INCLUDE_DIRS} : ${Poco_Foundation_LIBRARY},${Poco_Util_LIBRARY},${Poco_Net_LIBRARY},${Poco_NetSSL_LIBRARY},${Poco_Crypto_LIBRARY},${Poco_XML_LIBRARY},${Poco_Data_LIBRARY},${Poco_DataODBC_LIBRARY},${Poco_SQL_LIBRARY},${Poco_SQLODBC_LIBRARY},${Poco_MongoDB_LIBRARY}; MongoDB=${USE_POCO_MONGODB}, Redis=${USE_POCO_REDIS}, DataODBC=${USE_POCO_DATAODBC}, NetSSL=${USE_POCO_NETSSL}") # How to make sutable poco: # use branch: diff --git a/dbms/src/Common/config.h.in b/dbms/src/Common/config.h.in index 09c2eadde29..d3a61037119 100644 --- a/dbms/src/Common/config.h.in +++ b/dbms/src/Common/config.h.in @@ -12,6 +12,7 @@ #cmakedefine01 USE_POCO_SQLODBC #cmakedefine01 USE_POCO_DATAODBC #cmakedefine01 USE_POCO_MONGODB +#cmakedefine01 USE_POCO_REDIS #cmakedefine01 USE_POCO_NETSSL #cmakedefine01 USE_BASE64 #cmakedefine01 USE_HDFS diff --git a/dbms/src/Dictionaries/CMakeLists.txt b/dbms/src/Dictionaries/CMakeLists.txt index d7f85a5c7eb..2e8219f2170 100644 --- a/dbms/src/Dictionaries/CMakeLists.txt +++ b/dbms/src/Dictionaries/CMakeLists.txt @@ -36,4 +36,8 @@ if(USE_POCO_MONGODB) target_link_libraries(clickhouse_dictionaries PRIVATE ${Poco_MongoDB_LIBRARY}) endif() +if(USE_POCO_REDIS) + target_link_libraries(clickhouse_dictionaries PRIVATE ${Poco_Redis_LIBRARY}) +endif() + add_subdirectory(Embedded) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp new file mode 100644 index 00000000000..dfbb03a0034 --- /dev/null +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -0,0 +1,252 @@ +#include +#if USE_POCO_REDIS + +# include +# include +# include + +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include +# include +# include +# include +# include +# include +# include +# include "DictionaryStructure.h" +# include "RedisBlockInputStream.h" + + +namespace DB +{ + namespace ErrorCodes + { + extern const int TYPE_MISMATCH; + } + + + RedisBlockInputStream::RedisBlockInputStream( + std::shared_ptr client_, + const DB::Block & sample_block, + const size_t max_block_size) + : client(client_), max_block_size{max_block_size} + { + description.init(sample_block); + } + + RedisBlockInputStream::~RedisBlockInputStream() = default; + + + namespace + { + using ValueType = ExternalResultDescription::ValueType; + using RedisArray = Poco::Redis::Array; + + template + void insertNumber(IColumn & column, const Poco::Redis::RedisType::Ptr & value, const std::string & name) + { + switch (value->type()) + { + case Poco::Redis::RedisTypeTraits::TypeId: + static_cast &>(column).getData().push_back( + static_cast *>(value.get())->value()); + break; + case Poco::Redis::RedisTypeTraits::TypeId: + static_cast &>(column).getData().push_back( + 
parse(static_cast *>(value.get())->value())); + break; + case Poco::Redis::RedisTypeTraits::TypeId: + { + const auto &bs = + static_cast *>(value.get())->value(); + if (bs.isNull()) + static_cast &>(column).getData().emplace_back(); + else + static_cast &>(column).getData().push_back(parse(bs.value())); + break; + } + default: + throw Exception( + "Type mismatch, expected a number, got type id = " + toString(value->type()) + " for column " + name, + ErrorCodes::TYPE_MISMATCH); + } + } + + void insertValue(IColumn & column, const ValueType type, const Poco::Redis::RedisType::Ptr & value, const std::string & name) + { + auto getStringIfCould = [&value, &name]() + { + switch (value->type()) + { + case Poco::Redis::RedisTypeTraits::TypeId: + { + const auto & bs = static_cast *>(value.get())->value(); + if (bs.isNull()) + throw Exception{"Type mismatch, expected not null String for column " + name, + ErrorCodes::TYPE_MISMATCH}; + return bs.value(); + } + case Poco::Redis::RedisTypeTraits::TypeId: + return static_cast *>(value.get())->value(); + default: + throw Exception{"Type mismatch, expected String, got type id = " + toString(value->type()) + " for column " + name, + ErrorCodes::TYPE_MISMATCH}; + } + }; + switch (type) + { + case ValueType::UInt8: + insertNumber(column, value, name); + break; + case ValueType::UInt16: + insertNumber(column, value, name); + break; + case ValueType::UInt32: + insertNumber(column, value, name); + break; + case ValueType::UInt64: + insertNumber(column, value, name); + break; + case ValueType::Int8: + insertNumber(column, value, name); + break; + case ValueType::Int16: + insertNumber(column, value, name); + break; + case ValueType::Int32: + insertNumber(column, value, name); + break; + case ValueType::Int64: + insertNumber(column, value, name); + break; + case ValueType::Float32: + insertNumber(column, value, name); + break; + case ValueType::Float64: + insertNumber(column, value, name); + break; + + case ValueType::String: + { + String string = getStringIfCould(); + static_cast(column).insertDataWithTerminatingZero(string.data(), string.size() + 1); + break; + } + + case ValueType::Date: + { + if (value->type() != Poco::Redis::RedisTypeTraits::TypeId) + throw Exception{"Type mismatch, expected Int64 (Timestamp), got type id = " + toString(value->type()) + " for column " + name, + ErrorCodes::TYPE_MISMATCH}; + + static_cast(column).getData().push_back(UInt16{DateLUT::instance().toDayNum( + static_cast( + static_cast *>(value.get())->value()).epochTime())}); + break; + } + + case ValueType::DateTime: + { + if (value->type() != Poco::Redis::RedisTypeTraits::TypeId) + throw Exception{"Type mismatch, expected Int64 (Timestamp), got type id = " + toString(value->type()) + " for column " + name, + ErrorCodes::TYPE_MISMATCH}; + + static_cast(column).getData().push_back( + static_cast( + static_cast *>(value.get())->value()).epochTime()); + break; + } + case ValueType::UUID: + { + String string = getStringIfCould(); + static_cast(column).getData().push_back(parse(string)); + break; + } + } + } + + void insertDefaultValue(IColumn & column, const IColumn & sample_column) { column.insertFrom(sample_column, 0); } + } + + + Block RedisBlockInputStream::readImpl() + { + if (all_read) + return {}; + + const size_t size = 2; + assert(size == description.sample_block.columns()); + MutableColumns columns(description.sample_block.columns()); + + for (const auto i : ext::range(0, size)) + columns[i] = description.sample_block.getByPosition(i).column->cloneEmpty(); + + size_t 
num_rows = 0; + while (num_rows < max_block_size) + { + RedisArray commandForKeys; + commandForKeys << "SCAN" << cursor; + + auto replyForKeys = client->execute(commandForKeys); + if (cursor = replyForKeys.get(0); cursor == 0) + { + all_read = true; + break; + } + + auto response = replyForKeys.get(1); + if (response.isNull()) + continue; + + Poco::Redis::Array commandForValues; + commandForValues << "MGET"; + + const auto insertValueByIdx = [this, &columns](size_t idx, const auto & value) + { + const auto & name = description.sample_block.getByPosition(idx).name; + if (description.types[idx].second) + { + ColumnNullable & column_nullable = static_cast(*columns[idx]); + insertValue(column_nullable.getNestedColumn(), description.types[idx].first, value, name); + column_nullable.getNullMapData().emplace_back(0); + } + else + insertValue(*columns[idx], description.types[idx].first, value, name); + }; + + for (const auto & key : response) + { + ++num_rows; + String keyS = static_cast *>(key.get())->value(); + commandForValues << keyS; + insertValueByIdx(0, key); + } + + auto replyForValues = client->execute(commandForValues); + for (const auto & value : replyForValues) + { + if (value.isNull()) + insertDefaultValue(*columns[1], *description.sample_block.getByPosition(1).column); + else + insertValueByIdx(1, value); + } + } + + if (num_rows == 0) + return {}; + + return description.sample_block.cloneWithColumns(std::move(columns)); + } + +} + +#endif diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.h b/dbms/src/Dictionaries/RedisBlockInputStream.h new file mode 100644 index 00000000000..7e32b3ff8ff --- /dev/null +++ b/dbms/src/Dictionaries/RedisBlockInputStream.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include +#include "ExternalResultDescription.h" + + +namespace Poco +{ + namespace Redis + { + class Client; + } +} + + +namespace DB +{ +/// Converts Redis Cursor to a stream of Blocks + class RedisBlockInputStream final : public IProfilingBlockInputStream + { + public: + RedisBlockInputStream( + std::shared_ptr client_, + const Block & sample_block, + const size_t max_block_size); + + ~RedisBlockInputStream() override; + + String getName() const override { return "Redis"; } + + Block getHeader() const override { return description.sample_block.cloneEmpty(); } + + private: + Block readImpl() override; + + std::shared_ptr client; + const size_t max_block_size; + ExternalResultDescription description; + int64_t cursor = 0; + bool all_read = false; + }; + +} diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp new file mode 100644 index 00000000000..1fb5472b48b --- /dev/null +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -0,0 +1,194 @@ +#include "RedisDictionarySource.h" +#include "DictionarySourceFactory.h" +#include "DictionaryStructure.h" + +namespace DB +{ + namespace ErrorCodes + { + extern const int SUPPORT_IS_DISABLED; + } + + void registerDictionarySourceRedis(DictionarySourceFactory & factory) + { + auto createTableSource = [=](const DictionaryStructure & dict_struct, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + Block & sample_block, + const Context & /* context */) -> DictionarySourcePtr { +#if USE_POCO_REDIS + return std::make_unique(dict_struct, config, config_prefix + ".redis", sample_block); +#else + (void)dict_struct; + (void)config; + (void)config_prefix; + (void)sample_block; + throw Exception{"Dictionary source of type `redis` is disabled because poco 
library was built without redis support.", + ErrorCodes::SUPPORT_IS_DISABLED}; +#endif + }; + factory.registerSource("redis", createTableSource); + } + +} + + +#if USE_POCO_REDIS + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include +# include +# include +# include "RedisBlockInputStream.h" + + +namespace DB +{ + namespace ErrorCodes + { + extern const int UNSUPPORTED_METHOD; + } + + + static const size_t max_block_size = 8192; + + + RedisDictionarySource::RedisDictionarySource( + const DictionaryStructure & dict_struct, + const std::string & host, + UInt16 port, + const Block & sample_block) + : dict_struct{dict_struct} + , host{host} + , port{port} + , sample_block{sample_block} + , client{std::make_shared(host, port)} + { + } + + + RedisDictionarySource::RedisDictionarySource( + const DictionaryStructure & dict_struct, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + Block & sample_block) + : RedisDictionarySource( + dict_struct, + config.getString(config_prefix + ".host"), + config.getUInt(config_prefix + ".port"), + sample_block) + { + } + + + RedisDictionarySource::RedisDictionarySource(const RedisDictionarySource & other) + : RedisDictionarySource{other.dict_struct, + other.host, + other.port, + other.sample_block} + { + } + + + RedisDictionarySource::~RedisDictionarySource() = default; + + + BlockInputStreamPtr RedisDictionarySource::loadAll() + { + return std::make_shared(client, sample_block, max_block_size); + } + +/* + BlockInputStreamPtr RedisDictionarySource::loadIds(const std::vector & ids) + { + if (!dict_struct.id) + throw Exception{"'id' is required for selective loading", ErrorCodes::UNSUPPORTED_METHOD}; + + Poco::Redis::Array ids_array(new Poco::Redis::Array); + for (const UInt64 id : ids) + ids_array->add(DB::toString(id), Int32(id)); + + cursor->query().selector().addNewDocument(dict_struct.id->name).add("$in", ids_array); + + return std::make_shared(connection, sample_block, max_block_size); + } + + + BlockInputStreamPtr RedisDictionarySource::loadKeys(const Columns & key_columns, const std::vector & requested_rows) + { + if (!dict_struct.key) + throw Exception{"'key' is required for selective loading", ErrorCodes::UNSUPPORTED_METHOD}; + + Poco::Redis::Array::Ptr keys_array(new Poco::Redis::Array); + + for (const auto row_idx : requested_rows) + { + auto & key = keys_array->addNewDocument(DB::toString(row_idx)); + + for (const auto attr : ext::enumerate(*dict_struct.key)) + { + switch (attr.second.underlying_type) + { + case AttributeUnderlyingType::UInt8: + case AttributeUnderlyingType::UInt16: + case AttributeUnderlyingType::UInt32: + case AttributeUnderlyingType::UInt64: + case AttributeUnderlyingType::UInt128: + case AttributeUnderlyingType::Int8: + case AttributeUnderlyingType::Int16: + case AttributeUnderlyingType::Int32: + case AttributeUnderlyingType::Int64: + case AttributeUnderlyingType::Decimal32: + case AttributeUnderlyingType::Decimal64: + case AttributeUnderlyingType::Decimal128: + key.add(attr.second.name, Int32(key_columns[attr.first]->get64(row_idx))); + break; + + case AttributeUnderlyingType::Float32: + case AttributeUnderlyingType::Float64: + key.add(attr.second.name, applyVisitor(FieldVisitorConvertToNumber(), (*key_columns[attr.first])[row_idx])); + break; + + case AttributeUnderlyingType::String: + String _str(get((*key_columns[attr.first])[row_idx])); + /// Convert string to ObjectID + if (attr.second.is_object_id) + { + 
Poco::Redis::ObjectId::Ptr _id(new Poco::Redis::ObjectId(_str)); + key.add(attr.second.name, _id); + } + else + { + key.add(attr.second.name, _str); + } + break; + } + } + } + + /// If more than one key we should use $or + cursor->query().selector().add("$or", keys_array); + + return std::make_shared(connection, sample_block, max_block_size); + } +*/ + + std::string RedisDictionarySource::toString() const + { + return "Redis: " + host + ':' + DB::toString(port); + } + +} + +#endif diff --git a/dbms/src/Dictionaries/RedisDictionarySource.h b/dbms/src/Dictionaries/RedisDictionarySource.h new file mode 100644 index 00000000000..61417fac393 --- /dev/null +++ b/dbms/src/Dictionaries/RedisDictionarySource.h @@ -0,0 +1,84 @@ +#pragma once + +#include +#if USE_POCO_REDIS + +# include "DictionaryStructure.h" +# include "IDictionarySource.h" + +namespace Poco +{ + namespace Util + { + class AbstractConfiguration; + } + + namespace Redis + { + class Client; + } +} + + +namespace DB +{ +/// Allows loading dictionaries from a Redis collection + class RedisDictionarySource final : public IDictionarySource + { + RedisDictionarySource( + const DictionaryStructure & dict_struct, + const std::string & host, + UInt16 port, + const Block & sample_block); + + public: + RedisDictionarySource( + const DictionaryStructure & dict_struct, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + Block & sample_block); + + RedisDictionarySource(const RedisDictionarySource & other); + + ~RedisDictionarySource() override; + + BlockInputStreamPtr loadAll() override; + + BlockInputStreamPtr loadUpdatedAll() override + { + throw Exception{"Method loadUpdatedAll is unsupported for RedisDictionarySource", ErrorCodes::NOT_IMPLEMENTED}; + } + + bool supportsSelectiveLoad() const override { return true; } + + BlockInputStreamPtr loadIds(const std::vector & /* ids */) override {throw 1;}; + + BlockInputStreamPtr loadKeys(const Columns & /* key_columns */, const std::vector & /* requested_rows */) override {throw 1;}; + + /// @todo: for Redis, modification date can somehow be determined from the `_id` object field + bool isModified() const override { return true; } + + ///Not yet supported + bool hasUpdateField() const override { return false; } + + DictionarySourcePtr clone() const override { return std::make_unique(*this); } + + std::string toString() const override; + + private: + const DictionaryStructure dict_struct; + const std::string host; + const UInt16 port; + Block sample_block; + + std::shared_ptr client; + }; + +} +#endif + +/*namespace DB +{ +class DictionarySourceFactory; +void registerDictionarySourceRedis(DictionarySourceFactory & factory); +}*/ From 8472b26f07778c516000a80e90ffabf945743595 Mon Sep 17 00:00:00 2001 From: comunodi Date: Wed, 16 Jan 2019 04:05:40 +0300 Subject: [PATCH 002/309] Fix code highlighting --- cmake/find_poco.cmake | 2 +- dbms/src/Dictionaries/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/find_poco.cmake b/cmake/find_poco.cmake index 4c9cb16e729..ade020141bb 100644 --- a/cmake/find_poco.cmake +++ b/cmake/find_poco.cmake @@ -132,7 +132,7 @@ if (Poco_SQLODBC_LIBRARY AND ODBC_FOUND) set (USE_POCO_SQLODBC 1) endif () -message(STATUS "Using Poco: ${Poco_INCLUDE_DIRS} : ${Poco_Foundation_LIBRARY},${Poco_Util_LIBRARY},${Poco_Net_LIBRARY},${Poco_NetSSL_LIBRARY},${Poco_Crypto_LIBRARY},${Poco_XML_LIBRARY},${Poco_Data_LIBRARY},${Poco_DataODBC_LIBRARY},${Poco_SQL_LIBRARY},${Poco_SQLODBC_LIBRARY},${Poco_MongoDB_LIBRARY}; 
MongoDB=${USE_POCO_MONGODB}, Redis=${USE_POCO_REDIS}, DataODBC=${USE_POCO_DATAODBC}, NetSSL=${USE_POCO_NETSSL}") +message(STATUS "Using Poco: ${Poco_INCLUDE_DIRS} : ${Poco_Foundation_LIBRARY},${Poco_Util_LIBRARY},${Poco_Net_LIBRARY},${Poco_NetSSL_LIBRARY},${Poco_Crypto_LIBRARY},${Poco_XML_LIBRARY},${Poco_Data_LIBRARY},${Poco_DataODBC_LIBRARY},${Poco_SQL_LIBRARY},${Poco_SQLODBC_LIBRARY},${Poco_MongoDB_LIBRARY},${Poco_Redis_INCLUDE_DIR}; MongoDB=${USE_POCO_MONGODB}, Redis=${USE_POCO_REDIS}, DataODBC=${USE_POCO_DATAODBC}, NetSSL=${USE_POCO_NETSSL}") # How to make sutable poco: # use branch: diff --git a/dbms/src/Dictionaries/CMakeLists.txt b/dbms/src/Dictionaries/CMakeLists.txt index 2e8219f2170..de6bdd6b915 100644 --- a/dbms/src/Dictionaries/CMakeLists.txt +++ b/dbms/src/Dictionaries/CMakeLists.txt @@ -37,7 +37,7 @@ if(USE_POCO_MONGODB) endif() if(USE_POCO_REDIS) - target_link_libraries(clickhouse_dictionaries PRIVATE ${Poco_Redis_LIBRARY}) + target_include_directories(clickhouse_dictionaries SYSTEM PRIVATE ${Poco_Redis_INCLUDE_DIR}) endif() add_subdirectory(Embedded) From 741f630141f714ad39b39d769dacfd1a1daa884b Mon Sep 17 00:00:00 2001 From: comunodi Date: Sun, 27 Jan 2019 16:14:02 +0300 Subject: [PATCH 003/309] Support loadIds --- .../Dictionaries/RedisBlockInputStream.cpp | 71 +++++------ dbms/src/Dictionaries/RedisBlockInputStream.h | 7 +- .../Dictionaries/RedisDictionarySource.cpp | 113 ++++++++---------- dbms/src/Dictionaries/RedisDictionarySource.h | 9 +- 4 files changed, 87 insertions(+), 113 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index dfbb03a0034..a7d0b27bd09 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -35,10 +35,10 @@ namespace DB RedisBlockInputStream::RedisBlockInputStream( - std::shared_ptr client_, + const Poco::Redis::Array & reply_array_, const DB::Block & sample_block, const size_t max_block_size) - : client(client_), max_block_size{max_block_size} + : reply_array(reply_array_), max_block_size{max_block_size} { description.init(sample_block); } @@ -190,55 +190,42 @@ namespace DB for (const auto i : ext::range(0, size)) columns[i] = description.sample_block.getByPosition(i).column->cloneEmpty(); + const auto insertValueByIdx = [this, &columns](size_t idx, const auto & value) + { + const auto & name = description.sample_block.getByPosition(idx).name; + if (description.types[idx].second) + { + ColumnNullable & column_nullable = static_cast(*columns[idx]); + insertValue(column_nullable.getNestedColumn(), description.types[idx].first, value, name); + column_nullable.getNullMapData().emplace_back(0); + } + else + insertValue(*columns[idx], description.types[idx].first, value, name); + }; + size_t num_rows = 0; + + const auto & keys = reply_array.get(0); + const auto & values = reply_array.get(1); + while (num_rows < max_block_size) { - RedisArray commandForKeys; - commandForKeys << "SCAN" << cursor; - - auto replyForKeys = client->execute(commandForKeys); - if (cursor = replyForKeys.get(0); cursor == 0) - { + if (cursor == keys.size()) { all_read = true; break; } - auto response = replyForKeys.get(1); - if (response.isNull()) - continue; + ++num_rows; + ++cursor; - Poco::Redis::Array commandForValues; - commandForValues << "MGET"; + const auto & key = *(keys.begin() + cursor); + insertValueByIdx(0, key); - const auto insertValueByIdx = [this, &columns](size_t idx, const auto & value) - { - const auto & name = 
description.sample_block.getByPosition(idx).name; - if (description.types[idx].second) - { - ColumnNullable & column_nullable = static_cast(*columns[idx]); - insertValue(column_nullable.getNestedColumn(), description.types[idx].first, value, name); - column_nullable.getNullMapData().emplace_back(0); - } - else - insertValue(*columns[idx], description.types[idx].first, value, name); - }; - - for (const auto & key : response) - { - ++num_rows; - String keyS = static_cast *>(key.get())->value(); - commandForValues << keyS; - insertValueByIdx(0, key); - } - - auto replyForValues = client->execute(commandForValues); - for (const auto & value : replyForValues) - { - if (value.isNull()) - insertDefaultValue(*columns[1], *description.sample_block.getByPosition(1).column); - else - insertValueByIdx(1, value); - } + const auto & value = *(values.begin() + cursor); + if (value.isNull()) + insertDefaultValue(*columns[1], *description.sample_block.getByPosition(1).column); + else + insertValueByIdx(1, value); } if (num_rows == 0) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.h b/dbms/src/Dictionaries/RedisBlockInputStream.h index 7e32b3ff8ff..1884ce7a0f6 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.h +++ b/dbms/src/Dictionaries/RedisBlockInputStream.h @@ -9,6 +9,7 @@ namespace Poco { namespace Redis { + class Array; class Client; } } @@ -21,7 +22,7 @@ namespace DB { public: RedisBlockInputStream( - std::shared_ptr client_, + const Poco::Redis::Array & reply_array_, const Block & sample_block, const size_t max_block_size); @@ -34,10 +35,10 @@ namespace DB private: Block readImpl() override; - std::shared_ptr client; + Poco::Redis::Array reply_array; const size_t max_block_size; ExternalResultDescription description; - int64_t cursor = 0; + size_t cursor = 0; bool all_read = false; }; diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index 1fb5472b48b..90229c087dd 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -53,6 +53,17 @@ namespace DB # include "RedisBlockInputStream.h" +namespace +{ + template + Poco::Redis::Array makeResult(const K & keys, const V & values) { + Poco::Redis::Array result; + result << keys << values; + return result; + } +} + + namespace DB { namespace ErrorCodes @@ -106,83 +117,57 @@ namespace DB BlockInputStreamPtr RedisDictionarySource::loadAll() { - return std::make_shared(client, sample_block, max_block_size); + Int64 cursor = 0; + Poco::Redis::Array keys; + + do + { + Poco::Redis::Array commandForKeys; + commandForKeys << "SCAN" << cursor << "COUNT 1000"; + + Poco::Redis::Array replyForKeys = client->execute(commandForKeys); + cursor = replyForKeys.get(0); + + Poco::Redis::Array response = replyForKeys.get(1); + if (response.isNull()) + continue; + + for (const Poco::Redis::RedisType::Ptr & key : response) + keys.addRedisType(key); + } + while (cursor != 0); + + Poco::Redis::Array commandForValues; + commandForValues << "MGET"; + for (const Poco::Redis::RedisType::Ptr & key : keys) + commandForValues.addRedisType(key); + + Poco::Redis::Array values = client->execute(commandForValues); + + return std::make_shared(makeResult(keys, values), sample_block, max_block_size); } -/* + BlockInputStreamPtr RedisDictionarySource::loadIds(const std::vector & ids) { if (!dict_struct.id) throw Exception{"'id' is required for selective loading", ErrorCodes::UNSUPPORTED_METHOD}; - Poco::Redis::Array ids_array(new Poco::Redis::Array); + 
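// --- Editorial sketch, not part of this patch ---------------------------------------------
// The ids path is easiest to read as the Redis round trip it produces. Assuming the ids are
// stored under their decimal string representation, and reusing the `client` created in the
// constructor above, loadIds({1, 2, 3}) boils down to a single batched lookup:
//
Poco::Redis::Array cmd;
cmd << "MGET" << "1" << "2" << "3";
Poco::Redis::Array values = client->execute<Poco::Redis::Array>(cmd);
//
// MGET is expected to return one bulk string per requested key, in request order, with Null
// for missing keys; RedisBlockInputStream turns those Nulls into default column values.
// -------------------------------------------------------------------------------------------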
Poco::Redis::Array keys; + Poco::Redis::Array command; + command << "MGET"; + for (const UInt64 id : ids) - ids_array->add(DB::toString(id), Int32(id)); - - cursor->query().selector().addNewDocument(dict_struct.id->name).add("$in", ids_array); - - return std::make_shared(connection, sample_block, max_block_size); - } - - - BlockInputStreamPtr RedisDictionarySource::loadKeys(const Columns & key_columns, const std::vector & requested_rows) - { - if (!dict_struct.key) - throw Exception{"'key' is required for selective loading", ErrorCodes::UNSUPPORTED_METHOD}; - - Poco::Redis::Array::Ptr keys_array(new Poco::Redis::Array); - - for (const auto row_idx : requested_rows) { - auto & key = keys_array->addNewDocument(DB::toString(row_idx)); - - for (const auto attr : ext::enumerate(*dict_struct.key)) - { - switch (attr.second.underlying_type) - { - case AttributeUnderlyingType::UInt8: - case AttributeUnderlyingType::UInt16: - case AttributeUnderlyingType::UInt32: - case AttributeUnderlyingType::UInt64: - case AttributeUnderlyingType::UInt128: - case AttributeUnderlyingType::Int8: - case AttributeUnderlyingType::Int16: - case AttributeUnderlyingType::Int32: - case AttributeUnderlyingType::Int64: - case AttributeUnderlyingType::Decimal32: - case AttributeUnderlyingType::Decimal64: - case AttributeUnderlyingType::Decimal128: - key.add(attr.second.name, Int32(key_columns[attr.first]->get64(row_idx))); - break; - - case AttributeUnderlyingType::Float32: - case AttributeUnderlyingType::Float64: - key.add(attr.second.name, applyVisitor(FieldVisitorConvertToNumber(), (*key_columns[attr.first])[row_idx])); - break; - - case AttributeUnderlyingType::String: - String _str(get((*key_columns[attr.first])[row_idx])); - /// Convert string to ObjectID - if (attr.second.is_object_id) - { - Poco::Redis::ObjectId::Ptr _id(new Poco::Redis::ObjectId(_str)); - key.add(attr.second.name, _id); - } - else - { - key.add(attr.second.name, _str); - } - break; - } - } + keys << static_cast(id); + command << static_cast(id); } - /// If more than one key we should use $or - cursor->query().selector().add("$or", keys_array); + Poco::Redis::Array values = client->execute(command); - return std::make_shared(connection, sample_block, max_block_size); + return std::make_shared(makeResult(keys, values), sample_block, max_block_size); } -*/ + std::string RedisDictionarySource::toString() const { diff --git a/dbms/src/Dictionaries/RedisDictionarySource.h b/dbms/src/Dictionaries/RedisDictionarySource.h index 61417fac393..e3566731f06 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.h +++ b/dbms/src/Dictionaries/RedisDictionarySource.h @@ -51,14 +51,15 @@ namespace DB bool supportsSelectiveLoad() const override { return true; } - BlockInputStreamPtr loadIds(const std::vector & /* ids */) override {throw 1;}; + BlockInputStreamPtr loadIds(const std::vector & ids) override; - BlockInputStreamPtr loadKeys(const Columns & /* key_columns */, const std::vector & /* requested_rows */) override {throw 1;}; + BlockInputStreamPtr loadKeys(const Columns & /* key_columns */, const std::vector & /* requested_rows */) override + { + throw Exception{"Method loadKeys is unsupported for RedisDictionarySource", ErrorCodes::NOT_IMPLEMENTED}; + }; - /// @todo: for Redis, modification date can somehow be determined from the `_id` object field bool isModified() const override { return true; } - ///Not yet supported bool hasUpdateField() const override { return false; } DictionarySourcePtr clone() const override { return std::make_unique(*this); } From 
b455708eab688c695853eeda694d1d55de74a8d2 Mon Sep 17 00:00:00 2001 From: comunodi Date: Sun, 27 Jan 2019 18:30:51 +0300 Subject: [PATCH 004/309] Use batch query for reading keys --- .../Dictionaries/RedisDictionarySource.cpp | 23 ++++--------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index 90229c087dd..b3ec940a8d1 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -117,25 +117,10 @@ namespace DB BlockInputStreamPtr RedisDictionarySource::loadAll() { - Int64 cursor = 0; - Poco::Redis::Array keys; - - do - { - Poco::Redis::Array commandForKeys; - commandForKeys << "SCAN" << cursor << "COUNT 1000"; - - Poco::Redis::Array replyForKeys = client->execute(commandForKeys); - cursor = replyForKeys.get(0); - - Poco::Redis::Array response = replyForKeys.get(1); - if (response.isNull()) - continue; - - for (const Poco::Redis::RedisType::Ptr & key : response) - keys.addRedisType(key); - } - while (cursor != 0); + Poco::Redis::Array commandForKeys; + commandForKeys << "KEYS" << "*"; + + Poco::Redis::Array keys = client->execute(commandForKeys); Poco::Redis::Array commandForValues; commandForValues << "MGET"; From 933906403ac3ecf98ba32b7d4f453380a0a8878a Mon Sep 17 00:00:00 2001 From: comunodi Date: Mon, 28 Jan 2019 01:22:18 +0300 Subject: [PATCH 005/309] Optimize memory consumption --- .../Dictionaries/RedisBlockInputStream.cpp | 22 ++++++++------ dbms/src/Dictionaries/RedisBlockInputStream.h | 10 ++++--- .../Dictionaries/RedisDictionarySource.cpp | 29 ++----------------- 3 files changed, 21 insertions(+), 40 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index a7d0b27bd09..32d9abc71a8 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -35,10 +35,11 @@ namespace DB RedisBlockInputStream::RedisBlockInputStream( - const Poco::Redis::Array & reply_array_, + const std::shared_ptr & client_, + const Poco::Redis::Array & keys_, const DB::Block & sample_block, const size_t max_block_size) - : reply_array(reply_array_), max_block_size{max_block_size} + : client(client_), keys(keys_), max_block_size{max_block_size} { description.init(sample_block); } @@ -102,6 +103,7 @@ namespace DB ErrorCodes::TYPE_MISMATCH}; } }; + switch (type) { case ValueType::UInt8: @@ -204,9 +206,7 @@ namespace DB }; size_t num_rows = 0; - - const auto & keys = reply_array.get(0); - const auto & values = reply_array.get(1); + Poco::Redis::Command commandForValues("MGET"); while (num_rows < max_block_size) { @@ -220,17 +220,21 @@ namespace DB const auto & key = *(keys.begin() + cursor); insertValueByIdx(0, key); + commandForValues.addRedisType(key); + } - const auto & value = *(values.begin() + cursor); + if (num_rows == 0) + return {}; + + Poco::Redis::Array values = client->execute(commandForValues); + for (size_t i = 0; i < num_rows; ++i) { + const Poco::Redis::RedisType::Ptr & value = *(values.begin() + i); if (value.isNull()) insertDefaultValue(*columns[1], *description.sample_block.getByPosition(1).column); else insertValueByIdx(1, value); } - if (num_rows == 0) - return {}; - return description.sample_block.cloneWithColumns(std::move(columns)); } diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.h b/dbms/src/Dictionaries/RedisBlockInputStream.h index 1884ce7a0f6..d1c3ad157e9 100644 --- 
a/dbms/src/Dictionaries/RedisBlockInputStream.h +++ b/dbms/src/Dictionaries/RedisBlockInputStream.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include "ExternalResultDescription.h" @@ -18,11 +18,12 @@ namespace Poco namespace DB { /// Converts Redis Cursor to a stream of Blocks - class RedisBlockInputStream final : public IProfilingBlockInputStream + class RedisBlockInputStream final : public IBlockInputStream { public: RedisBlockInputStream( - const Poco::Redis::Array & reply_array_, + const std::shared_ptr & client_, + const Poco::Redis::Array & keys_, const Block & sample_block, const size_t max_block_size); @@ -35,7 +36,8 @@ namespace DB private: Block readImpl() override; - Poco::Redis::Array reply_array; + std::shared_ptr client; + Poco::Redis::Array keys; const size_t max_block_size; ExternalResultDescription description; size_t cursor = 0; diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index b3ec940a8d1..b4c1ac97330 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -53,17 +53,6 @@ namespace DB # include "RedisBlockInputStream.h" -namespace -{ - template - Poco::Redis::Array makeResult(const K & keys, const V & values) { - Poco::Redis::Array result; - result << keys << values; - return result; - } -} - - namespace DB { namespace ErrorCodes @@ -122,14 +111,7 @@ namespace DB Poco::Redis::Array keys = client->execute(commandForKeys); - Poco::Redis::Array commandForValues; - commandForValues << "MGET"; - for (const Poco::Redis::RedisType::Ptr & key : keys) - commandForValues.addRedisType(key); - - Poco::Redis::Array values = client->execute(commandForValues); - - return std::make_shared(makeResult(keys, values), sample_block, max_block_size); + return std::make_shared(client, std::move(keys), sample_block, max_block_size); } @@ -139,18 +121,11 @@ namespace DB throw Exception{"'id' is required for selective loading", ErrorCodes::UNSUPPORTED_METHOD}; Poco::Redis::Array keys; - Poco::Redis::Array command; - command << "MGET"; for (const UInt64 id : ids) - { keys << static_cast(id); - command << static_cast(id); - } - Poco::Redis::Array values = client->execute(command); - - return std::make_shared(makeResult(keys, values), sample_block, max_block_size); + return std::make_shared(client, std::move(keys), sample_block, max_block_size); } From 6e28c22876578ac2b375d999d695bb1382b76e7e Mon Sep 17 00:00:00 2001 From: comunodi Date: Tue, 12 Feb 2019 12:23:22 +0300 Subject: [PATCH 006/309] Add tests --- cmake/find_poco.cmake | 2 +- dbms/src/Dictionaries/CMakeLists.txt | 6 +- .../src/Dictionaries/registerDictionaries.cpp | 2 + .../generate_and_test.py | 76 +++++++++++++++++++ dbms/tests/external_dictionaries/run.sh | 29 +++++++ .../dicts/external_dicts_dict_sources.md | 20 +++++ 6 files changed, 133 insertions(+), 2 deletions(-) diff --git a/cmake/find_poco.cmake b/cmake/find_poco.cmake index ade020141bb..4583fafac54 100644 --- a/cmake/find_poco.cmake +++ b/cmake/find_poco.cmake @@ -132,7 +132,7 @@ if (Poco_SQLODBC_LIBRARY AND ODBC_FOUND) set (USE_POCO_SQLODBC 1) endif () -message(STATUS "Using Poco: ${Poco_INCLUDE_DIRS} : ${Poco_Foundation_LIBRARY},${Poco_Util_LIBRARY},${Poco_Net_LIBRARY},${Poco_NetSSL_LIBRARY},${Poco_Crypto_LIBRARY},${Poco_XML_LIBRARY},${Poco_Data_LIBRARY},${Poco_DataODBC_LIBRARY},${Poco_SQL_LIBRARY},${Poco_SQLODBC_LIBRARY},${Poco_MongoDB_LIBRARY},${Poco_Redis_INCLUDE_DIR}; MongoDB=${USE_POCO_MONGODB}, Redis=${USE_POCO_REDIS}, 
DataODBC=${USE_POCO_DATAODBC}, NetSSL=${USE_POCO_NETSSL}") +message(STATUS "Using Poco: ${Poco_INCLUDE_DIRS} : ${Poco_Foundation_LIBRARY},${Poco_Util_LIBRARY},${Poco_Net_LIBRARY},${Poco_NetSSL_LIBRARY},${Poco_Crypto_LIBRARY},${Poco_XML_LIBRARY},${Poco_Data_LIBRARY},${Poco_DataODBC_LIBRARY},${Poco_SQL_LIBRARY},${Poco_SQLODBC_LIBRARY},${Poco_MongoDB_LIBRARY},${Poco_Redis_LIBRARY}; MongoDB=${USE_POCO_MONGODB}, Redis=${USE_POCO_REDIS}, DataODBC=${USE_POCO_DATAODBC}, NetSSL=${USE_POCO_NETSSL}") # How to make sutable poco: # use branch: diff --git a/dbms/src/Dictionaries/CMakeLists.txt b/dbms/src/Dictionaries/CMakeLists.txt index de6bdd6b915..0a5d198dd8c 100644 --- a/dbms/src/Dictionaries/CMakeLists.txt +++ b/dbms/src/Dictionaries/CMakeLists.txt @@ -37,7 +37,11 @@ if(USE_POCO_MONGODB) endif() if(USE_POCO_REDIS) - target_include_directories(clickhouse_dictionaries SYSTEM PRIVATE ${Poco_Redis_INCLUDE_DIR}) + # for code highlighting in CLion + # target_include_directories(clickhouse_dictionaries SYSTEM PRIVATE ${Poco_Redis_INCLUDE_DIR}) + + # for build + target_link_libraries(clickhouse_dictionaries PRIVATE ${Poco_Redis_LIBRARY}) endif() add_subdirectory(Embedded) diff --git a/dbms/src/Dictionaries/registerDictionaries.cpp b/dbms/src/Dictionaries/registerDictionaries.cpp index 1a8c5a7be7b..ee320d7177b 100644 --- a/dbms/src/Dictionaries/registerDictionaries.cpp +++ b/dbms/src/Dictionaries/registerDictionaries.cpp @@ -7,6 +7,7 @@ void registerDictionarySourceFile(DictionarySourceFactory & source_factory); void registerDictionarySourceMysql(DictionarySourceFactory & source_factory); void registerDictionarySourceClickHouse(DictionarySourceFactory & source_factory); void registerDictionarySourceMongoDB(DictionarySourceFactory & source_factory); +void registerDictionarySourceRedis(DictionarySourceFactory & source_factory); void registerDictionarySourceXDBC(DictionarySourceFactory & source_factory); void registerDictionarySourceJDBC(DictionarySourceFactory & source_factory); void registerDictionarySourceExecutable(DictionarySourceFactory & source_factory); @@ -30,6 +31,7 @@ void registerDictionaries() registerDictionarySourceMysql(source_factory); registerDictionarySourceClickHouse(source_factory); registerDictionarySourceMongoDB(source_factory); + registerDictionarySourceRedis(source_factory); registerDictionarySourceXDBC(source_factory); registerDictionarySourceJDBC(source_factory); registerDictionarySourceExecutable(source_factory); diff --git a/dbms/tests/external_dictionaries/generate_and_test.py b/dbms/tests/external_dictionaries/generate_and_test.py index 2c72d29de9d..f4891424c21 100755 --- a/dbms/tests/external_dictionaries/generate_and_test.py +++ b/dbms/tests/external_dictionaries/generate_and_test.py @@ -119,6 +119,17 @@ def generate_structure(args): [ 'mongodb_user_flat', 0, True ], ]) + if not args.no_redis: + dictionaries.extend([ + [ 'redis_flat', 0, True ], + [ 'redis_hashed', 0, True ], + [ 'redis_cache', 0, True ], + [ 'redis_complex_integers_key_hashed', 1, False ], + [ 'redis_complex_integers_key_cache', 1, False ], + [ 'redis_complex_mixed_key_hashed', 2, False ], + [ 'redis_complex_mixed_key_cache', 2, False ], + ]) + if args.use_lib: dictionaries.extend([ # [ 'library_flat', 0, True ], @@ -382,6 +393,51 @@ def generate_data(args): print 'Could not create MongoDB collection' exit(-1) + # create Redis storage from complete_query via JSON file + if not args.no_redis: + print 'Creating Redis storage' + table_rows = json.loads(subprocess.check_output([ + args.client, + '--port', + 
args.port, + '--output_format_json_quote_64bit_integers', + '0', + '--query', + "select * from test.dictionary_source where not ignore(" \ + "concat('new Date(\\'', toString(Date_), '\\')') as Date_, " \ + "concat('new ISODate(\\'', replaceOne(toString(DateTime_, 'UTC'), ' ', 'T'), 'Z\\')') as DateTime_" \ + ") format JSON" + ]))['data'] + + # print json.dumps(table_rows) + + # For Integers the first byte of the reply is ":" + # For Bulk Strings the first byte of the reply is "$" + + proto_for_redis = "" + for counter, collection in enumerate(table_rows): + proto_for_redis += "SELECT " + str(counter) + "\r\n" + proto_for_redis += "FLUSHDB\r\n" + for key, value in collection.iteritems(): + value_type = "$" + if isinstance(value, int): + value_type = ":" + else: + value = str(value) + if "Date" in value: + value = value[value.find("'") + 1:-2] + + proto_for_redis += "SET " + "$" + key + " " + value_type + str(value) + "\r\n" + + # with open("clickhouse_redis.log", "w") as f: + # f.write(json.dumps(table_rows) + "\n" + proto_for_redis + "\n") + + open('generated/full.json', 'w').write(proto_for_redis) + result = system('cat {0}/full.json | redis-cli > \\dev\\null'.format(args.generated)) + if result != 0: + print 'Could not create Redis storage' + exit(-1) + def generate_dictionaries(args): dictionary_skeleton = ''' @@ -482,6 +538,13 @@ def generate_dictionaries(args): '''.format(mongo_host=args.mongo_host) + source_redis = ''' + + {redis_host} + 6379 + + '''.format(redis_host=args.redis_host) + source_executable = ''' cat %s @@ -668,6 +731,17 @@ def generate_dictionaries(args): [ source_mongodb_user, layout_flat ], ]) + if not args.no_redis: + sources_and_layouts.extend([ + [ source_redis, layout_flat ], + [ source_redis, layout_hashed ], + [ source_redis, layout_cache ], + [ source_redis, layout_complex_key_cache ], + [ source_redis, layout_complex_key_hashed ], + [ source_redis, layout_complex_key_hashed ], + [ source_redis, layout_complex_key_cache ], + ]) + if args.use_lib: sources_and_layouts.extend([ #[ source_library, layout_flat ], @@ -947,6 +1021,8 @@ if __name__ == '__main__': parser.add_argument('--no_mongo', action='store_true', help = 'Dont use mongodb dictionaries') parser.add_argument('--mongo_host', default = 'localhost', help = 'mongo server host') parser.add_argument('--use_mongo_user', action='store_true', help = 'Test mongodb with user-pass') + parser.add_argument('--no_redis', action='store_true', help = 'Dont use redis dictionaries') + parser.add_argument('--redis_host', default = 'localhost', help = 'redis server host') parser.add_argument('--no_http', action='store_true', help = 'Dont use http dictionaries') parser.add_argument('--http_port', default = 58000, help = 'http server port') diff --git a/dbms/tests/external_dictionaries/run.sh b/dbms/tests/external_dictionaries/run.sh index a04be3080a9..4560e167c57 100755 --- a/dbms/tests/external_dictionaries/run.sh +++ b/dbms/tests/external_dictionaries/run.sh @@ -9,6 +9,7 @@ fi NO_MYSQL=0 NO_MONGO=0 +NO_REDIS=0 for arg in "$@"; do if [ "$arg" = "--no_mysql" ]; then @@ -17,6 +18,9 @@ for arg in "$@"; do if [ "$arg" == "--no_mongo" ]; then NO_MONGO=1 fi + if [ "$arg" == "--no_redis" ]; then + NO_REDIS=1 + fi done # MySQL @@ -101,6 +105,31 @@ else fi fi +# Redis +if [ $NO_REDIS -eq 1 ]; then + echo "Not using Redis" +else + if [ -z $(which redis-cli) ]; then + echo 'Installing Redis' + + sudo apt-get update &>/dev/null + sudo apt-get install redis-server + + which redis-server >/dev/null + if [ $? 
-ne 0 ]; then + echo 'Failed installing redis-server' + exit -1 + fi + fi + + echo | redis-cli &>/dev/null + if [ $? -ne 0 ]; then + sudo systemctl start redis.service + else + echo 'Redis already started' + fi +fi + # ClickHouse clickhouse-server &> clickhouse.log & sleep 3 diff --git a/docs/en/query_language/dicts/external_dicts_dict_sources.md b/docs/en/query_language/dicts/external_dicts_dict_sources.md index f26967c2d0f..67d2d980b75 100644 --- a/docs/en/query_language/dicts/external_dicts_dict_sources.md +++ b/docs/en/query_language/dicts/external_dicts_dict_sources.md @@ -30,6 +30,7 @@ Types of sources (`source_type`): - [MySQL](#dicts-external_dicts_dict_sources-mysql) - [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) + - [Redis](#dicts-external_dicts_dict_sources-redis) - [ODBC](#dicts-external_dicts_dict_sources-odbc) @@ -421,4 +422,23 @@ Setting fields: - `db` – Name of the database. - `collection` – Name of the collection. + +### Redis {#dicts-external_dicts_dict_sources-redis} + +Example of settings: + +```xml + + + localhost + 6379 + + +``` + +Setting fields: + +- `host` – The Redis host. +- `port` – The port on the Redis server. + [Original article](https://clickhouse.yandex/docs/en/query_language/dicts/external_dicts_dict_sources/) From f2eadcfe49606362e6e49f19c385212050012423 Mon Sep 17 00:00:00 2001 From: comunodi Date: Tue, 12 Feb 2019 14:27:49 +0300 Subject: [PATCH 007/309] Remove some escaped lines --- dbms/src/Dictionaries/RedisBlockInputStream.h | 1 - dbms/src/Dictionaries/RedisDictionarySource.h | 7 ------- dbms/tests/external_dictionaries/generate_and_test.py | 5 ----- 3 files changed, 13 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.h b/dbms/src/Dictionaries/RedisBlockInputStream.h index d1c3ad157e9..f5117ec6a9c 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.h +++ b/dbms/src/Dictionaries/RedisBlockInputStream.h @@ -17,7 +17,6 @@ namespace Poco namespace DB { -/// Converts Redis Cursor to a stream of Blocks class RedisBlockInputStream final : public IBlockInputStream { public: diff --git a/dbms/src/Dictionaries/RedisDictionarySource.h b/dbms/src/Dictionaries/RedisDictionarySource.h index e3566731f06..d41e557ce24 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.h +++ b/dbms/src/Dictionaries/RedisDictionarySource.h @@ -22,7 +22,6 @@ namespace Poco namespace DB { -/// Allows loading dictionaries from a Redis collection class RedisDictionarySource final : public IDictionarySource { RedisDictionarySource( @@ -77,9 +76,3 @@ namespace DB } #endif - -/*namespace DB -{ -class DictionarySourceFactory; -void registerDictionarySourceRedis(DictionarySourceFactory & factory); -}*/ diff --git a/dbms/tests/external_dictionaries/generate_and_test.py b/dbms/tests/external_dictionaries/generate_and_test.py index ebcd62ece5c..90426962189 100755 --- a/dbms/tests/external_dictionaries/generate_and_test.py +++ b/dbms/tests/external_dictionaries/generate_and_test.py @@ -409,8 +409,6 @@ def generate_data(args): ") format JSON" ]))['data'] - # print json.dumps(table_rows) - # For Integers the first byte of the reply is ":" # For Bulk Strings the first byte of the reply is "$" @@ -429,9 +427,6 @@ def generate_data(args): proto_for_redis += "SET " + "$" + key + " " + value_type + str(value) + "\r\n" - # with open("clickhouse_redis.log", "w") as f: - # f.write(json.dumps(table_rows) + "\n" + proto_for_redis + "\n") - open('generated/full.json', 'w').write(proto_for_redis) result = 
system('cat {0}/full.json | redis-cli > \\dev\\null'.format(args.generated)) if result != 0: From 572463f9c141b79c6e0e4112fa0c2f0bc0e5d8c1 Mon Sep 17 00:00:00 2001 From: comunodi Date: Wed, 13 Feb 2019 03:05:43 +0300 Subject: [PATCH 008/309] Style fix --- dbms/src/Dictionaries/RedisBlockInputStream.cpp | 6 ++++-- dbms/src/Dictionaries/RedisDictionarySource.cpp | 2 +- dbms/src/Dictionaries/RedisDictionarySource.h | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index 32d9abc71a8..85c92aad638 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -210,7 +210,8 @@ namespace DB while (num_rows < max_block_size) { - if (cursor == keys.size()) { + if (cursor == keys.size()) + { all_read = true; break; } @@ -227,7 +228,8 @@ namespace DB return {}; Poco::Redis::Array values = client->execute(commandForValues); - for (size_t i = 0; i < num_rows; ++i) { + for (size_t i = 0; i < num_rows; ++i) + { const Poco::Redis::RedisType::Ptr & value = *(values.begin() + i); if (value.isNull()) insertDefaultValue(*columns[1], *description.sample_block.getByPosition(1).column); diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index b4c1ac97330..7d546d39cf0 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -108,7 +108,7 @@ namespace DB { Poco::Redis::Array commandForKeys; commandForKeys << "KEYS" << "*"; - + Poco::Redis::Array keys = client->execute(commandForKeys); return std::make_shared(client, std::move(keys), sample_block, max_block_size); diff --git a/dbms/src/Dictionaries/RedisDictionarySource.h b/dbms/src/Dictionaries/RedisDictionarySource.h index d41e557ce24..f50a85ca10e 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.h +++ b/dbms/src/Dictionaries/RedisDictionarySource.h @@ -55,7 +55,7 @@ namespace DB BlockInputStreamPtr loadKeys(const Columns & /* key_columns */, const std::vector & /* requested_rows */) override { throw Exception{"Method loadKeys is unsupported for RedisDictionarySource", ErrorCodes::NOT_IMPLEMENTED}; - }; + } bool isModified() const override { return true; } From 162b26fe07626971ebd34cfcd3bd0c7e9200a7b6 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 21 Mar 2019 21:10:55 +0300 Subject: [PATCH 009/309] Add integration test for redis --- dbms/tests/integration/helpers/cluster.py | 20 ++++++++++-- .../external_sources.py | 32 +++++++++++++++++++ .../test_external_dictionaries/test.py | 5 +-- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/dbms/tests/integration/helpers/cluster.py b/dbms/tests/integration/helpers/cluster.py index 240cc2c8695..bf0abdff2b0 100644 --- a/dbms/tests/integration/helpers/cluster.py +++ b/dbms/tests/integration/helpers/cluster.py @@ -101,6 +101,7 @@ class ClickHouseCluster: self.with_odbc_drivers = False self.with_hdfs = False self.with_mongo = False + self.with_redis = False self.docker_client = None self.is_up = False @@ -112,7 +113,7 @@ class ClickHouseCluster: cmd += " client" return cmd - def add_instance(self, name, config_dir=None, main_configs=[], user_configs=[], macros={}, with_zookeeper=False, with_mysql=False, with_kafka=False, clickhouse_path_dir=None, with_odbc_drivers=False, with_postgres=False, with_hdfs=False, with_mongo=False, hostname=None, env_variables={}, image="yandex/clickhouse-integration-test", 
stay_alive=False, ipv4_address=None, ipv6_address=None): + def add_instance(self, name, config_dir=None, main_configs=[], user_configs=[], macros={}, with_zookeeper=False, with_mysql=False, with_kafka=False, clickhouse_path_dir=None, with_odbc_drivers=False, with_postgres=False, with_hdfs=False, with_mongo=False, with_redis=False, hostname=None, env_variables={}, image="yandex/clickhouse-integration-test", stay_alive=False, ipv4_address=None, ipv6_address=None): """Add an instance to the cluster. name - the name of the instance directory and the value of the 'instance' macro in ClickHouse. @@ -130,7 +131,7 @@ class ClickHouseCluster: instance = ClickHouseInstance( self, self.base_dir, name, config_dir, main_configs, user_configs, macros, with_zookeeper, - self.zookeeper_config_path, with_mysql, with_kafka, with_mongo, self.base_configs_dir, self.server_bin_path, + self.zookeeper_config_path, with_mysql, with_kafka, with_mongo, with_redis, self.base_configs_dir, self.server_bin_path, self.odbc_bridge_bin_path, clickhouse_path_dir, with_odbc_drivers, hostname=hostname, env_variables=env_variables, image=image, stay_alive=stay_alive, ipv4_address=ipv4_address, ipv6_address=ipv6_address) @@ -185,6 +186,13 @@ class ClickHouseCluster: self.base_mongo_cmd = ['docker-compose', '--project-directory', self.base_dir, '--project-name', self.project_name, '--file', p.join(HELPERS_DIR, 'docker_compose_mongo.yml')] + if with_redis and not self.with_redis: + self.with_redis = True + self.base_cmd.extend(['--file', p.join(HELPERS_DIR, 'docker_compose_redis.yml')]) + self.base_redis_cmd = ['docker-compose', '--project-directory', self.base_dir, '--project-name', + self.project_name, '--file', p.join(HELPERS_DIR, 'docker_compose_redis.yml')] + + return instance @@ -316,6 +324,11 @@ class ClickHouseCluster: subprocess_check_call(self.base_mongo_cmd + ['up', '-d', '--force-recreate']) self.wait_mongo_to_start(30) + if self.with_redis and self.base_redis_cmd: + subprocess_check_call(self.base_redis_cmd + ['up', '-d', '--force-recreate']) + time.sleep(10) + + subprocess_check_call(self.base_cmd + ['up', '-d', '--no-recreate']) start_deadline = time.time() + 20.0 # seconds @@ -414,7 +427,7 @@ class ClickHouseInstance: def __init__( self, cluster, base_path, name, custom_config_dir, custom_main_configs, custom_user_configs, macros, - with_zookeeper, zookeeper_config_path, with_mysql, with_kafka, with_mongo, base_configs_dir, server_bin_path, odbc_bridge_bin_path, + with_zookeeper, zookeeper_config_path, with_mysql, with_kafka, with_mongo, with_redis, base_configs_dir, server_bin_path, odbc_bridge_bin_path, clickhouse_path_dir, with_odbc_drivers, hostname=None, env_variables={}, image="yandex/clickhouse-integration-test", stay_alive=False, ipv4_address=None, ipv6_address=None): @@ -439,6 +452,7 @@ class ClickHouseInstance: self.with_mysql = with_mysql self.with_kafka = with_kafka self.with_mongo = with_mongo + self.with_redis = with_redis self.path = p.join(self.cluster.instances_dir, name) self.docker_compose_path = p.join(self.path, 'docker_compose.yml') diff --git a/dbms/tests/integration/test_external_dictionaries/external_sources.py b/dbms/tests/integration/test_external_dictionaries/external_sources.py index 71dc05ca78c..e0489acf7cf 100644 --- a/dbms/tests/integration/test_external_dictionaries/external_sources.py +++ b/dbms/tests/integration/test_external_dictionaries/external_sources.py @@ -2,6 +2,7 @@ import warnings import pymysql.cursors import pymongo +import redis from tzlocal import get_localzone 
import datetime import os @@ -372,3 +373,34 @@ class SourceHTTP(SourceHTTPBase): class SourceHTTPS(SourceHTTPBase): def _get_schema(self): return "https" + + +class SourceRedis(ExternalSource): + def get_source_str(self, table_name): + return ''' + + {host} + {port} + + '''.format( + host=self.docker_hostname, + port=self.docker_port, + ) + + def prepare(self, structure, table_name, cluster): + self.client = redis.StrictRedis(host=self.internal_hostname, port=self.internal_port) + self.prepared = True + + def load_data(self, data, table_name): + for row_num, row in enumerate(data): + self.client.execute_command("SELECT " + str(row_num)) + self.client.execute_command("FLUSHDB") + for cell_name, cell_value in row.data.items(): + value_type = "$" + if isinstance(cell_value, int): + value_type = ":" + else: + cell_value = '"' + str(cell_value).replace(' ', '\s') + '"' + cmd = "SET " + "$" + cell_name + " " + value_type + str(cell_value) + print(cmd) + self.client.execute_command(cmd) diff --git a/dbms/tests/integration/test_external_dictionaries/test.py b/dbms/tests/integration/test_external_dictionaries/test.py index 314ec26a106..752b37cd760 100644 --- a/dbms/tests/integration/test_external_dictionaries/test.py +++ b/dbms/tests/integration/test_external_dictionaries/test.py @@ -5,7 +5,7 @@ import time from helpers.cluster import ClickHouseCluster from dictionary import Field, Row, Dictionary, DictionaryStructure, Layout from external_sources import SourceMySQL, SourceClickHouse, SourceFile, SourceExecutableCache, SourceExecutableHashed, SourceMongo -from external_sources import SourceHTTP, SourceHTTPS +from external_sources import SourceHTTP, SourceHTTPS, SourceRedis SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) @@ -79,6 +79,7 @@ LAYOUTS = [ ] SOURCES = [ + SourceRedis("Redis", "localhost", "6380", "redis1", "6379", "", ""), SourceMongo("MongoDB", "localhost", "27018", "mongo1", "27017", "root", "clickhouse"), SourceMySQL("MySQL", "localhost", "3308", "mysql1", "3306", "root", "clickhouse"), SourceClickHouse("RemoteClickHouse", "localhost", "9000", "clickhouse1", "9000", "default", ""), @@ -120,7 +121,7 @@ def setup_module(module): for fname in os.listdir(dict_configs_path): main_configs.append(os.path.join(dict_configs_path, fname)) cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs')) - node = cluster.add_instance('node', main_configs=main_configs, with_mysql=True, with_mongo=True) + node = cluster.add_instance('node', main_configs=main_configs, with_mysql=True, with_mongo=True, with_redis=True) cluster.add_instance('clickhouse1') @pytest.fixture(scope="module") From 09a130372e2b364c763fee1ddf96715c8ab37c44 Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 30 Mar 2019 16:51:59 +0300 Subject: [PATCH 010/309] Missed yml file --- dbms/tests/integration/helpers/docker_compose_redis.yml | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 dbms/tests/integration/helpers/docker_compose_redis.yml diff --git a/dbms/tests/integration/helpers/docker_compose_redis.yml b/dbms/tests/integration/helpers/docker_compose_redis.yml new file mode 100644 index 00000000000..205409b3a21 --- /dev/null +++ b/dbms/tests/integration/helpers/docker_compose_redis.yml @@ -0,0 +1,7 @@ +version: '2.2' +services: + redis1: + image: redis + restart: always + ports: + - 6380:6379 From 8abffd4f602deaf7e7dfdf4fbd0670d47f7eefbb Mon Sep 17 00:00:00 2001 From: comunodi Date: Sun, 31 Mar 2019 00:42:13 +0300 Subject: [PATCH 011/309] Fix build --- 
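An editorial aside before the build-fix diff below: once the pieces above are in place, the read path the integration test exercises reduces to a `KEYS *` enumeration followed by one batched `MGET`, issued through the same Poco::Redis calls used by RedisDictionarySource::loadAll and RedisBlockInputStream. A minimal standalone sketch of that round trip follows; the host, port and header paths are assumptions for illustration, not taken from the patches.

#include <Poco/Redis/Array.h>
#include <Poco/Redis/Client.h>
#include <Poco/Redis/Type.h>

#include <iostream>

int main()
{
    // Assumed endpoint; adjust to wherever Redis is reachable
    // (e.g. the 6380 host mapping from the docker-compose file above).
    Poco::Redis::Client client("localhost", 6379);

    // Enumerate every key, as loadAll does after switching from SCAN to KEYS.
    Poco::Redis::Array keys_cmd;
    keys_cmd << "KEYS" << "*";
    auto keys = client.execute<Poco::Redis::Array>(keys_cmd);
    if (keys.size() == 0)
        return 0;    // MGET needs at least one key

    // Fetch all values in one batched MGET, as RedisBlockInputStream does per block.
    Poco::Redis::Array mget_cmd;
    mget_cmd << "MGET";
    for (const Poco::Redis::RedisType::Ptr & key : keys)
        mget_cmd.addRedisType(key);

    auto values = client.execute<Poco::Redis::Array>(mget_cmd);
    std::cout << keys.size() << " keys, " << values.size() << " values" << std::endl;
    return 0;
}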
dbms/src/Dictionaries/RedisBlockInputStream.cpp | 2 +- dbms/src/Dictionaries/RedisBlockInputStream.h | 2 +- dbms/src/Dictionaries/RedisDictionarySource.cpp | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index 85c92aad638..0375e420430 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -210,7 +210,7 @@ namespace DB while (num_rows < max_block_size) { - if (cursor == keys.size()) + if (cursor >= keys.size()) { all_read = true; break; diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.h b/dbms/src/Dictionaries/RedisBlockInputStream.h index f5117ec6a9c..95a563cee80 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.h +++ b/dbms/src/Dictionaries/RedisBlockInputStream.h @@ -2,7 +2,7 @@ #include #include -#include "ExternalResultDescription.h" +#include namespace Poco diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index 7d546d39cf0..07027d24e24 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -17,9 +17,9 @@ namespace DB Block & sample_block, const Context & /* context */) -> DictionarySourcePtr { #if USE_POCO_REDIS - return std::make_unique(dict_struct, config, config_prefix + ".redis", sample_block); + return std::make_unique(dict_struct, config, config_prefix + ".redis", sample_block); #else - (void)dict_struct; + (void)dict_struct; (void)config; (void)config_prefix; (void)sample_block; @@ -122,7 +122,7 @@ namespace DB Poco::Redis::Array keys; - for (const UInt64 id : ids) + for (UInt64 id : ids) keys << static_cast(id); return std::make_shared(client, std::move(keys), sample_block, max_block_size); From 411fcb19dbf0285b32a5005ffef9a36377d2fd07 Mon Sep 17 00:00:00 2001 From: comunodi Date: Sun, 31 Mar 2019 02:07:40 +0300 Subject: [PATCH 012/309] Missed python package --- dbms/tests/integration/image/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/tests/integration/image/Dockerfile b/dbms/tests/integration/image/Dockerfile index 1dd5c1713b2..9aada808356 100644 --- a/dbms/tests/integration/image/Dockerfile +++ b/dbms/tests/integration/image/Dockerfile @@ -25,7 +25,7 @@ RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes - ENV TZ=Europe/Moscow RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone -RUN pip install pytest docker-compose==1.22.0 docker dicttoxml kazoo PyMySQL psycopg2 pymongo tzlocal +RUN pip install pytest docker-compose==1.22.0 docker dicttoxml kazoo PyMySQL psycopg2 pymongo tzlocal redis ENV DOCKER_CHANNEL stable ENV DOCKER_VERSION 17.09.1-ce From d7771b8a07a3133cc88ba05c13c61ecbeb1ce565 Mon Sep 17 00:00:00 2001 From: comunodi Date: Sun, 7 Apr 2019 12:51:53 +0300 Subject: [PATCH 013/309] Throw exception instead if number of columns mismatch --- dbms/src/Dictionaries/RedisBlockInputStream.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index 0375e420430..7b5b68dc9cc 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -31,6 +31,7 @@ namespace DB namespace ErrorCodes { extern const int TYPE_MISMATCH; + extern const int LOGICAL_ERROR; } @@ -182,11 +183,20 @@ namespace DB Block 
RedisBlockInputStream::readImpl() { + if (description.sample_block.rows() == 0) + all_read = true; + if (all_read) return {}; const size_t size = 2; - assert(size == description.sample_block.columns()); + if (size != description.sample_block.columns()) { + throw Exception{"Unsupported number of columns for key-value storage: " + + std::to_string(description.sample_block.columns()) + + " (expected: " + std::to_string(size) + ")", + ErrorCodes::LOGICAL_ERROR}; + } + MutableColumns columns(description.sample_block.columns()); for (const auto i : ext::range(0, size)) From 562f48ea96dfaace489296bd7debaa89791cd84c Mon Sep 17 00:00:00 2001 From: comunodi Date: Sun, 14 Apr 2019 20:05:50 +0300 Subject: [PATCH 014/309] Optional select db before usage. Use only one column in tests --- .../Dictionaries/RedisDictionarySource.cpp | 16 ++++++++++ dbms/src/Dictionaries/RedisDictionarySource.h | 2 ++ .../test_external_dictionaries/dictionary.py | 5 +++- .../external_sources.py | 8 +++-- .../test_external_dictionaries/test.py | 30 +++++++++++-------- 5 files changed, 45 insertions(+), 16 deletions(-) diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index 07027d24e24..2b1536f1b02 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -58,6 +58,7 @@ namespace DB namespace ErrorCodes { extern const int UNSUPPORTED_METHOD; + extern const int SELECT_DB_FAILURE; } @@ -68,13 +69,26 @@ namespace DB const DictionaryStructure & dict_struct, const std::string & host, UInt16 port, + UInt8 db_index, const Block & sample_block) : dict_struct{dict_struct} , host{host} , port{port} + , db_index{db_index} , sample_block{sample_block} , client{std::make_shared(host, port)} { + if (db_index != 0) + { + Poco::Redis::Array command; + command << "SELECT" << db_index; + String reply = client->execute(command); + if (reply != "+OK\r\n") + { + throw Exception{"Selecting db with index " + DB::toString(db_index) + " failed with reason " + reply, + ErrorCodes::SELECT_DB_FAILURE}; + } + } } @@ -87,6 +101,7 @@ namespace DB dict_struct, config.getString(config_prefix + ".host"), config.getUInt(config_prefix + ".port"), + config.getUInt(config_prefix + ".db_index", 0), sample_block) { } @@ -96,6 +111,7 @@ namespace DB : RedisDictionarySource{other.dict_struct, other.host, other.port, + other.db_index, other.sample_block} { } diff --git a/dbms/src/Dictionaries/RedisDictionarySource.h b/dbms/src/Dictionaries/RedisDictionarySource.h index f50a85ca10e..1e528ce40e9 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.h +++ b/dbms/src/Dictionaries/RedisDictionarySource.h @@ -28,6 +28,7 @@ namespace DB const DictionaryStructure & dict_struct, const std::string & host, UInt16 port, + UInt8 db_index, const Block & sample_block); public: @@ -69,6 +70,7 @@ namespace DB const DictionaryStructure dict_struct; const std::string host; const UInt16 port; + const UInt8 db_index; // [0..15] Block sample_block; std::shared_ptr client; diff --git a/dbms/tests/integration/test_external_dictionaries/dictionary.py b/dbms/tests/integration/test_external_dictionaries/dictionary.py index bdddc7a9604..49f0ffc1c0b 100644 --- a/dbms/tests/integration/test_external_dictionaries/dictionary.py +++ b/dbms/tests/integration/test_external_dictionaries/dictionary.py @@ -87,12 +87,14 @@ class Field(object): class DictionaryStructure(object): - def __init__(self, layout, fields): + def __init__(self, layout, fields, is_kv=False): self.layout = 
layout self.keys = [] self.range_key = None self.ordinary_fields = [] self.range_fields = [] + self.is_kv = is_kv + for field in fields: if field.is_key: self.keys.append(field) @@ -286,6 +288,7 @@ class Dictionary(object): self.source = copy.deepcopy(source) self.config_path = config_path self.table_name = table_name + self.is_kv = source.is_kv def generate_config(self): with open(self.config_path, 'w') as result: diff --git a/dbms/tests/integration/test_external_dictionaries/external_sources.py b/dbms/tests/integration/test_external_dictionaries/external_sources.py index e0489acf7cf..57c862cbfe6 100644 --- a/dbms/tests/integration/test_external_dictionaries/external_sources.py +++ b/dbms/tests/integration/test_external_dictionaries/external_sources.py @@ -10,7 +10,7 @@ import os class ExternalSource(object): def __init__(self, name, internal_hostname, internal_port, - docker_hostname, docker_port, user, password): + docker_hostname, docker_port, user, password, is_kv): self.name = name self.internal_hostname = internal_hostname self.internal_port = int(internal_port) @@ -18,6 +18,7 @@ class ExternalSource(object): self.docker_port = int(docker_port) self.user = user self.password = password + self.is_kv = is_kv def get_source_str(self, table_name): raise NotImplementedError("Method {} is not implemented for {}".format( @@ -381,6 +382,7 @@ class SourceRedis(ExternalSource): {host} {port} + 0 '''.format( host=self.docker_hostname, @@ -392,8 +394,7 @@ class SourceRedis(ExternalSource): self.prepared = True def load_data(self, data, table_name): - for row_num, row in enumerate(data): - self.client.execute_command("SELECT " + str(row_num)) + for row_num, row in enumerate(data): # FIXME: yield self.client.execute_command("FLUSHDB") for cell_name, cell_value in row.data.items(): value_type = "$" @@ -404,3 +405,4 @@ class SourceRedis(ExternalSource): cmd = "SET " + "$" + cell_name + " " + value_type + str(cell_value) print(cmd) self.client.execute_command(cmd) + return diff --git a/dbms/tests/integration/test_external_dictionaries/test.py b/dbms/tests/integration/test_external_dictionaries/test.py index 752b37cd760..93e1db2ce70 100644 --- a/dbms/tests/integration/test_external_dictionaries/test.py +++ b/dbms/tests/integration/test_external_dictionaries/test.py @@ -79,16 +79,16 @@ LAYOUTS = [ ] SOURCES = [ - SourceRedis("Redis", "localhost", "6380", "redis1", "6379", "", ""), - SourceMongo("MongoDB", "localhost", "27018", "mongo1", "27017", "root", "clickhouse"), - SourceMySQL("MySQL", "localhost", "3308", "mysql1", "3306", "root", "clickhouse"), - SourceClickHouse("RemoteClickHouse", "localhost", "9000", "clickhouse1", "9000", "default", ""), - SourceClickHouse("LocalClickHouse", "localhost", "9000", "node", "9000", "default", ""), - SourceFile("File", "localhost", "9000", "node", "9000", "", ""), - SourceExecutableHashed("ExecutableHashed", "localhost", "9000", "node", "9000", "", ""), - SourceExecutableCache("ExecutableCache", "localhost", "9000", "node", "9000", "", ""), - SourceHTTP("SourceHTTP", "localhost", "9000", "clickhouse1", "9000", "", ""), - SourceHTTPS("SourceHTTPS", "localhost", "9000", "clickhouse1", "9000", "", ""), + SourceRedis("Redis", "localhost", "6380", "redis1", "6379", "", "", True), + SourceMongo("MongoDB", "localhost", "27018", "mongo1", "27017", "root", "clickhouse", False), + SourceMySQL("MySQL", "localhost", "3308", "mysql1", "3306", "root", "clickhouse", False), + SourceClickHouse("RemoteClickHouse", "localhost", "9000", "clickhouse1", "9000", "default", "", 
False), + SourceClickHouse("LocalClickHouse", "localhost", "9000", "node", "9000", "default", "", False), + SourceFile("File", "localhost", "9000", "node", "9000", "", "", False), + SourceExecutableHashed("ExecutableHashed", "localhost", "9000", "node", "9000", "", "", False), + SourceExecutableCache("ExecutableCache", "localhost", "9000", "node", "9000", "", "", False), + SourceHTTP("SourceHTTP", "localhost", "9000", "clickhouse1", "9000", "", "", False), + SourceHTTPS("SourceHTTPS", "localhost", "9000", "clickhouse1", "9000", "", "", False), ] DICTIONARIES = [] @@ -108,9 +108,9 @@ def setup_module(module): for layout in LAYOUTS: for source in SOURCES: if source.compatible_with_layout(layout): - structure = DictionaryStructure(layout, FIELDS[layout.layout_type]) + structure = DictionaryStructure(layout, FIELDS[layout.layout_type], source.is_kv) dict_name = source.name + "_" + layout.name - dict_path = os.path.join(dict_configs_path, dict_name + '.xml') + dict_path = os.path.join(dict_configs_path, dict_name + '.xml') # FIXME: single xml config for every column dictionary = Dictionary(dict_name, structure, source, dict_path, "table_" + dict_name) dictionary.generate_config() DICTIONARIES.append(dictionary) @@ -171,6 +171,8 @@ def test_simple_dictionaries(started_cluster): for query in dct.get_select_get_or_default_queries(field, row): queries_with_answers.append((query, field.default_value_for_get)) + if dct.is_kv: + break for query in dct.get_hierarchical_queries(data[0]): queries_with_answers.append((query, [1])) @@ -223,6 +225,8 @@ def test_complex_dictionaries(started_cluster): for query in dct.get_select_get_or_default_queries(field, row): queries_with_answers.append((query, field.default_value_for_get)) + if dct.is_kv: + break for query, answer in queries_with_answers: print query @@ -258,6 +262,8 @@ def test_ranged_dictionaries(started_cluster): if not field.is_key and not field.is_range: for query in dct.get_select_get_queries(field, row): queries_with_answers.append((query, row.get_value_by_name(field.name))) + if dct.is_kv: + break for query, answer in queries_with_answers: print query From f3ead9fe5b8628a35f1030310288bbd2de034a9f Mon Sep 17 00:00:00 2001 From: comunodi Date: Sun, 14 Apr 2019 20:09:33 +0300 Subject: [PATCH 015/309] Style fix --- dbms/src/Dictionaries/RedisBlockInputStream.cpp | 3 +-- dbms/src/Dictionaries/RedisDictionarySource.cpp | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index 7b5b68dc9cc..e705ee01474 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -190,12 +190,11 @@ namespace DB return {}; const size_t size = 2; - if (size != description.sample_block.columns()) { + if (size != description.sample_block.columns()) throw Exception{"Unsupported number of columns for key-value storage: " + std::to_string(description.sample_block.columns()) + " (expected: " + std::to_string(size) + ")", ErrorCodes::LOGICAL_ERROR}; - } MutableColumns columns(description.sample_block.columns()); diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index 2b1536f1b02..717010ac11c 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -84,10 +84,8 @@ namespace DB command << "SELECT" << db_index; String reply = client->execute(command); if (reply != "+OK\r\n") - { throw 
Exception{"Selecting db with index " + DB::toString(db_index) + " failed with reason " + reply, ErrorCodes::SELECT_DB_FAILURE}; - } } } From 80827b5a9ff29058dec1b166473012a48a250f3a Mon Sep 17 00:00:00 2001 From: comunodi Date: Sun, 14 Apr 2019 20:44:44 +0300 Subject: [PATCH 016/309] Build fix --- dbms/src/Dictionaries/RedisDictionarySource.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index 717010ac11c..3ef6358c2dd 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -81,8 +81,8 @@ namespace DB if (db_index != 0) { Poco::Redis::Array command; - command << "SELECT" << db_index; - String reply = client->execute(command); + command << "SELECT" << static_cast(db_index); + std::string reply = client->execute(command); if (reply != "+OK\r\n") throw Exception{"Selecting db with index " + DB::toString(db_index) + " failed with reason " + reply, ErrorCodes::SELECT_DB_FAILURE}; From 5849d669753f0d49b2902393490513021ebc3480 Mon Sep 17 00:00:00 2001 From: comunodi Date: Sun, 14 Apr 2019 20:50:05 +0300 Subject: [PATCH 017/309] Use existing ErrorCode to indicate SELECT failure --- dbms/src/Dictionaries/RedisDictionarySource.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index 3ef6358c2dd..d4584e0d568 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -58,7 +58,7 @@ namespace DB namespace ErrorCodes { extern const int UNSUPPORTED_METHOD; - extern const int SELECT_DB_FAILURE; + extern const int CANNOT_SELECT; } @@ -85,7 +85,7 @@ namespace DB std::string reply = client->execute(command); if (reply != "+OK\r\n") throw Exception{"Selecting db with index " + DB::toString(db_index) + " failed with reason " + reply, - ErrorCodes::SELECT_DB_FAILURE}; + ErrorCodes::CANNOT_SELECT}; } } From 27d138818d96a505c8034386c545091c69674b40 Mon Sep 17 00:00:00 2001 From: Gleb-Tretyakov Date: Mon, 15 Apr 2019 00:21:11 +0300 Subject: [PATCH 018/309] fix invalid memory dereference --- dbms/src/Dictionaries/RedisBlockInputStream.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index e705ee01474..9b11fcb85ae 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -225,12 +225,13 @@ namespace DB break; } - ++num_rows; - ++cursor; const auto & key = *(keys.begin() + cursor); insertValueByIdx(0, key); commandForValues.addRedisType(key); + + ++num_rows; + ++cursor; } if (num_rows == 0) From 9778f7c2f3e4cfe2efd0650acebfc1d6adcf31f9 Mon Sep 17 00:00:00 2001 From: comunodi Date: Mon, 15 Apr 2019 04:34:10 +0300 Subject: [PATCH 019/309] More logs --- .../Dictionaries/RedisBlockInputStream.cpp | 28 +++++++++++++++---- .../Dictionaries/RedisDictionarySource.cpp | 13 +++++++++ .../test_external_dictionaries/dictionary.py | 3 ++ .../test_external_dictionaries/test.py | 6 ++-- 4 files changed, 41 insertions(+), 9 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index 9b11fcb85ae..004f223e723 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ 
-25,6 +25,9 @@ # include "DictionaryStructure.h" # include "RedisBlockInputStream.h" +# include "Poco/Logger.h" +# include "common/logger_useful.h" + namespace DB { @@ -56,6 +59,12 @@ namespace DB template void insertNumber(IColumn & column, const Poco::Redis::RedisType::Ptr & value, const std::string & name) { + LOG_ERROR(&Logger::get("Redis"), "Got value: " + value->toString() + "with type=" + + ", isInteger=" + DB::toString(value->isInteger()) + + ", isSimpleString=" + DB::toString(value->isSimpleString()) + + ", isBulkString=" + DB::toString(value->isBulkString()) + + ", isArray=" + DB::toString(value->isArray()) + + ", isError=" + DB::toString(value->isError())); switch (value->type()) { case Poco::Redis::RedisTypeTraits::TypeId: @@ -68,7 +77,7 @@ namespace DB break; case Poco::Redis::RedisTypeTraits::TypeId: { - const auto &bs = + const auto & bs = static_cast *>(value.get())->value(); if (bs.isNull()) static_cast &>(column).getData().emplace_back(); @@ -78,7 +87,8 @@ namespace DB } default: throw Exception( - "Type mismatch, expected a number, got type id = " + toString(value->type()) + " for column " + name, + "Type mismatch, expected a number, got " + value->toString() + + " with type id = " + toString(value->type()) + " for column " + name, ErrorCodes::TYPE_MISMATCH); } } @@ -189,6 +199,9 @@ namespace DB if (all_read) return {}; + for (size_t i = 0; i < 3; ++i) + if (description.sample_block.columns() >= i + 1) + LOG_ERROR(&Logger::get("Redis"), description.sample_block.getByPosition(i).dumpStructure()); const size_t size = 2; if (size != description.sample_block.columns()) throw Exception{"Unsupported number of columns for key-value storage: " @@ -225,21 +238,27 @@ namespace DB break; } - + LOG_ERROR(&Logger::get("Redis"), "Get key: " + DB::toString(cursor)); const auto & key = *(keys.begin() + cursor); insertValueByIdx(0, key); commandForValues.addRedisType(key); - + LOG_ERROR(&Logger::get("Redis"), "Key has read: " + DB::toString(cursor)); + ++num_rows; ++cursor; } + LOG_ERROR(&Logger::get("Redis"), "All " + DB::toString(num_rows) + " rows added"); + if (num_rows == 0) return {}; + LOG_ERROR(&Logger::get("Redis"), "Req to get values"); Poco::Redis::Array values = client->execute(commandForValues); + LOG_ERROR(&Logger::get("Redis"), "Req executed"); for (size_t i = 0; i < num_rows; ++i) { + LOG_ERROR(&Logger::get("Redis"), "Get value from : " + DB::toString(i)); const Poco::Redis::RedisType::Ptr & value = *(values.begin() + i); if (value.isNull()) insertDefaultValue(*columns[1], *description.sample_block.getByPosition(1).column); @@ -249,7 +268,6 @@ namespace DB return description.sample_block.cloneWithColumns(std::move(columns)); } - } #endif diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index d4584e0d568..d32d45d8ed5 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -52,6 +52,9 @@ namespace DB # include # include "RedisBlockInputStream.h" +# include "Poco/Logger.h" +# include "common/logger_useful.h" + namespace DB { @@ -120,17 +123,25 @@ namespace DB BlockInputStreamPtr RedisDictionarySource::loadAll() { + LOG_ERROR(&Logger::get("Redis"), "Redis in loadAll"); + Poco::Redis::Array commandForKeys; commandForKeys << "KEYS" << "*"; + LOG_ERROR(&Logger::get("Redis"), "Command for keys: " + commandForKeys.toString()); Poco::Redis::Array keys = client->execute(commandForKeys); + LOG_ERROR(&Logger::get("Redis"), "Command for keys executed"); + 
LOG_ERROR(&Logger::get("Redis"), "KEYS: " + keys.toString()); + return std::make_shared(client, std::move(keys), sample_block, max_block_size); } BlockInputStreamPtr RedisDictionarySource::loadIds(const std::vector & ids) { + LOG_ERROR(&Logger::get("Redis"), "Redis in loadIds"); + if (!dict_struct.id) throw Exception{"'id' is required for selective loading", ErrorCodes::UNSUPPORTED_METHOD}; @@ -139,6 +150,8 @@ namespace DB for (UInt64 id : ids) keys << static_cast(id); + LOG_ERROR(&Logger::get("Redis"), "KEYS: " + keys.toString()); + return std::make_shared(client, std::move(keys), sample_block, max_block_size); } diff --git a/dbms/tests/integration/test_external_dictionaries/dictionary.py b/dbms/tests/integration/test_external_dictionaries/dictionary.py index 49f0ffc1c0b..7e44aef455c 100644 --- a/dbms/tests/integration/test_external_dictionaries/dictionary.py +++ b/dbms/tests/integration/test_external_dictionaries/dictionary.py @@ -118,6 +118,9 @@ class DictionaryStructure(object): fields_strs = [] for field in self.ordinary_fields: fields_strs.append(field.get_attribute_str()) + if self.is_kv: + break + key_strs = [] if self.layout.is_complex: for key_field in self.keys: diff --git a/dbms/tests/integration/test_external_dictionaries/test.py b/dbms/tests/integration/test_external_dictionaries/test.py index 93e1db2ce70..d8b92f4e542 100644 --- a/dbms/tests/integration/test_external_dictionaries/test.py +++ b/dbms/tests/integration/test_external_dictionaries/test.py @@ -206,7 +206,7 @@ def test_complex_dictionaries(started_cluster): 'my', 255.543, 3332221.44]), ] - complex_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "complex"] + complex_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "complex" and not d.is_kv] for dct in complex_dicts: dct.load_data(data) @@ -225,8 +225,6 @@ def test_complex_dictionaries(started_cluster): for query in dct.get_select_get_or_default_queries(field, row): queries_with_answers.append((query, field.default_value_for_get)) - if dct.is_kv: - break for query, answer in queries_with_answers: print query @@ -249,7 +247,7 @@ def test_ranged_dictionaries(started_cluster): 32.543, 3332543.4]), ] - ranged_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "ranged"] + ranged_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "ranged" and not d.is_kv] for dct in ranged_dicts: dct.load_data(data) From d2427227dd89f32bf9cfb8aaf2a6ecf22bce8c4c Mon Sep 17 00:00:00 2001 From: comunodi Date: Wed, 17 Apr 2019 02:13:07 +0300 Subject: [PATCH 020/309] Support complex key with 1 or 2 parts --- .../Dictionaries/RedisBlockInputStream.cpp | 135 ++++++++++++------ .../Dictionaries/RedisDictionarySource.cpp | 64 ++++++++- dbms/src/Dictionaries/RedisDictionarySource.h | 25 ++++ 3 files changed, 174 insertions(+), 50 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index 004f223e723..8e9aece3670 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -35,6 +35,7 @@ namespace DB { extern const int TYPE_MISMATCH; extern const int LOGICAL_ERROR; + extern const int LIMIT_EXCEEDED; } @@ -107,10 +108,10 @@ namespace DB ErrorCodes::TYPE_MISMATCH}; return bs.value(); } - case Poco::Redis::RedisTypeTraits::TypeId: - return static_cast *>(value.get())->value(); + case Poco::Redis::RedisTypeTraits::TypeId: + return static_cast *>(value.get())->value(); default: - throw Exception{"Type 
mismatch, expected String, got type id = " + toString(value->type()) + " for column " + name, + throw Exception{"Type mismatch, expected std::string, got type id = " + toString(value->type()) + " for column " + name, ErrorCodes::TYPE_MISMATCH}; } }; @@ -193,21 +194,23 @@ namespace DB Block RedisBlockInputStream::readImpl() { - if (description.sample_block.rows() == 0) + if (description.sample_block.rows() == 0 || keys.size() == 0) all_read = true; if (all_read) return {}; - for (size_t i = 0; i < 3; ++i) + for (size_t i = 0; i < 5; ++i) if (description.sample_block.columns() >= i + 1) LOG_ERROR(&Logger::get("Redis"), description.sample_block.getByPosition(i).dumpStructure()); - const size_t size = 2; - if (size != description.sample_block.columns()) - throw Exception{"Unsupported number of columns for key-value storage: " - + std::to_string(description.sample_block.columns()) - + " (expected: " + std::to_string(size) + ")", - ErrorCodes::LOGICAL_ERROR}; + + const size_t size = description.sample_block.columns(); +// const size_t size = 2; +// if (size != description.sample_block.columns()) +// throw Exception{"Unsupported number of columns for key-value storage: " +// + DB::toString(description.sample_block.columns()) +// + " (expected: " + DB::toString(size) + ")", +// ErrorCodes::LOGICAL_ERROR}; MutableColumns columns(description.sample_block.columns()); @@ -227,43 +230,89 @@ namespace DB insertValue(*columns[idx], description.types[idx].first, value, name); }; - size_t num_rows = 0; - Poco::Redis::Command commandForValues("MGET"); - - while (num_rows < max_block_size) + if (keys.begin()->get()->isArray()) { - if (cursor >= keys.size()) + size_t num_rows = 0; + while (num_rows < max_block_size) { - all_read = true; - break; + if (cursor >= keys.size()) + { + all_read = true; + break; + } + + const auto & primary_with_secondary = *(keys.begin() + cursor); + const auto & keys_array = + static_cast *>(primary_with_secondary.get())->value(); + if (keys_array.size() < 2) + { + throw Exception{"Too low keys in request to source: " + DB::toString(keys_array.size()) + + ", expected 2 or more", + ErrorCodes::LOGICAL_ERROR}; + } + if (num_rows + keys_array.size() - 1 > max_block_size) + { + if (num_rows == 0) + throw Exception{"Too many (" + DB::toString(keys_array.size()) + ") key attributes", + ErrorCodes::LIMIT_EXCEEDED}; + break; + } + + Poco::Redis::Command commandForValues("HMGET"); + const auto & primary_key = *keys_array.begin(); + for (size_t i = 1; i < keys_array.size(); ++i) + { + const auto & secondary_key = *(keys_array.begin() + i); + insertValueByIdx(0, primary_key); + insertValueByIdx(1, secondary_key); + commandForValues.addRedisType(secondary_key); + } + + Poco::Redis::Array values = client->execute(commandForValues); + for (const auto & value : values) + { + if (value.isNull()) + insertDefaultValue(*columns[2], *description.sample_block.getByPosition(2).column); + else + insertValueByIdx(2, value); + } + + num_rows += keys_array.size() - 1; + cursor += keys_array.size() - 1; + } + } + else + { + size_t num_rows = 0; + Poco::Redis::Command commandForValues("MGET"); + + while (num_rows < max_block_size) + { + if (cursor >= keys.size()) + { + all_read = true; + break; + } + + const auto & key = *(keys.begin() + cursor); + insertValueByIdx(0, key); + commandForValues.addRedisType(key); + + ++num_rows; + ++cursor; } - LOG_ERROR(&Logger::get("Redis"), "Get key: " + DB::toString(cursor)); - const auto & key = *(keys.begin() + cursor); - insertValueByIdx(0, key); - 
commandForValues.addRedisType(key); - LOG_ERROR(&Logger::get("Redis"), "Key has read: " + DB::toString(cursor)); + if (num_rows == 0) + return {}; - ++num_rows; - ++cursor; - } - - LOG_ERROR(&Logger::get("Redis"), "All " + DB::toString(num_rows) + " rows added"); - - if (num_rows == 0) - return {}; - - LOG_ERROR(&Logger::get("Redis"), "Req to get values"); - Poco::Redis::Array values = client->execute(commandForValues); - LOG_ERROR(&Logger::get("Redis"), "Req executed"); - for (size_t i = 0; i < num_rows; ++i) - { - LOG_ERROR(&Logger::get("Redis"), "Get value from : " + DB::toString(i)); - const Poco::Redis::RedisType::Ptr & value = *(values.begin() + i); - if (value.isNull()) - insertDefaultValue(*columns[1], *description.sample_block.getByPosition(1).column); - else - insertValueByIdx(1, value); + Poco::Redis::Array values = client->execute(commandForValues); + for (const auto & value : values) + { + if (value.isNull()) + insertDefaultValue(*columns[1], *description.sample_block.getByPosition(1).column); + else + insertValueByIdx(1, value); + } } return description.sample_block.cloneWithColumns(std::move(columns)); diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index d32d45d8ed5..ce9c1e6f408 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -62,6 +62,7 @@ namespace DB { extern const int UNSUPPORTED_METHOD; extern const int CANNOT_SELECT; + extern const int INVALID_CONFIG_PARAMETER; } @@ -73,18 +74,36 @@ namespace DB const std::string & host, UInt16 port, UInt8 db_index, + RedisStorageType::Id storage_type, const Block & sample_block) : dict_struct{dict_struct} , host{host} , port{port} , db_index{db_index} + , storage_type{storage_type} , sample_block{sample_block} , client{std::make_shared(host, port)} { + if (dict_struct.attributes.size() != 1) + throw Exception{"Invalid number of non key columns for Redis source: " + + DB::toString(dict_struct.attributes.size()) + ", expected 1", + ErrorCodes::INVALID_CONFIG_PARAMETER}; + + if (storage_type == RedisStorageType::HASH_MAP) + { + if (!dict_struct.key.has_value()) + throw Exception{"Redis source with storage type \'hash_map\' mush have key", + ErrorCodes::INVALID_CONFIG_PARAMETER}; + if (dict_struct.key.value().size() > 2) + throw Exception{"Redis source with complex keys having more than 2 attributes are unsupported", + ErrorCodes::INVALID_CONFIG_PARAMETER}; + // suppose key[0] is primary key, key[1] is secondary key + } + if (db_index != 0) { - Poco::Redis::Array command; - command << "SELECT" << static_cast(db_index); + Poco::Redis::Command command("SELECT"); + command << static_cast(db_index); std::string reply = client->execute(command); if (reply != "+OK\r\n") throw Exception{"Selecting db with index " + DB::toString(db_index) + " failed with reason " + reply, @@ -103,6 +122,7 @@ namespace DB config.getString(config_prefix + ".host"), config.getUInt(config_prefix + ".port"), config.getUInt(config_prefix + ".db_index", 0), + parseStorageType(config.getString(config_prefix + ".storage_type", "")), sample_block) { } @@ -113,6 +133,7 @@ namespace DB other.host, other.port, other.db_index, + other.storage_type, other.sample_block} { } @@ -125,15 +146,35 @@ namespace DB { LOG_ERROR(&Logger::get("Redis"), "Redis in loadAll"); - Poco::Redis::Array commandForKeys; - commandForKeys << "KEYS" << "*"; - LOG_ERROR(&Logger::get("Redis"), "Command for keys: " + commandForKeys.toString()); + Poco::Redis::Command 
command_for_keys("KEYS"); + command_for_keys << "*"; + LOG_ERROR(&Logger::get("Redis"), "Command for keys: " + command_for_keys.toString()); - Poco::Redis::Array keys = client->execute(commandForKeys); + Poco::Redis::Array keys = client->execute(command_for_keys); LOG_ERROR(&Logger::get("Redis"), "Command for keys executed"); LOG_ERROR(&Logger::get("Redis"), "KEYS: " + keys.toString()); + if (storage_type == RedisStorageType::HASH_MAP && dict_struct.key->size() == 2) + { + Poco::Redis::Array hkeys; + for (const auto & key : keys) + { + Poco::Redis::Command command_for_secondary_keys("HKEYS"); + command_for_secondary_keys.addRedisType(key); + Poco::Redis::Array reply_for_primary_key = client->execute(command_for_secondary_keys); + LOG_ERROR(&Logger::get("Redis"), "Command for hkeys executed"); + + Poco::SharedPtr primary_with_secondary; + primary_with_secondary->addRedisType(key); + for (const auto & secondary_key : reply_for_primary_key) + primary_with_secondary->addRedisType(secondary_key); + LOG_ERROR(&Logger::get("Redis"), "HKEYS: " + primary_with_secondary->toString()); + hkeys.addRedisType(primary_with_secondary); + } + keys = hkeys; + } + return std::make_shared(client, std::move(keys), sample_block, max_block_size); } @@ -142,6 +183,9 @@ namespace DB { LOG_ERROR(&Logger::get("Redis"), "Redis in loadIds"); + if (storage_type != RedisStorageType::SIMPLE) + throw Exception{"Cannot use loadIds with \'simple\' storage type", ErrorCodes::UNSUPPORTED_METHOD}; + if (!dict_struct.id) throw Exception{"'id' is required for selective loading", ErrorCodes::UNSUPPORTED_METHOD}; @@ -155,12 +199,18 @@ namespace DB return std::make_shared(client, std::move(keys), sample_block, max_block_size); } - std::string RedisDictionarySource::toString() const { return "Redis: " + host + ':' + DB::toString(port); } + RedisStorageType::Id RedisDictionarySource::parseStorageType(const std::string & storage_type) { + RedisStorageType::Id storage_type_id = RedisStorageType::valueOf(storage_type); + if (storage_type_id == RedisStorageType::UNKNOWN) { + storage_type_id = RedisStorageType::SIMPLE; + } + return storage_type_id; + } } #endif diff --git a/dbms/src/Dictionaries/RedisDictionarySource.h b/dbms/src/Dictionaries/RedisDictionarySource.h index 1e528ce40e9..37014e76360 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.h +++ b/dbms/src/Dictionaries/RedisDictionarySource.h @@ -22,6 +22,25 @@ namespace Poco namespace DB { + namespace RedisStorageType + { + enum Id + { + SIMPLE, + HASH_MAP, + UNKNOWN + }; + + Id valueOf(const std::string& value) + { + if (value == "simple") + return SIMPLE; + if (value == "hash_map") + return HASH_MAP; + return UNKNOWN; + } + } + class RedisDictionarySource final : public IDictionarySource { RedisDictionarySource( @@ -29,6 +48,7 @@ namespace DB const std::string & host, UInt16 port, UInt8 db_index, + RedisStorageType::Id storage_type, const Block & sample_block); public: @@ -55,6 +75,7 @@ namespace DB BlockInputStreamPtr loadKeys(const Columns & /* key_columns */, const std::vector & /* requested_rows */) override { + // Redis does not support native indexing throw Exception{"Method loadKeys is unsupported for RedisDictionarySource", ErrorCodes::NOT_IMPLEMENTED}; } @@ -66,11 +87,15 @@ namespace DB std::string toString() const override; + private: + static RedisStorageType::Id parseStorageType(const std::string& storage_type); + private: const DictionaryStructure dict_struct; const std::string host; const UInt16 port; const UInt8 db_index; // [0..15] + const 
RedisStorageType::Id storage_type; Block sample_block; std::shared_ptr client; From 20235753442a98a192e441ae7adf54b10743ac8f Mon Sep 17 00:00:00 2001 From: comunodi Date: Wed, 17 Apr 2019 02:26:57 +0300 Subject: [PATCH 021/309] Fix build --- dbms/src/Dictionaries/RedisDictionarySource.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index ce9c1e6f408..e55e5549b27 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -170,7 +170,7 @@ namespace DB for (const auto & secondary_key : reply_for_primary_key) primary_with_secondary->addRedisType(secondary_key); LOG_ERROR(&Logger::get("Redis"), "HKEYS: " + primary_with_secondary->toString()); - hkeys.addRedisType(primary_with_secondary); + hkeys.add(*primary_with_secondary); } keys = hkeys; } From 5bc446befe77403bdd166acd29972e5a6932011a Mon Sep 17 00:00:00 2001 From: comunodi Date: Wed, 17 Apr 2019 04:11:40 +0300 Subject: [PATCH 022/309] Parse date and datetime from Int64 --- .../Dictionaries/RedisBlockInputStream.cpp | 49 +++++++++++++------ 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index 8e9aece3670..9da5a92160b 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -72,9 +72,9 @@ namespace DB static_cast &>(column).getData().push_back( static_cast *>(value.get())->value()); break; - case Poco::Redis::RedisTypeTraits::TypeId: + case Poco::Redis::RedisTypeTraits::TypeId: static_cast &>(column).getData().push_back( - parse(static_cast *>(value.get())->value())); + parse(static_cast *>(value.get())->value())); break; case Poco::Redis::RedisTypeTraits::TypeId: { @@ -116,6 +116,35 @@ namespace DB } }; + auto getInt64IfCould = [&value]() + { + switch (value->type()) + { + case Poco::Redis::RedisTypeTraits::TypeId: + { + return static_cast *>(value.get())->value(); + } + case Poco::Redis::RedisTypeTraits::TypeId: + { + return parse( + static_cast *>(value.get())->value()); + } + case Poco::Redis::RedisTypeTraits::TypeId: + { + const auto & bs = static_cast *>( + value.get())->value(); + if (bs.isNull()) + throw Exception{"Unexpected null value", ErrorCodes::TYPE_MISMATCH}; + return parse(bs.value()); + } + default: + { + throw Exception{"Type mismatch, cannot convert to Int64, got type id = " + toString(value->type()), + ErrorCodes::TYPE_MISMATCH}; + } + } + }; + switch (type) { case ValueType::UInt8: @@ -158,25 +187,17 @@ namespace DB case ValueType::Date: { - if (value->type() != Poco::Redis::RedisTypeTraits::TypeId) - throw Exception{"Type mismatch, expected Int64 (Timestamp), got type id = " + toString(value->type()) + " for column " + name, - ErrorCodes::TYPE_MISMATCH}; - + Int64 int_value = getInt64IfCould(); static_cast(column).getData().push_back(UInt16{DateLUT::instance().toDayNum( - static_cast( - static_cast *>(value.get())->value()).epochTime())}); + static_cast(int_value).epochTime())}); break; } case ValueType::DateTime: { - if (value->type() != Poco::Redis::RedisTypeTraits::TypeId) - throw Exception{"Type mismatch, expected Int64 (Timestamp), got type id = " + toString(value->type()) + " for column " + name, - ErrorCodes::TYPE_MISMATCH}; - + Int64 int_value = getInt64IfCould(); static_cast(column).getData().push_back( - static_cast( - static_cast 
*>(value.get())->value()).epochTime()); + static_cast(int_value).epochTime()); break; } case ValueType::UUID: From 1265646dbfa817e04abe8eeeaa1e67ad9181429e Mon Sep 17 00:00:00 2001 From: comunodi Date: Wed, 17 Apr 2019 13:11:38 +0300 Subject: [PATCH 023/309] Cast types to expected in tests --- .../test_external_dictionaries/dictionary.py | 3 + .../external_sources.py | 18 +++++ .../test_external_dictionaries/test.py | 78 +++++++++++-------- 3 files changed, 65 insertions(+), 34 deletions(-) diff --git a/dbms/tests/integration/test_external_dictionaries/dictionary.py b/dbms/tests/integration/test_external_dictionaries/dictionary.py index 7e44aef455c..c468c2bfc67 100644 --- a/dbms/tests/integration/test_external_dictionaries/dictionary.py +++ b/dbms/tests/integration/test_external_dictionaries/dictionary.py @@ -46,6 +46,9 @@ class Row(object): def get_value_by_name(self, name): return self.data[name] + def set_value(self, name, value): + self.data[name] = value + class Field(object): def __init__(self, name, field_type, is_key=False, is_range_key=False, default=None, hierarchical=False, range_hash_type=None, default_value_for_get=None): diff --git a/dbms/tests/integration/test_external_dictionaries/external_sources.py b/dbms/tests/integration/test_external_dictionaries/external_sources.py index 57c862cbfe6..f7ab5315526 100644 --- a/dbms/tests/integration/test_external_dictionaries/external_sources.py +++ b/dbms/tests/integration/test_external_dictionaries/external_sources.py @@ -6,6 +6,8 @@ import redis from tzlocal import get_localzone import datetime import os +import dateutil.parser +import time class ExternalSource(object): @@ -36,6 +38,9 @@ class ExternalSource(object): def compatible_with_layout(self, layout): return True + def prepare_value_for_type(self, field, value): + return value + class SourceMySQL(ExternalSource): TYPE_MAPPING = { @@ -406,3 +411,16 @@ class SourceRedis(ExternalSource): print(cmd) self.client.execute_command(cmd) return + + def prepare_value_for_type(self, field, value): + if field.field_type == "Date": + dt = dateutil.parser.parse(value) + return int(time.mktime(dt.timetuple()) // 86400) + if field.field_type == "DateTime": + dt = dateutil.parser.parse(value) + return int(time.mktime(dt.timetuple())) + if field.field_type == "Float32": + return str(value) + if field.field_type == "Float64": + return str(value) + return value diff --git a/dbms/tests/integration/test_external_dictionaries/test.py b/dbms/tests/integration/test_external_dictionaries/test.py index d8b92f4e542..c42727c76a8 100644 --- a/dbms/tests/integration/test_external_dictionaries/test.py +++ b/dbms/tests/integration/test_external_dictionaries/test.py @@ -1,6 +1,5 @@ import pytest import os -import time from helpers.cluster import ClickHouseCluster from dictionary import Field, Row, Dictionary, DictionaryStructure, Layout @@ -138,28 +137,39 @@ def started_cluster(): finally: cluster.shutdown() +def prepare_row(dct, fields, values): + prepared_values = [] + for field, value in zip(fields, values): + prepared_values.append(dct.source.prepare_value_for_type(field, value)) + return Row(fields, prepared_values) + +def prepare_data(dct, fields, values_by_row): + data = [] + for row in values_by_row: + data.append(prepare_row(dct, fields, row)) + return data def test_simple_dictionaries(started_cluster): fields = FIELDS["simple"] - data = [ - Row(fields, - [1, 22, 333, 4444, 55555, -6, -77, - -888, -999, '550e8400-e29b-41d4-a716-446655440003', - '1973-06-28', '1985-02-28 23:43:25', 'hello', 22.543, 
3332154213.4, 0]), - Row(fields, - [2, 3, 4, 5, 6, -7, -8, - -9, -10, '550e8400-e29b-41d4-a716-446655440002', - '1978-06-28', '1986-02-28 23:42:25', 'hello', 21.543, 3222154213.4, 1]), + values_by_row = [ + [1, 22, 333, 4444, 55555, -6, -77, + -888, -999, '550e8400-e29b-41d4-a716-446655440003', + '1973-06-28', '1985-02-28 23:43:25', 'hello', 22.543, 3332154213.4, 0], + [2, 3, 4, 5, 6, -7, -8, + -9, -10, '550e8400-e29b-41d4-a716-446655440002', + '1978-06-28', '1986-02-28 23:42:25', 'hello', 21.543, 3222154213.4, 1], ] simple_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "simple"] for dct in simple_dicts: + data = prepare_data(dct, fields, values_by_row) dct.load_data(data) node.query("system reload dictionaries") queries_with_answers = [] for dct in simple_dicts: + data = prepare_data(dct, fields, values_by_row) for row in data: for field in fields: if not field.is_key: @@ -193,27 +203,27 @@ def test_simple_dictionaries(started_cluster): def test_complex_dictionaries(started_cluster): fields = FIELDS["complex"] - data = [ - Row(fields, - [1, 'world', 22, 333, 4444, 55555, -6, - -77, -888, -999, '550e8400-e29b-41d4-a716-446655440003', - '1973-06-28', '1985-02-28 23:43:25', - 'hello', 22.543, 3332154213.4]), - Row(fields, - [2, 'qwerty2', 52, 2345, 6544, 9191991, -2, - -717, -81818, -92929, '550e8400-e29b-41d4-a716-446655440007', - '1975-09-28', '2000-02-28 23:33:24', - 'my', 255.543, 3332221.44]), + values_by_row = [ + [1, 'world', 22, 333, 4444, 55555, -6, + -77, -888, -999, '550e8400-e29b-41d4-a716-446655440003', + '1973-06-28', '1985-02-28 23:43:25', + 'hello', 22.543, 3332154213.4], + [2, 'qwerty2', 52, 2345, 6544, 9191991, -2, + -717, -81818, -92929, '550e8400-e29b-41d4-a716-446655440007', + '1975-09-28', '2000-02-28 23:33:24', + 'my', 255.543, 3332221.44], ] complex_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "complex" and not d.is_kv] for dct in complex_dicts: + data = prepare_data(dct, fields, values_by_row) dct.load_data(data) node.query("system reload dictionaries") queries_with_answers = [] for dct in complex_dicts: + data = prepare_data(dct, fields, values_by_row) for row in data: for field in fields: if not field.is_key: @@ -232,29 +242,29 @@ def test_complex_dictionaries(started_cluster): def test_ranged_dictionaries(started_cluster): fields = FIELDS["ranged"] - data = [ - Row(fields, - [1, '2019-02-10', '2019-02-01', '2019-02-28', - 22, 333, 4444, 55555, -6, -77, -888, -999, - '550e8400-e29b-41d4-a716-446655440003', - '1973-06-28', '1985-02-28 23:43:25', 'hello', - 22.543, 3332154213.4]), - Row(fields, - [2, '2019-04-10', '2019-04-01', '2019-04-28', - 11, 3223, 41444, 52515, -65, -747, -8388, -9099, - '550e8400-e29b-41d4-a716-446655440004', - '1973-06-29', '2002-02-28 23:23:25', '!!!!', - 32.543, 3332543.4]), + values_by_row = [ + [1, '2019-02-10', '2019-02-01', '2019-02-28', + 22, 333, 4444, 55555, -6, -77, -888, -999, + '550e8400-e29b-41d4-a716-446655440003', + '1973-06-28', '1985-02-28 23:43:25', 'hello', + 22.543, 3332154213.4], + [2, '2019-04-10', '2019-04-01', '2019-04-28', + 11, 3223, 41444, 52515, -65, -747, -8388, -9099, + '550e8400-e29b-41d4-a716-446655440004', + '1973-06-29', '2002-02-28 23:23:25', '!!!!', + 32.543, 3332543.4], ] ranged_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "ranged" and not d.is_kv] for dct in ranged_dicts: + data = prepare_data(dct, fields, values_by_row) dct.load_data(data) node.query("system reload dictionaries") queries_with_answers = [] for dct in ranged_dicts: + 
data = prepare_data(dct, fields, values_by_row) for row in data: for field in fields: if not field.is_key and not field.is_range: From b05113188c0adb964141b89eadb69670aaa1bc13 Mon Sep 17 00:00:00 2001 From: comunodi Date: Wed, 17 Apr 2019 13:14:07 +0300 Subject: [PATCH 024/309] Style fix --- dbms/src/Dictionaries/RedisDictionarySource.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index e55e5549b27..4d511cd569c 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -204,11 +204,11 @@ namespace DB return "Redis: " + host + ':' + DB::toString(port); } - RedisStorageType::Id RedisDictionarySource::parseStorageType(const std::string & storage_type) { + RedisStorageType::Id RedisDictionarySource::parseStorageType(const std::string & storage_type) + { RedisStorageType::Id storage_type_id = RedisStorageType::valueOf(storage_type); - if (storage_type_id == RedisStorageType::UNKNOWN) { + if (storage_type_id == RedisStorageType::UNKNOWN) storage_type_id = RedisStorageType::SIMPLE; - } return storage_type_id; } } From f5806e4fb263e9287f81ee3bcdf2547c3735250b Mon Sep 17 00:00:00 2001 From: comunodi Date: Wed, 17 Apr 2019 14:35:02 +0300 Subject: [PATCH 025/309] Disable unsupported sources in tests --- .../test_external_dictionaries/external_sources.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dbms/tests/integration/test_external_dictionaries/external_sources.py b/dbms/tests/integration/test_external_dictionaries/external_sources.py index f7ab5315526..6830f9500c8 100644 --- a/dbms/tests/integration/test_external_dictionaries/external_sources.py +++ b/dbms/tests/integration/test_external_dictionaries/external_sources.py @@ -412,6 +412,11 @@ class SourceRedis(ExternalSource): self.client.execute_command(cmd) return + def compatible_with_layout(self, layout): + if not layout.is_simple: + return False + return True + def prepare_value_for_type(self, field, value): if field.field_type == "Date": dt = dateutil.parser.parse(value) From ccf89f4be6d6dee496c91da369b9579bc4754820 Mon Sep 17 00:00:00 2001 From: comunodi Date: Fri, 24 May 2019 02:42:21 +0300 Subject: [PATCH 026/309] Change LOG_ERROR to LOG_INFO --- dbms/src/Dictionaries/RedisBlockInputStream.cpp | 4 ++-- dbms/src/Dictionaries/RedisDictionarySource.cpp | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index 9da5a92160b..ed000f1c1a7 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -60,7 +60,7 @@ namespace DB template void insertNumber(IColumn & column, const Poco::Redis::RedisType::Ptr & value, const std::string & name) { - LOG_ERROR(&Logger::get("Redis"), "Got value: " + value->toString() + "with type=" + + LOG_INFO(&Logger::get("Redis"), "Got value: " + value->toString() + "with type=" + ", isInteger=" + DB::toString(value->isInteger()) + ", isSimpleString=" + DB::toString(value->isSimpleString()) + ", isBulkString=" + DB::toString(value->isBulkString()) + @@ -223,7 +223,7 @@ namespace DB for (size_t i = 0; i < 5; ++i) if (description.sample_block.columns() >= i + 1) - LOG_ERROR(&Logger::get("Redis"), description.sample_block.getByPosition(i).dumpStructure()); + LOG_INFO(&Logger::get("Redis"), 
description.sample_block.getByPosition(i).dumpStructure()); const size_t size = description.sample_block.columns(); // const size_t size = 2; diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index 4d511cd569c..a691161c968 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -144,16 +144,16 @@ namespace DB BlockInputStreamPtr RedisDictionarySource::loadAll() { - LOG_ERROR(&Logger::get("Redis"), "Redis in loadAll"); + LOG_INFO(&Logger::get("Redis"), "Redis in loadAll"); Poco::Redis::Command command_for_keys("KEYS"); command_for_keys << "*"; - LOG_ERROR(&Logger::get("Redis"), "Command for keys: " + command_for_keys.toString()); + LOG_INFO(&Logger::get("Redis"), "Command for keys: " + command_for_keys.toString()); Poco::Redis::Array keys = client->execute(command_for_keys); - LOG_ERROR(&Logger::get("Redis"), "Command for keys executed"); - LOG_ERROR(&Logger::get("Redis"), "KEYS: " + keys.toString()); + LOG_INFO(&Logger::get("Redis"), "Command for keys executed"); + LOG_INFO(&Logger::get("Redis"), "KEYS: " + keys.toString()); if (storage_type == RedisStorageType::HASH_MAP && dict_struct.key->size() == 2) { @@ -163,13 +163,13 @@ namespace DB Poco::Redis::Command command_for_secondary_keys("HKEYS"); command_for_secondary_keys.addRedisType(key); Poco::Redis::Array reply_for_primary_key = client->execute(command_for_secondary_keys); - LOG_ERROR(&Logger::get("Redis"), "Command for hkeys executed"); + LOG_INFO(&Logger::get("Redis"), "Command for hkeys executed"); Poco::SharedPtr primary_with_secondary; primary_with_secondary->addRedisType(key); for (const auto & secondary_key : reply_for_primary_key) primary_with_secondary->addRedisType(secondary_key); - LOG_ERROR(&Logger::get("Redis"), "HKEYS: " + primary_with_secondary->toString()); + LOG_INFO(&Logger::get("Redis"), "HKEYS: " + primary_with_secondary->toString()); hkeys.add(*primary_with_secondary); } keys = hkeys; @@ -181,7 +181,7 @@ namespace DB BlockInputStreamPtr RedisDictionarySource::loadIds(const std::vector & ids) { - LOG_ERROR(&Logger::get("Redis"), "Redis in loadIds"); + LOG_INFO(&Logger::get("Redis"), "Redis in loadIds"); if (storage_type != RedisStorageType::SIMPLE) throw Exception{"Cannot use loadIds with \'simple\' storage type", ErrorCodes::UNSUPPORTED_METHOD}; @@ -194,7 +194,7 @@ namespace DB for (UInt64 id : ids) keys << static_cast(id); - LOG_ERROR(&Logger::get("Redis"), "KEYS: " + keys.toString()); + LOG_INFO(&Logger::get("Redis"), "KEYS: " + keys.toString()); return std::make_shared(client, std::move(keys), sample_block, max_block_size); } From 1f0afdcf6893c02ad62d3ad2ad035f96a3ef096f Mon Sep 17 00:00:00 2001 From: comunodi Date: Sat, 25 May 2019 03:28:09 +0300 Subject: [PATCH 027/309] Parse all args as strings --- .../Dictionaries/RedisBlockInputStream.cpp | 137 +++++------------- .../Dictionaries/RedisDictionarySource.cpp | 12 +- 2 files changed, 44 insertions(+), 105 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index ed000f1c1a7..507d36b7b16 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -57,153 +57,88 @@ namespace DB using ValueType = ExternalResultDescription::ValueType; using RedisArray = Poco::Redis::Array; - template - void insertNumber(IColumn & column, const Poco::Redis::RedisType::Ptr & value, const std::string & name) + std::string 
getStringOrThrow(const Poco::Redis::RedisType::Ptr & value, const std::string & column_name) { - LOG_INFO(&Logger::get("Redis"), "Got value: " + value->toString() + "with type=" + - ", isInteger=" + DB::toString(value->isInteger()) + - ", isSimpleString=" + DB::toString(value->isSimpleString()) + - ", isBulkString=" + DB::toString(value->isBulkString()) + - ", isArray=" + DB::toString(value->isArray()) + - ", isError=" + DB::toString(value->isError())); + LOG_INFO(&Logger::get("Redis"), + "isNullableString=" + DB::toString(value->isBulkString()) + + ", isSimpleString=" + DB::toString(value->isSimpleString())); switch (value->type()) { - case Poco::Redis::RedisTypeTraits::TypeId: - static_cast &>(column).getData().push_back( - static_cast *>(value.get())->value()); - break; - case Poco::Redis::RedisTypeTraits::TypeId: - static_cast &>(column).getData().push_back( - parse(static_cast *>(value.get())->value())); - break; case Poco::Redis::RedisTypeTraits::TypeId: { - const auto & bs = - static_cast *>(value.get())->value(); + const auto & bs = static_cast *>(value.get())->value(); if (bs.isNull()) - static_cast &>(column).getData().emplace_back(); - else - static_cast &>(column).getData().push_back(parse(bs.value())); - break; + throw Exception{"Type mismatch, expected not null String for column " + column_name, + ErrorCodes::TYPE_MISMATCH}; + return bs.value(); } + case Poco::Redis::RedisTypeTraits::TypeId: + return static_cast *>(value.get())->value(); default: - throw Exception( - "Type mismatch, expected a number, got " + value->toString() + - " with type id = " + toString(value->type()) + " for column " + name, - ErrorCodes::TYPE_MISMATCH); + throw Exception{"Type mismatch, expected std::string, got type id = " + toString(value->type()) + " for column " + column_name, + ErrorCodes::TYPE_MISMATCH}; } } + template + inline void insert(IColumn & column, const String & stringValue) + { + static_cast &>(column).insertValue(parse(stringValue)); + } + void insertValue(IColumn & column, const ValueType type, const Poco::Redis::RedisType::Ptr & value, const std::string & name) { - auto getStringIfCould = [&value, &name]() - { - switch (value->type()) - { - case Poco::Redis::RedisTypeTraits::TypeId: - { - const auto & bs = static_cast *>(value.get())->value(); - if (bs.isNull()) - throw Exception{"Type mismatch, expected not null String for column " + name, - ErrorCodes::TYPE_MISMATCH}; - return bs.value(); - } - case Poco::Redis::RedisTypeTraits::TypeId: - return static_cast *>(value.get())->value(); - default: - throw Exception{"Type mismatch, expected std::string, got type id = " + toString(value->type()) + " for column " + name, - ErrorCodes::TYPE_MISMATCH}; - } - }; - - auto getInt64IfCould = [&value]() - { - switch (value->type()) - { - case Poco::Redis::RedisTypeTraits::TypeId: - { - return static_cast *>(value.get())->value(); - } - case Poco::Redis::RedisTypeTraits::TypeId: - { - return parse( - static_cast *>(value.get())->value()); - } - case Poco::Redis::RedisTypeTraits::TypeId: - { - const auto & bs = static_cast *>( - value.get())->value(); - if (bs.isNull()) - throw Exception{"Unexpected null value", ErrorCodes::TYPE_MISMATCH}; - return parse(bs.value()); - } - default: - { - throw Exception{"Type mismatch, cannot convert to Int64, got type id = " + toString(value->type()), - ErrorCodes::TYPE_MISMATCH}; - } - } - }; + String stringValue = getStringOrThrow(value, name); switch (type) { case ValueType::UInt8: - insertNumber(column, value, name); + insert(column, stringValue); break; 
case ValueType::UInt16: - insertNumber(column, value, name); + insert(column, stringValue); break; case ValueType::UInt32: - insertNumber(column, value, name); + insert(column, stringValue); break; case ValueType::UInt64: - insertNumber(column, value, name); + insert(column, stringValue); break; case ValueType::Int8: - insertNumber(column, value, name); + insert(column, stringValue); break; case ValueType::Int16: - insertNumber(column, value, name); + insert(column, stringValue); break; case ValueType::Int32: - insertNumber(column, value, name); + insert(column, stringValue); break; case ValueType::Int64: - insertNumber(column, value, name); + insert(column, stringValue); break; case ValueType::Float32: - insertNumber(column, value, name); + insert(column, stringValue); break; case ValueType::Float64: - insertNumber(column, value, name); + insert(column, stringValue); break; - case ValueType::String: - { - String string = getStringIfCould(); - static_cast(column).insertDataWithTerminatingZero(string.data(), string.size() + 1); + insert(column, stringValue); break; - } - case ValueType::Date: { - Int64 int_value = getInt64IfCould(); - static_cast(column).getData().push_back(UInt16{DateLUT::instance().toDayNum( - static_cast(int_value).epochTime())}); + static_cast(column).insertValue(parse(stringValue).getDayNum()); break; } case ValueType::DateTime: { - Int64 int_value = getInt64IfCould(); - static_cast(column).getData().push_back( - static_cast(int_value).epochTime()); + static_cast(column).insertValue(static_cast(parse(stringValue))); break; } case ValueType::UUID: { - String string = getStringIfCould(); - static_cast(column).getData().push_back(parse(string)); + static_cast(column).insertValue(parse(stringValue)); break; } } @@ -226,12 +161,6 @@ namespace DB LOG_INFO(&Logger::get("Redis"), description.sample_block.getByPosition(i).dumpStructure()); const size_t size = description.sample_block.columns(); -// const size_t size = 2; -// if (size != description.sample_block.columns()) -// throw Exception{"Unsupported number of columns for key-value storage: " -// + DB::toString(description.sample_block.columns()) -// + " (expected: " + DB::toString(size) + ")", -// ErrorCodes::LOGICAL_ERROR}; MutableColumns columns(description.sample_block.columns()); diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index a691161c968..0c99c785887 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -84,15 +84,20 @@ namespace DB , sample_block{sample_block} , client{std::make_shared(host, port)} { + LOG_INFO(&Logger::get("Redis"), "in ctor"); + LOG_INFO(&Logger::get("Redis"), dict_struct.attributes.size()); if (dict_struct.attributes.size() != 1) throw Exception{"Invalid number of non key columns for Redis source: " + DB::toString(dict_struct.attributes.size()) + ", expected 1", ErrorCodes::INVALID_CONFIG_PARAMETER}; + LOG_INFO(&Logger::get("Redis"), "After first check"); + if (storage_type == RedisStorageType::HASH_MAP) { + LOG_INFO(&Logger::get("Redis"), "SET STORAGE_TYPE"); if (!dict_struct.key.has_value()) - throw Exception{"Redis source with storage type \'hash_map\' mush have key", + throw Exception{"Redis source with storage type \'hash_map\' must have key", ErrorCodes::INVALID_CONFIG_PARAMETER}; if (dict_struct.key.value().size() > 2) throw Exception{"Redis source with complex keys having more than 2 attributes are unsupported", @@ -100,8 +105,11 @@ namespace DB // suppose 
key[0] is primary key, key[1] is secondary key } + LOG_INFO(&Logger::get("Redis"), "After second check"); + if (db_index != 0) { + LOG_INFO(&Logger::get("Redis"), "SET DB_INDEX"); Poco::Redis::Command command("SELECT"); command << static_cast(db_index); std::string reply = client->execute(command); @@ -109,6 +117,8 @@ namespace DB throw Exception{"Selecting db with index " + DB::toString(db_index) + " failed with reason " + reply, ErrorCodes::CANNOT_SELECT}; } + + LOG_INFO(&Logger::get("Redis"), "After third check"); } From ba879d95f7be0bd4f40089905a79dc7a3f142ebb Mon Sep 17 00:00:00 2001 From: comunodi Date: Sun, 26 May 2019 01:53:31 +0300 Subject: [PATCH 028/309] Unify keys handling --- dbms/src/Dictionaries/RedisBlockInputStream.cpp | 9 +-------- dbms/src/Dictionaries/RedisDictionarySource.cpp | 2 +- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index 507d36b7b16..afa411ce7f4 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -123,24 +123,17 @@ namespace DB insert(column, stringValue); break; case ValueType::String: - insert(column, stringValue); + static_cast(column).insert(parse(stringValue)); break; case ValueType::Date: - { static_cast(column).insertValue(parse(stringValue).getDayNum()); break; - } - case ValueType::DateTime: - { static_cast(column).insertValue(static_cast(parse(stringValue))); break; - } case ValueType::UUID: - { static_cast(column).insertValue(parse(stringValue)); break; - } } } diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index 0c99c785887..d77019cb423 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -202,7 +202,7 @@ namespace DB Poco::Redis::Array keys; for (UInt64 id : ids) - keys << static_cast(id); + keys << DB::toString(id); LOG_INFO(&Logger::get("Redis"), "KEYS: " + keys.toString()); From 179ad928746e4185d8818c5a2976d28f2e08cf64 Mon Sep 17 00:00:00 2001 From: comunodi Date: Sun, 26 May 2019 15:58:40 +0300 Subject: [PATCH 029/309] Fix diff with master --- dbms/src/Dictionaries/RedisBlockInputStream.h | 2 +- dbms/src/Dictionaries/RedisDictionarySource.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.h b/dbms/src/Dictionaries/RedisBlockInputStream.h index 95a563cee80..dc64ee0fdd4 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.h +++ b/dbms/src/Dictionaries/RedisBlockInputStream.h @@ -1,8 +1,8 @@ #pragma once #include -#include #include +#include namespace Poco diff --git a/dbms/src/Dictionaries/RedisDictionarySource.h b/dbms/src/Dictionaries/RedisDictionarySource.h index 37014e76360..7a0ffaaceb7 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.h +++ b/dbms/src/Dictionaries/RedisDictionarySource.h @@ -1,6 +1,7 @@ #pragma once #include +#include #if USE_POCO_REDIS # include "DictionaryStructure.h" @@ -31,7 +32,7 @@ namespace DB UNKNOWN }; - Id valueOf(const std::string& value) + Id valueOf(const std::string & value) { if (value == "simple") return SIMPLE; From b3d8ec3e0444b74c1e558a9c859952b5c7ddd16d Mon Sep 17 00:00:00 2001 From: comunodi Date: Sun, 26 May 2019 18:55:09 +0300 Subject: [PATCH 030/309] Handle Null keys --- dbms/src/Dictionaries/RedisBlockInputStream.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git 
a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index afa411ce7f4..bed8846cff5 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -57,6 +57,15 @@ namespace DB using ValueType = ExternalResultDescription::ValueType; using RedisArray = Poco::Redis::Array; + bool isNull(const Poco::Redis::RedisType::Ptr & value) + { + if (value.isNull()) + return true; + if (value->isBulkString()) + return static_cast *>(value.get())->value().isNull(); + return false; + } + std::string getStringOrThrow(const Poco::Redis::RedisType::Ptr & value, const std::string & column_name) { LOG_INFO(&Logger::get("Redis"), @@ -214,7 +223,7 @@ namespace DB Poco::Redis::Array values = client->execute(commandForValues); for (const auto & value : values) { - if (value.isNull()) + if (isNull(value)) insertDefaultValue(*columns[2], *description.sample_block.getByPosition(2).column); else insertValueByIdx(2, value); @@ -251,7 +260,7 @@ namespace DB Poco::Redis::Array values = client->execute(commandForValues); for (const auto & value : values) { - if (value.isNull()) + if (isNull(value)) insertDefaultValue(*columns[1], *description.sample_block.getByPosition(1).column); else insertValueByIdx(1, value); From a8ce7530c9558a94a8b864c7bbb3f89b3bb9ca89 Mon Sep 17 00:00:00 2001 From: comunodi Date: Tue, 28 May 2019 23:06:06 +0300 Subject: [PATCH 031/309] Put keys in result block only if value exists --- .../Dictionaries/RedisBlockInputStream.cpp | 41 ++++++++++--------- .../Dictionaries/RedisDictionarySource.cpp | 26 +----------- dbms/src/Dictionaries/RedisDictionarySource.h | 3 +- 3 files changed, 24 insertions(+), 46 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index bed8846cff5..639b1360c74 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -36,6 +36,7 @@ namespace DB extern const int TYPE_MISMATCH; extern const int LOGICAL_ERROR; extern const int LIMIT_EXCEEDED; + extern const int SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT; } @@ -59,11 +60,8 @@ namespace DB bool isNull(const Poco::Redis::RedisType::Ptr & value) { - if (value.isNull()) - return true; - if (value->isBulkString()) - return static_cast *>(value.get())->value().isNull(); - return false; + return value->isBulkString() && + static_cast *>(value.get())->value().isNull(); } std::string getStringOrThrow(const Poco::Redis::RedisType::Ptr & value, const std::string & column_name) @@ -158,10 +156,6 @@ namespace DB if (all_read) return {}; - for (size_t i = 0; i < 5; ++i) - if (description.sample_block.columns() >= i + 1) - LOG_INFO(&Logger::get("Redis"), description.sample_block.getByPosition(i).dumpStructure()); - const size_t size = description.sample_block.columns(); MutableColumns columns(description.sample_block.columns()); @@ -220,6 +214,7 @@ namespace DB commandForValues.addRedisType(secondary_key); } + // FIXME: fix insert Poco::Redis::Array values = client->execute(commandForValues); for (const auto & value : values) { @@ -235,10 +230,10 @@ namespace DB } else { - size_t num_rows = 0; Poco::Redis::Command commandForValues("MGET"); - while (num_rows < max_block_size) + // keys.size() > 0 + for (size_t num_rows = 0; num_rows < max_block_size; ++num_rows) { if (cursor >= keys.size()) { @@ -247,23 +242,29 @@ namespace DB } const auto & key = *(keys.begin() + cursor); - insertValueByIdx(0, key); 
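The loop being rewritten here batches the pending keys into a single `MGET` and later walks the reply in lock-step with the keys it sent. The sketch below is not part of the patch; it only illustrates the contract that logic relies on: the reply array is positionally aligned with the requested keys, and a null bulk string means "no value for the requested key", which is why such keys no longer produce rows. The address and key names are placeholders, and the null check mirrors the `isNull()` helper added a few hunks earlier:

```cpp
#include <iostream>
#include <Poco/Redis/Array.h>
#include <Poco/Redis/Client.h>
#include <Poco/Redis/Command.h>
#include <Poco/Redis/Type.h>

/// True when a reply element is a null bulk string, i.e. Redis holds no value
/// for the key at the same position in the request.
static bool isNullReply(const Poco::Redis::RedisType::Ptr & value)
{
    return value->isBulkString()
        && static_cast<Poco::Redis::Type<Poco::Redis::BulkString> *>(value.get())->value().isNull();
}

int main()
{
    Poco::Redis::Client client("localhost", 6379);   // placeholder address

    Poco::Redis::Command mget("MGET");
    mget << "1" << "2" << "no_such_key";             // ids are sent as strings

    auto values = client.execute<Poco::Redis::Array>(mget);
    for (size_t i = 0; i < values.size(); ++i)
    {
        const auto & value = *(values.begin() + i);
        if (isNullReply(value))
            std::cout << "key #" << i << ": absent, no row emitted\n";
        else
            std::cout << "key #" << i << ": present\n";
    }
    return 0;
}
```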
commandForValues.addRedisType(key); - - ++num_rows; ++cursor; } - if (num_rows == 0) - return {}; - Poco::Redis::Array values = client->execute(commandForValues); - for (const auto & value : values) + if (commandForValues.size() != values.size() + 1) + throw Exception{"Inconsistent sizes of keys and values in Redis request", + ErrorCodes::SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT}; + + for (size_t num_rows = 0; num_rows < values.size(); ++num_rows) { - if (isNull(value)) + const auto & key = *(keys.begin() + cursor - num_rows - 1); + const auto & value = *(values.begin() + values.size() - num_rows - 1); + if (value.isNull()) + { + insertValueByIdx(0, key); insertDefaultValue(*columns[1], *description.sample_block.getByPosition(1).column); - else + } + else if (!isNull(value)) // null string means 'no value for requested key' + { + insertValueByIdx(0, key); insertValueByIdx(1, value); + } } } diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index d77019cb423..051f6dfaf34 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -52,9 +52,6 @@ namespace DB # include # include "RedisBlockInputStream.h" -# include "Poco/Logger.h" -# include "common/logger_useful.h" - namespace DB { @@ -84,18 +81,13 @@ namespace DB , sample_block{sample_block} , client{std::make_shared(host, port)} { - LOG_INFO(&Logger::get("Redis"), "in ctor"); - LOG_INFO(&Logger::get("Redis"), dict_struct.attributes.size()); if (dict_struct.attributes.size() != 1) throw Exception{"Invalid number of non key columns for Redis source: " + DB::toString(dict_struct.attributes.size()) + ", expected 1", ErrorCodes::INVALID_CONFIG_PARAMETER}; - LOG_INFO(&Logger::get("Redis"), "After first check"); - if (storage_type == RedisStorageType::HASH_MAP) { - LOG_INFO(&Logger::get("Redis"), "SET STORAGE_TYPE"); if (!dict_struct.key.has_value()) throw Exception{"Redis source with storage type \'hash_map\' must have key", ErrorCodes::INVALID_CONFIG_PARAMETER}; @@ -105,11 +97,8 @@ namespace DB // suppose key[0] is primary key, key[1] is secondary key } - LOG_INFO(&Logger::get("Redis"), "After second check"); - if (db_index != 0) { - LOG_INFO(&Logger::get("Redis"), "SET DB_INDEX"); Poco::Redis::Command command("SELECT"); command << static_cast(db_index); std::string reply = client->execute(command); @@ -117,8 +106,6 @@ namespace DB throw Exception{"Selecting db with index " + DB::toString(db_index) + " failed with reason " + reply, ErrorCodes::CANNOT_SELECT}; } - - LOG_INFO(&Logger::get("Redis"), "After third check"); } @@ -154,17 +141,11 @@ namespace DB BlockInputStreamPtr RedisDictionarySource::loadAll() { - LOG_INFO(&Logger::get("Redis"), "Redis in loadAll"); - Poco::Redis::Command command_for_keys("KEYS"); command_for_keys << "*"; - LOG_INFO(&Logger::get("Redis"), "Command for keys: " + command_for_keys.toString()); Poco::Redis::Array keys = client->execute(command_for_keys); - LOG_INFO(&Logger::get("Redis"), "Command for keys executed"); - LOG_INFO(&Logger::get("Redis"), "KEYS: " + keys.toString()); - if (storage_type == RedisStorageType::HASH_MAP && dict_struct.key->size() == 2) { Poco::Redis::Array hkeys; @@ -173,13 +154,12 @@ namespace DB Poco::Redis::Command command_for_secondary_keys("HKEYS"); command_for_secondary_keys.addRedisType(key); Poco::Redis::Array reply_for_primary_key = client->execute(command_for_secondary_keys); - LOG_INFO(&Logger::get("Redis"), "Command for hkeys executed"); Poco::SharedPtr 
primary_with_secondary; primary_with_secondary->addRedisType(key); for (const auto & secondary_key : reply_for_primary_key) primary_with_secondary->addRedisType(secondary_key); - LOG_INFO(&Logger::get("Redis"), "HKEYS: " + primary_with_secondary->toString()); + hkeys.add(*primary_with_secondary); } keys = hkeys; @@ -191,8 +171,6 @@ namespace DB BlockInputStreamPtr RedisDictionarySource::loadIds(const std::vector & ids) { - LOG_INFO(&Logger::get("Redis"), "Redis in loadIds"); - if (storage_type != RedisStorageType::SIMPLE) throw Exception{"Cannot use loadIds with \'simple\' storage type", ErrorCodes::UNSUPPORTED_METHOD}; @@ -204,8 +182,6 @@ namespace DB for (UInt64 id : ids) keys << DB::toString(id); - LOG_INFO(&Logger::get("Redis"), "KEYS: " + keys.toString()); - return std::make_shared(client, std::move(keys), sample_block, max_block_size); } diff --git a/dbms/src/Dictionaries/RedisDictionarySource.h b/dbms/src/Dictionaries/RedisDictionarySource.h index 7a0ffaaceb7..d56de626a9a 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.h +++ b/dbms/src/Dictionaries/RedisDictionarySource.h @@ -2,6 +2,7 @@ #include #include + #if USE_POCO_REDIS # include "DictionaryStructure.h" @@ -95,7 +96,7 @@ namespace DB const DictionaryStructure dict_struct; const std::string host; const UInt16 port; - const UInt8 db_index; // [0..15] + const UInt8 db_index; const RedisStorageType::Id storage_type; Block sample_block; From 67059d8ed1b86234611b5e2cb02cd7df5332c06e Mon Sep 17 00:00:00 2001 From: comunodi Date: Tue, 28 May 2019 23:17:30 +0300 Subject: [PATCH 032/309] Add tests only for kv storages --- dbms/tests/integration/pytest.ini | 2 +- .../test_external_dictionaries/dictionary.py | 27 +- .../external_sources.py | 111 ++++-- .../test_external_dictionaries/test.py | 118 +++---- .../test_external_dictionaries/test_kv.py | 321 ++++++++++++++++++ 5 files changed, 474 insertions(+), 105 deletions(-) create mode 100644 dbms/tests/integration/test_external_dictionaries/test_kv.py diff --git a/dbms/tests/integration/pytest.ini b/dbms/tests/integration/pytest.ini index e51d0efad3d..dc5bb603b63 100644 --- a/dbms/tests/integration/pytest.ini +++ b/dbms/tests/integration/pytest.ini @@ -1,3 +1,3 @@ [pytest] -python_files = test.py +python_files = test*.py norecursedirs = _instances diff --git a/dbms/tests/integration/test_external_dictionaries/dictionary.py b/dbms/tests/integration/test_external_dictionaries/dictionary.py index c468c2bfc67..05aa9bfa59d 100644 --- a/dbms/tests/integration/test_external_dictionaries/dictionary.py +++ b/dbms/tests/integration/test_external_dictionaries/dictionary.py @@ -1,4 +1,4 @@ -#-*- coding: utf-8 -*- +# -*- coding: utf-8 -*- import copy @@ -9,7 +9,7 @@ class Layout(object): 'cache': '128', 'complex_key_hashed': '', 'complex_key_cache': '128', - 'range_hashed': '' + 'range_hashed': '', } def __init__(self, name): @@ -18,13 +18,13 @@ class Layout(object): self.is_simple = False self.is_ranged = False if self.name.startswith('complex'): - self.layout_type = "complex" + self.layout_type = 'complex' self.is_complex = True - elif name.startswith("range"): - self.layout_type = "ranged" + elif name.startswith('range'): + self.layout_type = 'ranged' self.is_ranged = True else: - self.layout_type = "simple" + self.layout_type = 'simple' self.is_simple = True def get_str(self): @@ -33,8 +33,7 @@ class Layout(object): def get_key_block_name(self): if self.is_complex: return 'key' - else: - return 'id' + return 'id' class Row(object): @@ -90,13 +89,12 @@ class Field(object): class 
DictionaryStructure(object): - def __init__(self, layout, fields, is_kv=False): + def __init__(self, layout, fields): self.layout = layout self.keys = [] self.range_key = None self.ordinary_fields = [] self.range_fields = [] - self.is_kv = is_kv for field in fields: if field.is_key: @@ -121,14 +119,12 @@ class DictionaryStructure(object): fields_strs = [] for field in self.ordinary_fields: fields_strs.append(field.get_attribute_str()) - if self.is_kv: - break key_strs = [] if self.layout.is_complex: for key_field in self.keys: key_strs.append(key_field.get_attribute_str()) - else: # same for simple and ranged + else: # same for simple and ranged for key_field in self.keys: key_strs.append(key_field.get_simple_index_str()) @@ -288,13 +284,14 @@ class DictionaryStructure(object): class Dictionary(object): - def __init__(self, name, structure, source, config_path, table_name): + def __init__(self, name, structure, source, config_path, table_name, fields=None, values=None): self.name = name self.structure = copy.deepcopy(structure) self.source = copy.deepcopy(source) self.config_path = config_path self.table_name = table_name - self.is_kv = source.is_kv + self.fields = fields + self.values = values def generate_config(self): with open(self.config_path, 'w') as result: diff --git a/dbms/tests/integration/test_external_dictionaries/external_sources.py b/dbms/tests/integration/test_external_dictionaries/external_sources.py index 6830f9500c8..a22cc6e024f 100644 --- a/dbms/tests/integration/test_external_dictionaries/external_sources.py +++ b/dbms/tests/integration/test_external_dictionaries/external_sources.py @@ -3,6 +3,7 @@ import warnings import pymysql.cursors import pymongo import redis +import aerospike from tzlocal import get_localzone import datetime import os @@ -12,7 +13,7 @@ import time class ExternalSource(object): def __init__(self, name, internal_hostname, internal_port, - docker_hostname, docker_port, user, password, is_kv): + docker_hostname, docker_port, user, password, storage_type=None): self.name = name self.internal_hostname = internal_hostname self.internal_port = int(internal_port) @@ -20,7 +21,7 @@ class ExternalSource(object): self.docker_port = int(docker_port) self.user = user self.password = password - self.is_kv = is_kv + self.storage_type = storage_type def get_source_str(self, table_name): raise NotImplementedError("Method {} is not implemented for {}".format( @@ -38,9 +39,6 @@ class ExternalSource(object): def compatible_with_layout(self, layout): return True - def prepare_value_for_type(self, field, value): - return value - class SourceMySQL(ExternalSource): TYPE_MAPPING = { @@ -388,10 +386,12 @@ class SourceRedis(ExternalSource): {host} {port} 0 + {storage_type} '''.format( host=self.docker_hostname, port=self.docker_port, + storage_type=self.storage_type, # simple or hash_map ) def prepare(self, structure, table_name, cluster): @@ -399,33 +399,96 @@ class SourceRedis(ExternalSource): self.prepared = True def load_data(self, data, table_name): - for row_num, row in enumerate(data): # FIXME: yield - self.client.execute_command("FLUSHDB") + self.client.flushdb() + for row in data: for cell_name, cell_value in row.data.items(): value_type = "$" if isinstance(cell_value, int): value_type = ":" else: cell_value = '"' + str(cell_value).replace(' ', '\s') + '"' - cmd = "SET " + "$" + cell_name + " " + value_type + str(cell_value) + cmd = "SET ${} {}{}".format(cell_name, value_type, cell_value) print(cmd) self.client.execute_command(cmd) - return + + def 
load_kv_data(self, values): + self.client.flushdb() + if len(values[0]) == 2: + self.client.mset({value[0]: value[1] for value in values}) + else: + for value in values: + self.client.hset(value[0], value[1], value[2]) def compatible_with_layout(self, layout): - if not layout.is_simple: - return False - return True + if layout.is_simple and self.storage_type == "simple" or layout.is_complex and self.storage_type == "simple": + return True + return False - def prepare_value_for_type(self, field, value): - if field.field_type == "Date": - dt = dateutil.parser.parse(value) - return int(time.mktime(dt.timetuple()) // 86400) - if field.field_type == "DateTime": - dt = dateutil.parser.parse(value) - return int(time.mktime(dt.timetuple())) - if field.field_type == "Float32": - return str(value) - if field.field_type == "Float64": - return str(value) - return value + +class SourceAerospike(ExternalSource): + def __init__(self, name, internal_hostname, internal_port, + docker_hostname, docker_port, user, password, storage_type=None): + ExternalSource.__init__(self, name, internal_hostname, internal_port, + docker_hostname, docker_port, user, password, storage_type) + self.namespace = "test" + self.set = "test_set" + + def get_source_str(self, table_name): + print("AEROSPIKE get source str") + return ''' + + {host} + {port} + + '''.format( + host=self.docker_hostname, + port=self.docker_port, + storage_type=self.storage_type, # simple or hash_map + ) + + def prepare(self, structure, table_name, cluster): + config = { + 'hosts': [ (self.internal_hostname, self.internal_port) ] + } + self.client = aerospike.client(config).connect() + self.prepared = True + print("PREPARED AEROSPIKE") + print(config) + + def compatible_with_layout(self, layout): + print("compatible AEROSPIKE") + return layout.is_simple + + def _flush_aerospike_db(self): + keys = [] + + def handle_record((key, metadata, record)): + print("Handle record {} {}".format(key, record)) + keys.append(key) + + def print_record((key, metadata, record)): + print("Print record {} {}".format(key, record)) + + scan = self.client.scan(self.namespace, self.set) + scan.foreach(handle_record) + + [self.client.remove(key) for key in keys] + + def load_kv_data(self, values): + self._flush_aerospike_db() + + print("Load KV Data Aerospike") + if len(values[0]) == 2: + for value in values: + key = (self.namespace, self.set, value[0]) + print(key) + self.client.put(key, {"bin_value": value[1]}, policy={"key": aerospike.POLICY_KEY_SEND}) + assert self.client.exists(key) + else: + assert("VALUES SIZE != 2") + + # print(values) + + def load_data(self, data, table_name): + print("Load Data Aerospike") + # print(data) diff --git a/dbms/tests/integration/test_external_dictionaries/test.py b/dbms/tests/integration/test_external_dictionaries/test.py index c42727c76a8..841a9124af0 100644 --- a/dbms/tests/integration/test_external_dictionaries/test.py +++ b/dbms/tests/integration/test_external_dictionaries/test.py @@ -3,8 +3,8 @@ import os from helpers.cluster import ClickHouseCluster from dictionary import Field, Row, Dictionary, DictionaryStructure, Layout -from external_sources import SourceMySQL, SourceClickHouse, SourceFile, SourceExecutableCache, SourceExecutableHashed, SourceMongo -from external_sources import SourceHTTP, SourceHTTPS, SourceRedis +from external_sources import SourceMySQL, SourceClickHouse, SourceFile, SourceExecutableCache, SourceExecutableHashed +from external_sources import SourceMongo, SourceHTTP, SourceHTTPS SCRIPT_DIR = 
os.path.dirname(os.path.realpath(__file__)) @@ -78,16 +78,15 @@ LAYOUTS = [ ] SOURCES = [ - SourceRedis("Redis", "localhost", "6380", "redis1", "6379", "", "", True), - SourceMongo("MongoDB", "localhost", "27018", "mongo1", "27017", "root", "clickhouse", False), - SourceMySQL("MySQL", "localhost", "3308", "mysql1", "3306", "root", "clickhouse", False), - SourceClickHouse("RemoteClickHouse", "localhost", "9000", "clickhouse1", "9000", "default", "", False), - SourceClickHouse("LocalClickHouse", "localhost", "9000", "node", "9000", "default", "", False), - SourceFile("File", "localhost", "9000", "node", "9000", "", "", False), - SourceExecutableHashed("ExecutableHashed", "localhost", "9000", "node", "9000", "", "", False), - SourceExecutableCache("ExecutableCache", "localhost", "9000", "node", "9000", "", "", False), - SourceHTTP("SourceHTTP", "localhost", "9000", "clickhouse1", "9000", "", "", False), - SourceHTTPS("SourceHTTPS", "localhost", "9000", "clickhouse1", "9000", "", "", False), + SourceMongo("MongoDB", "localhost", "27018", "mongo1", "27017", "root", "clickhouse"), + SourceMySQL("MySQL", "localhost", "3308", "mysql1", "3306", "root", "clickhouse"), + SourceClickHouse("RemoteClickHouse", "localhost", "9000", "clickhouse1", "9000", "default", ""), + SourceClickHouse("LocalClickHouse", "localhost", "9000", "node", "9000", "default", ""), + SourceFile("File", "localhost", "9000", "node", "9000", "", ""), + SourceExecutableHashed("ExecutableHashed", "localhost", "9000", "node", "9000", "", ""), + SourceExecutableCache("ExecutableCache", "localhost", "9000", "node", "9000", "", ""), + SourceHTTP("SourceHTTP", "localhost", "9000", "clickhouse1", "9000", "", ""), + SourceHTTPS("SourceHTTPS", "localhost", "9000", "clickhouse1", "9000", "", ""), ] DICTIONARIES = [] @@ -95,6 +94,7 @@ DICTIONARIES = [] cluster = None node = None + def setup_module(module): global DICTIONARIES global cluster @@ -107,9 +107,9 @@ def setup_module(module): for layout in LAYOUTS: for source in SOURCES: if source.compatible_with_layout(layout): - structure = DictionaryStructure(layout, FIELDS[layout.layout_type], source.is_kv) + structure = DictionaryStructure(layout, FIELDS[layout.layout_type]) dict_name = source.name + "_" + layout.name - dict_path = os.path.join(dict_configs_path, dict_name + '.xml') # FIXME: single xml config for every column + dict_path = os.path.join(dict_configs_path, dict_name + '.xml') dictionary = Dictionary(dict_name, structure, source, dict_path, "table_" + dict_name) dictionary.generate_config() DICTIONARIES.append(dictionary) @@ -120,9 +120,10 @@ def setup_module(module): for fname in os.listdir(dict_configs_path): main_configs.append(os.path.join(dict_configs_path, fname)) cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs')) - node = cluster.add_instance('node', main_configs=main_configs, with_mysql=True, with_mongo=True, with_redis=True) + node = cluster.add_instance('node', main_configs=main_configs, with_mysql=True, with_mongo=True) cluster.add_instance('clickhouse1') + @pytest.fixture(scope="module") def started_cluster(): try: @@ -137,39 +138,28 @@ def started_cluster(): finally: cluster.shutdown() -def prepare_row(dct, fields, values): - prepared_values = [] - for field, value in zip(fields, values): - prepared_values.append(dct.source.prepare_value_for_type(field, value)) - return Row(fields, prepared_values) - -def prepare_data(dct, fields, values_by_row): - data = [] - for row in values_by_row: - data.append(prepare_row(dct, fields, 
row)) - return data def test_simple_dictionaries(started_cluster): fields = FIELDS["simple"] - values_by_row = [ - [1, 22, 333, 4444, 55555, -6, -77, - -888, -999, '550e8400-e29b-41d4-a716-446655440003', - '1973-06-28', '1985-02-28 23:43:25', 'hello', 22.543, 3332154213.4, 0], - [2, 3, 4, 5, 6, -7, -8, - -9, -10, '550e8400-e29b-41d4-a716-446655440002', - '1978-06-28', '1986-02-28 23:42:25', 'hello', 21.543, 3222154213.4, 1], + data = [ + Row(fields, + [1, 22, 333, 4444, 55555, -6, -77, + -888, -999, '550e8400-e29b-41d4-a716-446655440003', + '1973-06-28', '1985-02-28 23:43:25', 'hello', 22.543, 3332154213.4, 0]), + Row(fields, + [2, 3, 4, 5, 6, -7, -8, + -9, -10, '550e8400-e29b-41d4-a716-446655440002', + '1978-06-28', '1986-02-28 23:42:25', 'hello', 21.543, 3222154213.4, 1]), ] simple_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "simple"] for dct in simple_dicts: - data = prepare_data(dct, fields, values_by_row) dct.load_data(data) node.query("system reload dictionaries") queries_with_answers = [] for dct in simple_dicts: - data = prepare_data(dct, fields, values_by_row) for row in data: for field in fields: if not field.is_key: @@ -181,8 +171,6 @@ def test_simple_dictionaries(started_cluster): for query in dct.get_select_get_or_default_queries(field, row): queries_with_answers.append((query, field.default_value_for_get)) - if dct.is_kv: - break for query in dct.get_hierarchical_queries(data[0]): queries_with_answers.append((query, [1])) @@ -201,29 +189,30 @@ def test_simple_dictionaries(started_cluster): answer = str(answer).replace(' ', '') assert node.query(query) == str(answer) + '\n' + def test_complex_dictionaries(started_cluster): fields = FIELDS["complex"] - values_by_row = [ - [1, 'world', 22, 333, 4444, 55555, -6, - -77, -888, -999, '550e8400-e29b-41d4-a716-446655440003', - '1973-06-28', '1985-02-28 23:43:25', - 'hello', 22.543, 3332154213.4], - [2, 'qwerty2', 52, 2345, 6544, 9191991, -2, - -717, -81818, -92929, '550e8400-e29b-41d4-a716-446655440007', - '1975-09-28', '2000-02-28 23:33:24', - 'my', 255.543, 3332221.44], + data = [ + Row(fields, + [1, 'world', 22, 333, 4444, 55555, -6, + -77, -888, -999, '550e8400-e29b-41d4-a716-446655440003', + '1973-06-28', '1985-02-28 23:43:25', + 'hello', 22.543, 3332154213.4]), + Row(fields, + [2, 'qwerty2', 52, 2345, 6544, 9191991, -2, + -717, -81818, -92929, '550e8400-e29b-41d4-a716-446655440007', + '1975-09-28', '2000-02-28 23:33:24', + 'my', 255.543, 3332221.44]), ] - complex_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "complex" and not d.is_kv] + complex_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "complex"] for dct in complex_dicts: - data = prepare_data(dct, fields, values_by_row) dct.load_data(data) node.query("system reload dictionaries") queries_with_answers = [] for dct in complex_dicts: - data = prepare_data(dct, fields, values_by_row) for row in data: for field in fields: if not field.is_key: @@ -240,38 +229,37 @@ def test_complex_dictionaries(started_cluster): print query assert node.query(query) == str(answer) + '\n' + def test_ranged_dictionaries(started_cluster): fields = FIELDS["ranged"] - values_by_row = [ - [1, '2019-02-10', '2019-02-01', '2019-02-28', - 22, 333, 4444, 55555, -6, -77, -888, -999, - '550e8400-e29b-41d4-a716-446655440003', - '1973-06-28', '1985-02-28 23:43:25', 'hello', - 22.543, 3332154213.4], - [2, '2019-04-10', '2019-04-01', '2019-04-28', - 11, 3223, 41444, 52515, -65, -747, -8388, -9099, - '550e8400-e29b-41d4-a716-446655440004', - 
'1973-06-29', '2002-02-28 23:23:25', '!!!!', - 32.543, 3332543.4], + data = [ + Row(fields, + [1, '2019-02-10', '2019-02-01', '2019-02-28', + 22, 333, 4444, 55555, -6, -77, -888, -999, + '550e8400-e29b-41d4-a716-446655440003', + '1973-06-28', '1985-02-28 23:43:25', 'hello', + 22.543, 3332154213.4]), + Row(fields, + [2, '2019-04-10', '2019-04-01', '2019-04-28', + 11, 3223, 41444, 52515, -65, -747, -8388, -9099, + '550e8400-e29b-41d4-a716-446655440004', + '1973-06-29', '2002-02-28 23:23:25', '!!!!', + 32.543, 3332543.4]), ] - ranged_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "ranged" and not d.is_kv] + ranged_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "ranged"] for dct in ranged_dicts: - data = prepare_data(dct, fields, values_by_row) dct.load_data(data) node.query("system reload dictionaries") queries_with_answers = [] for dct in ranged_dicts: - data = prepare_data(dct, fields, values_by_row) for row in data: for field in fields: if not field.is_key and not field.is_range: for query in dct.get_select_get_queries(field, row): queries_with_answers.append((query, row.get_value_by_name(field.name))) - if dct.is_kv: - break for query, answer in queries_with_answers: print query diff --git a/dbms/tests/integration/test_external_dictionaries/test_kv.py b/dbms/tests/integration/test_external_dictionaries/test_kv.py new file mode 100644 index 00000000000..b085e89b7d9 --- /dev/null +++ b/dbms/tests/integration/test_external_dictionaries/test_kv.py @@ -0,0 +1,321 @@ +import os + +import pytest +from dictionary import Field, Row, Dictionary, DictionaryStructure, Layout +from external_sources import SourceRedis, SourceAerospike + +from helpers.cluster import ClickHouseCluster + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + +FIELDS = { + "simple": [ + Field("KeyField", 'UInt64', is_key=True, default_value_for_get=9999999), + Field("UInt8_", 'UInt8', default_value_for_get=55), + Field("UInt16_", 'UInt16', default_value_for_get=66), + Field("UInt32_", 'UInt32', default_value_for_get=77), + Field("UInt64_", 'UInt64', default_value_for_get=88), + Field("Int8_", 'Int8', default_value_for_get=-55), + Field("Int16_", 'Int16', default_value_for_get=-66), + Field("Int32_", 'Int32', default_value_for_get=-77), + Field("Int64_", 'Int64', default_value_for_get=-88), + Field("UUID_", 'UUID', default_value_for_get='550e8400-0000-0000-0000-000000000000'), + Field("Date_", 'Date', default_value_for_get='2018-12-30'), + Field("DateTime_", 'DateTime', default_value_for_get='2018-12-30 00:00:00'), + Field("String_", 'String', default_value_for_get='hi'), + Field("Float32_", 'Float32', default_value_for_get=555.11), + Field("Float64_", 'Float64', default_value_for_get=777.11), + Field("ParentKeyField", "UInt64", default_value_for_get=444, hierarchical=True), + ], + "complex": [ + Field("KeyField1", 'UInt64', is_key=True, default_value_for_get=9999999), + Field("KeyField2", 'String', is_key=True, default_value_for_get='xxxxxxxxx'), + Field("UInt8_", 'UInt8', default_value_for_get=55), + Field("UInt16_", 'UInt16', default_value_for_get=66), + Field("UInt32_", 'UInt32', default_value_for_get=77), + Field("UInt64_", 'UInt64', default_value_for_get=88), + Field("Int8_", 'Int8', default_value_for_get=-55), + Field("Int16_", 'Int16', default_value_for_get=-66), + Field("Int32_", 'Int32', default_value_for_get=-77), + Field("Int64_", 'Int64', default_value_for_get=-88), + Field("UUID_", 'UUID', default_value_for_get='550e8400-0000-0000-0000-000000000000'), + 
Field("Date_", 'Date', default_value_for_get='2018-12-30'), + Field("DateTime_", 'DateTime', default_value_for_get='2018-12-30 00:00:00'), + Field("String_", 'String', default_value_for_get='hi'), + Field("Float32_", 'Float32', default_value_for_get=555.11), + Field("Float64_", 'Float64', default_value_for_get=777.11), + ], + "ranged": [ + Field("KeyField1", 'UInt64', is_key=True), + Field("KeyField2", 'Date', is_range_key=True), + Field("StartDate", 'Date', range_hash_type='min'), + Field("EndDate", 'Date', range_hash_type='max'), + Field("UInt8_", 'UInt8', default_value_for_get=55), + Field("UInt16_", 'UInt16', default_value_for_get=66), + Field("UInt32_", 'UInt32', default_value_for_get=77), + Field("UInt64_", 'UInt64', default_value_for_get=88), + Field("Int8_", 'Int8', default_value_for_get=-55), + Field("Int16_", 'Int16', default_value_for_get=-66), + Field("Int32_", 'Int32', default_value_for_get=-77), + Field("Int64_", 'Int64', default_value_for_get=-88), + Field("UUID_", 'UUID', default_value_for_get='550e8400-0000-0000-0000-000000000000'), + Field("Date_", 'Date', default_value_for_get='2018-12-30'), + Field("DateTime_", 'DateTime', default_value_for_get='2018-12-30 00:00:00'), + Field("String_", 'String', default_value_for_get='hi'), + Field("Float32_", 'Float32', default_value_for_get=555.11), + Field("Float64_", 'Float64', default_value_for_get=777.11), + ], +} + +VALUES = { + "simple": [ + [ + 1, 22, 333, 4444, 55555, -6, -77, + -888, -999, '550e8400-e29b-41d4-a716-446655440003', + '1973-06-28', '1985-02-28 23:43:25', 'hello', 22.543, 3332154213.4, 0, + ], + [ + 2, 3, 4, 5, 6, -7, -8, + -9, -10, '550e8400-e29b-41d4-a716-446655440002', + '1978-06-28', '1986-02-28 23:42:25', 'hello', 21.543, 3222154213.4, 1, + ], + ], + "complex": [ + [ + 1, 'world', 22, 333, 4444, 55555, -6, + -77, -888, -999, '550e8400-e29b-41d4-a716-446655440003', + '1973-06-28', '1985-02-28 23:43:25', + 'hello', 22.543, 3332154213.4, + ], + [ + 2, 'qwerty2', 52, 2345, 6544, 9191991, -2, + -717, -81818, -92929, '550e8400-e29b-41d4-a716-446655440007', + '1975-09-28', '2000-02-28 23:33:24', + 'my', 255.543, 3332221.44, + ], + ], + "ranged": [ + [ + 1, '2019-02-10', '2019-02-01', '2019-02-28', + 22, 333, 4444, 55555, -6, -77, -888, -999, + '550e8400-e29b-41d4-a716-446655440003', + '1973-06-28', '1985-02-28 23:43:25', 'hello', + 22.543, 3332154213.4, + ], + [ + 2, '2019-04-10', '2019-04-01', '2019-04-28', + 11, 3223, 41444, 52515, -65, -747, -8388, -9099, + '550e8400-e29b-41d4-a716-446655440004', + '1973-06-29', '2002-02-28 23:23:25', '!!!!', + 32.543, 3332543.4, + ], + ], +} + +LAYOUTS = [ + Layout("flat"), + Layout("hashed"), + Layout("cache"), + Layout("complex_key_hashed"), + Layout("complex_key_cache"), + Layout("range_hashed"), +] + +SOURCES = [ + SourceRedis("RedisSimple", "localhost", "6380", "redis1", "6379", "", "", storage_type="simple"), + # SourceRedis("RedisHash", "localhost", "6380", "redis1", "6379", "", "", storage_type="hash_map"), + # SourceAerospike("Aerospike", "localhost", "3000", "aerospike1", "3000", "", ""), +] + +DICTIONARIES = [] + +cluster = None +node = None + + +def setup_kv_dict(suffix, layout, fields, kv_source, dict_configs_path, values): + global DICTIONARIES + + structure = DictionaryStructure(layout, fields) + dict_name = "{}_{}_{}".format(kv_source.name, layout.name, suffix) + dict_path = os.path.join(dict_configs_path, dict_name + '.xml') + dictionary = Dictionary(dict_name, structure, kv_source, dict_path, "table_" + dict_name, fields, values) + dictionary.generate_config() 
+ DICTIONARIES.append(dictionary) + + +def setup_module(module): + global DICTIONARIES + global cluster + global node + + dict_configs_path = os.path.join(SCRIPT_DIR, 'configs/dictionaries') + for f in os.listdir(dict_configs_path): + os.remove(os.path.join(dict_configs_path, f)) + + for layout in LAYOUTS: + for source in SOURCES: + if source.compatible_with_layout(layout): + if layout.layout_type == "simple": + fields_len = len(FIELDS["simple"]) + for i in range(fields_len - 1): + local_fields = [FIELDS["simple"][0], FIELDS["simple"][i + 1]] + local_values = [[value[0], value[i + 1]] for value in VALUES["simple"]] + setup_kv_dict(i + 1, layout, local_fields, source, dict_configs_path, local_values) + elif layout.layout_type == "complex": + fields_len = len(FIELDS["complex"]) + for i in range(fields_len - 2): + local_fields = [FIELDS['complex'][1], FIELDS['complex'][i + 2]] + local_values = [[value[1], value[i + 2]] for value in VALUES["complex"]] + setup_kv_dict(i + 2, layout, local_fields, source, dict_configs_path, local_values) + elif layout.layout_type == "ranged": + fields_len = len(FIELDS["ranged"]) + local_fields = FIELDS["ranged"][0:5] + local_values = VALUES["ranged"][0:5] + for i in range(fields_len - 4): + local_fields[4] = FIELDS["ranged"][i + 4] + for j, value in enumerate(VALUES["ranged"]): + local_values[j][4] = value[i + 4] + setup_kv_dict(i + 2, layout, local_fields, source, dict_configs_path, local_values) + else: + print "Source", source.name, "incompatible with layout", layout.name + + main_configs = [] + for fname in os.listdir(dict_configs_path): + main_configs.append(os.path.join(dict_configs_path, fname)) + cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs')) + # TODO: add your kv source flag below + node = cluster.add_instance('node', main_configs=main_configs, with_redis=True) + cluster.add_instance('clickhouse1') + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + for dictionary in DICTIONARIES: + print "Preparing", dictionary.name + dictionary.prepare_source(cluster) + print "Prepared" + + yield cluster + + finally: + cluster.shutdown() + + +def prepare_data(fields, values_by_row): + return [Row(fields, values) for values in values_by_row] + + +def test_simple_kv_dictionaries(started_cluster): + simple_kv_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "simple"] + + for dct in simple_kv_dicts: + queries_with_answers = [] + fields = dct.fields + print("FIELDS AND VALUES FOR " + dct.name) + print(fields) + print(dct.values) + data = prepare_data(fields, dct.values) + dct.source.load_kv_data(dct.values) + + try: + node.query("system reload dictionary '{}'".format(dct.name)) + except Exception: + print(dct.name) + raise + + for row in data: + for field in fields: + if not field.is_key: + for query in dct.get_select_get_queries(field, row): + queries_with_answers.append((query, row.get_value_by_name(field.name))) + + for query in dct.get_select_has_queries(field, row): + queries_with_answers.append((query, 1)) + + for query in dct.get_select_get_or_default_queries(field, row): + queries_with_answers.append((query, field.default_value_for_get)) + if dct.fields[1].hierarchical: + for query in dct.get_hierarchical_queries(data[0]): + queries_with_answers.append((query, [1])) + + for query in dct.get_hierarchical_queries(data[1]): + queries_with_answers.append((query, [2, 1])) + + for query in dct.get_is_in_queries(data[0], data[1]): + queries_with_answers.append((query, 0)) 
+ + for query in dct.get_is_in_queries(data[1], data[0]): + queries_with_answers.append((query, 1)) + + for query, answer in queries_with_answers: + if isinstance(answer, list): + answer = str(answer).replace(' ', '') + print query + assert node.query(query) == str(answer) + '\n', query + + +def test_complex_dictionaries(started_cluster): + complex_kv_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "complex"] + + for dct in complex_kv_dicts: + queries_with_answers = [] + fields = dct.fields + print("FIELDS AND VALUES FOR " + dct.name) + print(fields) + print(dct.values) + data = prepare_data(fields, dct.values) + dct.source.load_kv_data(dct.values) + + try: + node.query("system reload dictionary '{}'".format(dct.name)) + except Exception: + print(dct.name) + raise + + for row in data: + for field in fields: + if not field.is_key: + for query in dct.get_select_get_queries(field, row): + queries_with_answers.append((query, row.get_value_by_name(field.name))) + + for query in dct.get_select_has_queries(field, row): + queries_with_answers.append((query, 1)) + + for query in dct.get_select_get_or_default_queries(field, row): + queries_with_answers.append((query, field.default_value_for_get)) + + for query, answer in queries_with_answers: + print query + assert node.query(query) == str(answer) + '\n' + + +def xtest_ranged_dictionaries(started_cluster): + complex_kv_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "ranged"] + + for dct in complex_kv_dicts: + queries_with_answers = [] + fields = dct.fields + print("FIELDS AND VALUES FOR " + dct.name) + print(fields) + print(dct.values) + data = prepare_data(fields, dct.values) + dct.source.load_kv_data(dct.values) + + try: + node.query("system reload dictionary '{}'".format(dct.name)) + except Exception: + print(dct.name) + raise + + for row in data: + for field in fields: + if not field.is_key and not field.is_range: + for query in dct.get_select_get_queries(field, row): + queries_with_answers.append((query, row.get_value_by_name(field.name))) + + for query, answer in queries_with_answers: + print query + assert node.query(query) == str(answer) + '\n' From 61a9e6c448dcffbef47eecdd5700cf7fa97810bd Mon Sep 17 00:00:00 2001 From: comunodi Date: Thu, 30 May 2019 22:44:40 +0300 Subject: [PATCH 033/309] Fix test for hashed dict --- dbms/tests/integration/test_external_dictionaries/dictionary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/tests/integration/test_external_dictionaries/dictionary.py b/dbms/tests/integration/test_external_dictionaries/dictionary.py index 05aa9bfa59d..c20afbe7840 100644 --- a/dbms/tests/integration/test_external_dictionaries/dictionary.py +++ b/dbms/tests/integration/test_external_dictionaries/dictionary.py @@ -183,7 +183,7 @@ class DictionaryStructure(object): if isinstance(val, str): val = "'" + val + "'" key_exprs_strs.append('to{type}({value})'.format(type=key.field_type, value=val)) - key_expr = ', (' + ','.join(key_exprs_strs) + ')' + key_expr = ', tuple(' + ','.join(key_exprs_strs) + ')' date_expr = '' if self.layout.is_ranged: From 4947a0cfa977920d761e2fd62538bf8a9a53b7d9 Mon Sep 17 00:00:00 2001 From: comunodi Date: Thu, 30 May 2019 23:24:23 +0300 Subject: [PATCH 034/309] Disable redundant tests --- .../test_external_dictionaries/external_sources.py | 5 ++++- dbms/tests/integration/test_external_dictionaries/test_kv.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git 
a/dbms/tests/integration/test_external_dictionaries/external_sources.py b/dbms/tests/integration/test_external_dictionaries/external_sources.py index a22cc6e024f..f26609637b8 100644 --- a/dbms/tests/integration/test_external_dictionaries/external_sources.py +++ b/dbms/tests/integration/test_external_dictionaries/external_sources.py @@ -420,7 +420,10 @@ class SourceRedis(ExternalSource): self.client.hset(value[0], value[1], value[2]) def compatible_with_layout(self, layout): - if layout.is_simple and self.storage_type == "simple" or layout.is_complex and self.storage_type == "simple": + if ( + layout.is_simple and self.storage_type == "simple" or + layout.is_complex and self.storage_type == "simple" and layout.name != "complex_key_cache" + ): return True return False diff --git a/dbms/tests/integration/test_external_dictionaries/test_kv.py b/dbms/tests/integration/test_external_dictionaries/test_kv.py index b085e89b7d9..2ac6f0e714d 100644 --- a/dbms/tests/integration/test_external_dictionaries/test_kv.py +++ b/dbms/tests/integration/test_external_dictionaries/test_kv.py @@ -292,7 +292,7 @@ def test_complex_dictionaries(started_cluster): assert node.query(query) == str(answer) + '\n' -def xtest_ranged_dictionaries(started_cluster): +def test_ranged_dictionaries(started_cluster): complex_kv_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "ranged"] for dct in complex_kv_dicts: From 2f74c0db70aa85273bd37ab67c906428fe02ef52 Mon Sep 17 00:00:00 2001 From: comunodi Date: Thu, 30 May 2019 23:39:56 +0300 Subject: [PATCH 035/309] Delete useless import --- .../integration/test_external_dictionaries/external_sources.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dbms/tests/integration/test_external_dictionaries/external_sources.py b/dbms/tests/integration/test_external_dictionaries/external_sources.py index f26609637b8..20516e5c997 100644 --- a/dbms/tests/integration/test_external_dictionaries/external_sources.py +++ b/dbms/tests/integration/test_external_dictionaries/external_sources.py @@ -7,7 +7,6 @@ import aerospike from tzlocal import get_localzone import datetime import os -import dateutil.parser import time From c58effc2af382192d389852ecf1bb0373bd85f3b Mon Sep 17 00:00:00 2001 From: comunodi Date: Fri, 31 May 2019 00:06:39 +0300 Subject: [PATCH 036/309] Fix inserting keys with hash map --- .../Dictionaries/RedisBlockInputStream.cpp | 107 ++++++++++-------- .../Dictionaries/RedisDictionarySource.cpp | 6 - 2 files changed, 58 insertions(+), 55 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index 639b1360c74..c12418ab087 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -6,13 +6,8 @@ # include # include -# include # include # include -# include -# include -# include -# include # include # include @@ -25,9 +20,6 @@ # include "DictionaryStructure.h" # include "RedisBlockInputStream.h" -# include "Poco/Logger.h" -# include "common/logger_useful.h" - namespace DB { @@ -36,7 +28,7 @@ namespace DB extern const int TYPE_MISMATCH; extern const int LOGICAL_ERROR; extern const int LIMIT_EXCEEDED; - extern const int SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT; + extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH; } @@ -66,9 +58,6 @@ namespace DB std::string getStringOrThrow(const Poco::Redis::RedisType::Ptr & value, const std::string & column_name) { - LOG_INFO(&Logger::get("Redis"), - "isNullableString=" + DB::toString(value->isBulkString()) + - 
", isSimpleString=" + DB::toString(value->isSimpleString())); switch (value->type()) { case Poco::Redis::RedisTypeTraits::TypeId: @@ -179,7 +168,7 @@ namespace DB if (keys.begin()->get()->isArray()) { size_t num_rows = 0; - while (num_rows < max_block_size) + while (num_rows < max_block_size && !all_read) { if (cursor >= keys.size()) { @@ -206,6 +195,7 @@ namespace DB Poco::Redis::Command commandForValues("HMGET"); const auto & primary_key = *keys_array.begin(); + commandForValues.addRedisType(primary_key); for (size_t i = 1; i < keys_array.size(); ++i) { const auto & secondary_key = *(keys_array.begin() + i); @@ -213,57 +203,76 @@ namespace DB insertValueByIdx(1, secondary_key); commandForValues.addRedisType(secondary_key); } + ++cursor; - // FIXME: fix insert Poco::Redis::Array values = client->execute(commandForValues); - for (const auto & value : values) - { - if (isNull(value)) - insertDefaultValue(*columns[2], *description.sample_block.getByPosition(2).column); - else - insertValueByIdx(2, value); - } + if (commandForValues.size() != values.size() + 2) // 'HMGET' primary_key secondary_keys + throw Exception{"Inconsistent sizes of keys and values in Redis request", + ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH}; - num_rows += keys_array.size() - 1; - cursor += keys_array.size() - 1; + for (size_t i = 0; i < values.size(); ++i) + { + const auto & secondary_key = *(keys_array.begin() + i + 1); + const auto & value = *(values.begin() + i); + if (value.isNull()) + { + insertValueByIdx(0, primary_key); + insertValueByIdx(1, secondary_key); + insertDefaultValue(*columns[2], *description.sample_block.getByPosition(2).column); + ++num_rows; + } + else if (!isNull(value)) // null string means 'no value for requested key' + { + insertValueByIdx(0, primary_key); + insertValueByIdx(1, secondary_key); + insertValueByIdx(2, value); + ++num_rows; + } + } } } else { - Poco::Redis::Command commandForValues("MGET"); - - // keys.size() > 0 - for (size_t num_rows = 0; num_rows < max_block_size; ++num_rows) + size_t num_rows = 0; + while (num_rows < max_block_size && !all_read) { - if (cursor >= keys.size()) + Poco::Redis::Command commandForValues("MGET"); + + // keys.size() > 0 + for (size_t i = 0; i < max_block_size && cursor < keys.size(); ++i) + { + const auto & key = *(keys.begin() + cursor); + commandForValues.addRedisType(key); + ++cursor; + } + + if (commandForValues.size() == 1) // only 'MGET' { all_read = true; break; } - const auto & key = *(keys.begin() + cursor); - commandForValues.addRedisType(key); - ++cursor; - } + Poco::Redis::Array values = client->execute(commandForValues); + if (commandForValues.size() != values.size() + 1) // 'MGET' keys + throw Exception{"Inconsistent sizes of keys and values in Redis request", + ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH}; - Poco::Redis::Array values = client->execute(commandForValues); - if (commandForValues.size() != values.size() + 1) - throw Exception{"Inconsistent sizes of keys and values in Redis request", - ErrorCodes::SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT}; - - for (size_t num_rows = 0; num_rows < values.size(); ++num_rows) - { - const auto & key = *(keys.begin() + cursor - num_rows - 1); - const auto & value = *(values.begin() + values.size() - num_rows - 1); - if (value.isNull()) + for (size_t i = 0; i < values.size(); ++i) { - insertValueByIdx(0, key); - insertDefaultValue(*columns[1], *description.sample_block.getByPosition(1).column); - } - else if (!isNull(value)) // null string means 'no value for requested key' - { - 
insertValueByIdx(0, key); - insertValueByIdx(1, value); + const auto & key = *(keys.begin() + cursor - i - 1); + const auto & value = *(values.begin() + values.size() - i - 1); + if (value.isNull()) + { + insertValueByIdx(0, key); + insertDefaultValue(*columns[1], *description.sample_block.getByPosition(1).column); + ++num_rows; + } + else if (!isNull(value)) // null string means 'no value for requested key' + { + insertValueByIdx(0, key); + insertValueByIdx(1, value); + ++num_rows; + } } } } diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index 051f6dfaf34..d0256f3272e 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -36,16 +36,10 @@ namespace DB #if USE_POCO_REDIS # include -# include # include # include -# include -# include -# include -# include # include # include -# include # include # include From a964af386cb3dc91a7495ebdbf926640bed4cf6f Mon Sep 17 00:00:00 2001 From: comunodi Date: Fri, 31 May 2019 00:16:12 +0300 Subject: [PATCH 037/309] Optimize includes --- dbms/src/Dictionaries/RedisBlockInputStream.cpp | 6 ++---- dbms/src/Dictionaries/RedisDictionarySource.cpp | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index c12418ab087..56f12c74822 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -1,7 +1,6 @@ #include #if USE_POCO_REDIS -# include # include # include @@ -15,8 +14,8 @@ # include # include # include -# include # include + # include "DictionaryStructure.h" # include "RedisBlockInputStream.h" @@ -146,8 +145,7 @@ namespace DB return {}; const size_t size = description.sample_block.columns(); - - MutableColumns columns(description.sample_block.columns()); + MutableColumns columns(size); for (const auto i : ext::range(0, size)) columns[i] = description.sample_block.getByPosition(i).column->cloneEmpty(); diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index d0256f3272e..282f4187a45 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -41,9 +41,9 @@ namespace DB # include # include -# include # include -# include +# include + # include "RedisBlockInputStream.h" From 08c2f183dd4d6ba6db785cd2de9398aff4fb4041 Mon Sep 17 00:00:00 2001 From: comunodi Date: Sun, 2 Jun 2019 04:22:06 +0300 Subject: [PATCH 038/309] Fix complex dict with two keys --- dbms/src/Dictionaries/RedisBlockInputStream.cpp | 9 +++------ dbms/src/Dictionaries/RedisDictionarySource.cpp | 9 +++++---- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index 56f12c74822..7f23a421989 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -192,22 +192,19 @@ namespace DB } Poco::Redis::Command commandForValues("HMGET"); - const auto & primary_key = *keys_array.begin(); - commandForValues.addRedisType(primary_key); - for (size_t i = 1; i < keys_array.size(); ++i) + for (size_t i = 0; i < keys_array.size(); ++i) { const auto & secondary_key = *(keys_array.begin() + i); - insertValueByIdx(0, primary_key); - insertValueByIdx(1, secondary_key); commandForValues.addRedisType(secondary_key); } ++cursor; 
Poco::Redis::Array values = client->execute(commandForValues); - if (commandForValues.size() != values.size() + 2) // 'HMGET' primary_key secondary_keys + if (keys_array.size() != values.size() + 1) // 'HMGET' primary_key secondary_keys throw Exception{"Inconsistent sizes of keys and values in Redis request", ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH}; + const auto & primary_key = *keys_array.begin(); for (size_t i = 0; i < values.size(); ++i) { const auto & secondary_key = *(keys_array.begin() + i + 1); diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index 282f4187a45..8def8abcf0e 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -147,14 +147,15 @@ namespace DB { Poco::Redis::Command command_for_secondary_keys("HKEYS"); command_for_secondary_keys.addRedisType(key); + Poco::Redis::Array reply_for_primary_key = client->execute(command_for_secondary_keys); - Poco::SharedPtr primary_with_secondary; - primary_with_secondary->addRedisType(key); + Poco::Redis::Array primary_with_secondary; + primary_with_secondary.addRedisType(key); for (const auto & secondary_key : reply_for_primary_key) - primary_with_secondary->addRedisType(secondary_key); + primary_with_secondary.addRedisType(secondary_key); - hkeys.add(*primary_with_secondary); + hkeys.add(primary_with_secondary); } keys = hkeys; } From 12af7869cc692195500ee1eeec69f64e5842050b Mon Sep 17 00:00:00 2001 From: comunodi Date: Sun, 2 Jun 2019 04:30:06 +0300 Subject: [PATCH 039/309] Add tests for complex dict with two keys --- .../test_external_dictionaries/dictionary.py | 2 ++ .../external_sources.py | 28 ++++++++----------- .../test_external_dictionaries/test_kv.py | 14 ++++++---- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/dbms/tests/integration/test_external_dictionaries/dictionary.py b/dbms/tests/integration/test_external_dictionaries/dictionary.py index c20afbe7840..ca07ea27037 100644 --- a/dbms/tests/integration/test_external_dictionaries/dictionary.py +++ b/dbms/tests/integration/test_external_dictionaries/dictionary.py @@ -8,6 +8,8 @@ class Layout(object): 'hashed': '', 'cache': '128', 'complex_key_hashed': '', + 'complex_key_hashed_one_key': '', + 'complex_key_hashed_two_keys': '', 'complex_key_cache': '128', 'range_hashed': '', } diff --git a/dbms/tests/integration/test_external_dictionaries/external_sources.py b/dbms/tests/integration/test_external_dictionaries/external_sources.py index 20516e5c997..2232bcc92a4 100644 --- a/dbms/tests/integration/test_external_dictionaries/external_sources.py +++ b/dbms/tests/integration/test_external_dictionaries/external_sources.py @@ -7,12 +7,11 @@ import aerospike from tzlocal import get_localzone import datetime import os -import time class ExternalSource(object): def __init__(self, name, internal_hostname, internal_port, - docker_hostname, docker_port, user, password, storage_type=None): + docker_hostname, docker_port, user, password): self.name = name self.internal_hostname = internal_hostname self.internal_port = int(internal_port) @@ -20,7 +19,6 @@ class ExternalSource(object): self.docker_port = int(docker_port) self.user = user self.password = password - self.storage_type = storage_type def get_source_str(self, table_name): raise NotImplementedError("Method {} is not implemented for {}".format( @@ -379,6 +377,14 @@ class SourceHTTPS(SourceHTTPBase): class SourceRedis(ExternalSource): + def __init__( + self, name, internal_hostname, 
internal_port, docker_hostname, docker_port, user, password, storage_type + ): + super(SourceRedis, self).__init__( + name, internal_hostname, internal_port, docker_hostname, docker_port, user, password + ) + self.storage_type = storage_type + def get_source_str(self, table_name): return ''' @@ -397,19 +403,6 @@ class SourceRedis(ExternalSource): self.client = redis.StrictRedis(host=self.internal_hostname, port=self.internal_port) self.prepared = True - def load_data(self, data, table_name): - self.client.flushdb() - for row in data: - for cell_name, cell_value in row.data.items(): - value_type = "$" - if isinstance(cell_value, int): - value_type = ":" - else: - cell_value = '"' + str(cell_value).replace(' ', '\s') + '"' - cmd = "SET ${} {}{}".format(cell_name, value_type, cell_value) - print(cmd) - self.client.execute_command(cmd) - def load_kv_data(self, values): self.client.flushdb() if len(values[0]) == 2: @@ -421,7 +414,8 @@ class SourceRedis(ExternalSource): def compatible_with_layout(self, layout): if ( layout.is_simple and self.storage_type == "simple" or - layout.is_complex and self.storage_type == "simple" and layout.name != "complex_key_cache" + layout.is_complex and self.storage_type == "simple" and layout.name == "complex_key_hashed_one_key" or + layout.is_complex and self.storage_type == "hash_map" and layout.name == "complex_key_hashed_two_keys" ): return True return False diff --git a/dbms/tests/integration/test_external_dictionaries/test_kv.py b/dbms/tests/integration/test_external_dictionaries/test_kv.py index 2ac6f0e714d..69fa48d5e2e 100644 --- a/dbms/tests/integration/test_external_dictionaries/test_kv.py +++ b/dbms/tests/integration/test_external_dictionaries/test_kv.py @@ -116,14 +116,15 @@ LAYOUTS = [ Layout("flat"), Layout("hashed"), Layout("cache"), - Layout("complex_key_hashed"), + Layout('complex_key_hashed_one_key'), + Layout('complex_key_hashed_two_keys'), Layout("complex_key_cache"), Layout("range_hashed"), ] SOURCES = [ SourceRedis("RedisSimple", "localhost", "6380", "redis1", "6379", "", "", storage_type="simple"), - # SourceRedis("RedisHash", "localhost", "6380", "redis1", "6379", "", "", storage_type="hash_map"), + SourceRedis("RedisHash", "localhost", "6380", "redis1", "6379", "", "", storage_type="hash_map"), # SourceAerospike("Aerospike", "localhost", "3000", "aerospike1", "3000", "", ""), ] @@ -165,8 +166,12 @@ def setup_module(module): elif layout.layout_type == "complex": fields_len = len(FIELDS["complex"]) for i in range(fields_len - 2): - local_fields = [FIELDS['complex'][1], FIELDS['complex'][i + 2]] - local_values = [[value[1], value[i + 2]] for value in VALUES["complex"]] + if layout.name == 'complex_key_hashed_two_keys': + local_fields = [FIELDS['complex'][0], FIELDS['complex'][1], FIELDS['complex'][i + 2]] + local_values = [[value[0], value[1], value[i + 2]] for value in VALUES["complex"]] + else: + local_fields = [FIELDS['complex'][1], FIELDS['complex'][i + 2]] + local_values = [[value[1], value[i + 2]] for value in VALUES["complex"]] setup_kv_dict(i + 2, layout, local_fields, source, dict_configs_path, local_values) elif layout.layout_type == "ranged": fields_len = len(FIELDS["ranged"]) @@ -184,7 +189,6 @@ def setup_module(module): for fname in os.listdir(dict_configs_path): main_configs.append(os.path.join(dict_configs_path, fname)) cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs')) - # TODO: add your kv source flag below node = cluster.add_instance('node', main_configs=main_configs, with_redis=True) 
cluster.add_instance('clickhouse1') From aed927b6df8a6bfacce2ac22221f5f30107f77f5 Mon Sep 17 00:00:00 2001 From: comunodi Date: Sun, 2 Jun 2019 04:37:35 +0300 Subject: [PATCH 040/309] Remove unused param --- .../test_external_dictionaries/external_sources.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dbms/tests/integration/test_external_dictionaries/external_sources.py b/dbms/tests/integration/test_external_dictionaries/external_sources.py index 2232bcc92a4..f6802a87c57 100644 --- a/dbms/tests/integration/test_external_dictionaries/external_sources.py +++ b/dbms/tests/integration/test_external_dictionaries/external_sources.py @@ -423,9 +423,9 @@ class SourceRedis(ExternalSource): class SourceAerospike(ExternalSource): def __init__(self, name, internal_hostname, internal_port, - docker_hostname, docker_port, user, password, storage_type=None): + docker_hostname, docker_port, user, password): ExternalSource.__init__(self, name, internal_hostname, internal_port, - docker_hostname, docker_port, user, password, storage_type) + docker_hostname, docker_port, user, password) self.namespace = "test" self.set = "test_set" @@ -439,7 +439,6 @@ class SourceAerospike(ExternalSource): '''.format( host=self.docker_hostname, port=self.docker_port, - storage_type=self.storage_type, # simple or hash_map ) def prepare(self, structure, table_name, cluster): From 102967015e8813129384dcd0f6e377e5b730f167 Mon Sep 17 00:00:00 2001 From: comunodi Date: Sun, 2 Jun 2019 16:29:43 +0300 Subject: [PATCH 041/309] Add RU docs for Redis --- .../dicts/external_dicts_dict_sources.md | 6 ++- .../dicts/external_dicts_dict_sources.md | 47 ++++++++++++++----- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/docs/en/query_language/dicts/external_dicts_dict_sources.md b/docs/en/query_language/dicts/external_dicts_dict_sources.md index 029bad304c7..f78b67d6495 100644 --- a/docs/en/query_language/dicts/external_dicts_dict_sources.md +++ b/docs/en/query_language/dicts/external_dicts_dict_sources.md @@ -27,11 +27,11 @@ Types of sources (`source_type`): - [Executable file](#dicts-external_dicts_dict_sources-executable) - [HTTP(s)](#dicts-external_dicts_dict_sources-http) - DBMS + - [ODBC](#dicts-external_dicts_dict_sources-odbc) - [MySQL](#dicts-external_dicts_dict_sources-mysql) - [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) - [Redis](#dicts-external_dicts_dict_sources-redis) - - [ODBC](#dicts-external_dicts_dict_sources-odbc) ## Local File {#dicts-external_dicts_dict_sources-local_file} @@ -434,6 +434,8 @@ Example of settings: localhost 6379 + simple + 0 ``` @@ -442,5 +444,7 @@ Setting fields: - `host` – The Redis host. - `port` – The port on the Redis server. +- `storage_type` – The structure of internal Redis storage using for work with keys. `simple` is for simple sources and for hashed single key sources, `hash_map` is for hashed sources with two keys. Ranged sources and cache sources with complex key are unsupported. May be omitted, default value is `simple`. +- `db_index` – The specific numeric index of Redis logical database. May be omitted, default value is 0. 
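
A short, self-contained sketch may help make the `storage_type` values concrete. The snippet below is illustrative only: it uses the `redis-py` client (the same client the integration tests use) and assumes that a `simple` source keeps each row as a plain string key written with `SET`, while a `hash_map` source keeps one hash per primary key with the secondary key as the hash field, written with `HSET`. All key and attribute values here are made up, and a Redis server is assumed to be listening on `localhost:6379`.

```python
import redis  # redis-py

client = redis.StrictRedis(host="localhost", port=6379, db=0)
client.flushdb()

# storage_type = "simple": one key column, each row is an ordinary string pair.
client.set("1", "Moscow")
client.set("2", "Berlin")

# storage_type = "hash_map": two key columns. The first key selects a hash,
# the second key is the field inside that hash.
client.hset("user:1", "2019-01-01", "Moscow")
client.hset("user:1", "2019-02-01", "Berlin")

print(client.get("1"))                      # b'Moscow'
print(client.hget("user:1", "2019-02-01"))  # b'Berlin'
```

With such a layout a single-key lookup maps to one `GET` and a two-key lookup to one `HGET`, which matches the one- and two-key-column restriction described above.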
[Original article](https://clickhouse.yandex/docs/en/query_language/dicts/external_dicts_dict_sources/) diff --git a/docs/ru/query_language/dicts/external_dicts_dict_sources.md b/docs/ru/query_language/dicts/external_dicts_dict_sources.md index c9e419eb09c..436c4e95daf 100644 --- a/docs/ru/query_language/dicts/external_dicts_dict_sources.md +++ b/docs/ru/query_language/dicts/external_dicts_dict_sources.md @@ -1,5 +1,5 @@ -# Источники внешних словарей +# Источники внешних словарей {#dicts-external_dicts_dict_sources} Внешний словарь можно подключить из множества источников. @@ -24,17 +24,18 @@ Типы источников (`source_type`): -- [Локальный файл](#ispolniaemyi-fail) -- [Исполняемый файл](#ispolniaemyi-fail) -- [HTTP(s)](#http-s) +- [Локальный файл](#dicts-external_dicts_dict_sources-local_file) +- [Исполняемый файл](#dicts-external_dicts_dict_sources-executable) +- [HTTP(s)](#dicts-external_dicts_dict_sources-http) - СУБД: - [ODBC](#dicts-external_dicts_dict_sources-odbc) - - [MySQL](#mysql) - - [ClickHouse](#clickhouse) - - [MongoDB](#mongodb) + - [MySQL](#dicts-external_dicts_dict_sources-mysql) + - [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) + - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) + - [Redis](#dicts-external_dicts_dict_sources-redis) -## Локальный файл +## Локальный файл {#dicts-external_dicts_dict_sources-local_file} Пример настройки: @@ -53,7 +54,7 @@ - `format` - Формат файла. Поддерживаются все форматы, описанные в разделе "[Форматы](../../interfaces/formats.md#formats)". -## Исполняемый файл +## Исполняемый файл {#dicts-external_dicts_dict_sources-executable} Работа с исполняемым файлом зависит от [размещения словаря в памяти](external_dicts_dict_layout.md). Если тип размещения словаря `cache` и `complex_key_cache`, то ClickHouse запрашивает необходимые ключи, отправляя запрос в `STDIN` исполняемого файла. @@ -74,7 +75,7 @@ - `format` - Формат файла. Поддерживаются все форматы, описанные в разделе "[Форматы](../../interfaces/formats.md#formats)". -## HTTP(s) +## HTTP(s) {#dicts-external_dicts_dict_sources-http} Работа с HTTP(s) сервером зависит от [размещения словаря в памяти](external_dicts_dict_layout.md). Если тип размещения словаря `cache` и `complex_key_cache`, то ClickHouse запрашивает необходимые ключи, отправляя запрос методом `POST`. @@ -360,7 +361,7 @@ MySQL можно подключить на локальном хосте чер ``` -### ClickHouse +### ClickHouse {#dicts-external_dicts_dict_sources-clickhouse} Пример настройки: @@ -390,7 +391,7 @@ MySQL можно подключить на локальном хосте чер - `invalidate_query` - запрос для проверки статуса словаря. Необязательный параметр. Читайте подробнее в разделе [Обновление словарей](external_dicts_dict_lifetime.md). -### MongoDB +### MongoDB {#dicts-external_dicts_dict_sources-mongodb} Пример настройки: @@ -416,4 +417,26 @@ MySQL можно подключить на локальном хосте чер - `db` - имя базы данных. - `collection` - имя коллекции. +### Redis {#dicts-external_dicts_dict_sources-redis} + +Пример настройки: + +```xml + + + localhost + 6379 + simple + 0 + + +``` + +Поля настройки: + +- `host` – хост Redis. +- `port` – порт сервера Redis. +- `storage_type` – способ хранения ключей. Необходимо использовать `simple` для источников с одним столбцом ключей, `hash_map` -- для источников с двумя столбцами ключей. Источники с более, чем двумя столбцами ключей, не поддерживаются. Может отсутствовать, значение по умолчанию `simple`. +- `db_index` – номер базы данных. Может отсутствовать, значение по умолчанию 0. 
+ [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/dicts/external_dicts_dict_sources/) From ce6968367315d7f299dc3dea6ae9152a2628b1b0 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Thu, 23 May 2019 09:03:39 +0000 Subject: [PATCH 042/309] Copy-paste from StorageFile. --- dbms/src/Storages/StorageS3.cpp | 335 ++++++++++++++++++++++++++++++++ dbms/src/Storages/StorageS3.h | 89 +++++++++ 2 files changed, 424 insertions(+) create mode 100644 dbms/src/Storages/StorageS3.cpp create mode 100644 dbms/src/Storages/StorageS3.h diff --git a/dbms/src/Storages/StorageS3.cpp b/dbms/src/Storages/StorageS3.cpp new file mode 100644 index 00000000000..6f1b62dc240 --- /dev/null +++ b/dbms/src/Storages/StorageS3.cpp @@ -0,0 +1,335 @@ +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_WRITE_TO_FILE_DESCRIPTOR; + extern const int CANNOT_SEEK_THROUGH_FILE; + extern const int DATABASE_ACCESS_DENIED; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int UNKNOWN_IDENTIFIER; + extern const int INCORRECT_FILE_NAME; + extern const int FILE_DOESNT_EXIST; + extern const int EMPTY_LIST_OF_COLUMNS_PASSED; +} + + +static std::string getTablePath(const std::string & db_dir_path, const std::string & table_name, const std::string & format_name) +{ + return db_dir_path + escapeForFileName(table_name) + "/data." + escapeForFileName(format_name); +} + +/// Both db_dir_path and table_path must be converted to absolute paths (in particular, path cannot contain '..'). +static void checkCreationIsAllowed(Context & context_global, const std::string & db_dir_path, const std::string & table_path, int table_fd) +{ + if (context_global.getApplicationType() != Context::ApplicationType::SERVER) + return; + + if (table_fd >= 0) + throw Exception("Using file descriptor as source of storage isn't allowed for server daemons", ErrorCodes::DATABASE_ACCESS_DENIED); + else if (!startsWith(table_path, db_dir_path)) + throw Exception("Part path " + table_path + " is not inside " + db_dir_path, ErrorCodes::DATABASE_ACCESS_DENIED); + + Poco::File table_path_poco_file = Poco::File(table_path); + if (!table_path_poco_file.exists()) + throw Exception("File " + table_path + " is not exist", ErrorCodes::FILE_DOESNT_EXIST); + else if (table_path_poco_file.isDirectory()) + throw Exception("File " + table_path + " must not be a directory", ErrorCodes::INCORRECT_FILE_NAME); +} + + +StorageS3::StorageS3( + const std::string & table_path_, + int table_fd_, + const std::string & db_dir_path, + const std::string & table_name_, + const std::string & format_name_, + const ColumnsDescription & columns_, + Context & context_) + : IStorage(columns_), + table_name(table_name_), format_name(format_name_), context_global(context_), table_fd(table_fd_) +{ + if (table_fd < 0) /// Will use file + { + use_table_fd = false; + + if (!table_path_.empty()) /// Is user's file + { + Poco::Path poco_path = Poco::Path(table_path_); + if (poco_path.isRelative()) + poco_path = Poco::Path(db_dir_path, poco_path); + + path = poco_path.absolute().toString(); + checkCreationIsAllowed(context_global, db_dir_path, path, table_fd); + is_db_table = false; + } + else /// Is DB's file + { + if (db_dir_path.empty()) + throw Exception("Storage " + getName() + " requires data path", ErrorCodes::INCORRECT_FILE_NAME); + + path = 
getTablePath(db_dir_path, table_name, format_name); + is_db_table = true; + Poco::File(Poco::Path(path).parent()).createDirectories(); + } + } + else /// Will use FD + { + checkCreationIsAllowed(context_global, db_dir_path, path, table_fd); + + is_db_table = false; + use_table_fd = true; + + /// Save initial offset, it will be used for repeating SELECTs + /// If FD isn't seekable (lseek returns -1), then the second and subsequent SELECTs will fail. + table_fd_init_offset = lseek(table_fd, 0, SEEK_CUR); + } +} + + +class StorageS3BlockInputStream : public IBlockInputStream +{ +public: + StorageS3BlockInputStream(StorageS3 & storage_, const Context & context, UInt64 max_block_size) + : storage(storage_) + { + if (storage.use_table_fd) + { + unique_lock = std::unique_lock(storage.rwlock); + + /// We could use common ReadBuffer and WriteBuffer in storage to leverage cache + /// and add ability to seek unseekable files, but cache sync isn't supported. + + if (storage.table_fd_was_used) /// We need seek to initial position + { + if (storage.table_fd_init_offset < 0) + throw Exception("File descriptor isn't seekable, inside " + storage.getName(), ErrorCodes::CANNOT_SEEK_THROUGH_FILE); + + /// ReadBuffer's seek() doesn't make sense, since cache is empty + if (lseek(storage.table_fd, storage.table_fd_init_offset, SEEK_SET) < 0) + throwFromErrno("Cannot seek file descriptor, inside " + storage.getName(), ErrorCodes::CANNOT_SEEK_THROUGH_FILE); + } + + storage.table_fd_was_used = true; + read_buf = std::make_unique(storage.table_fd); + } + else + { + shared_lock = std::shared_lock(storage.rwlock); + + read_buf = std::make_unique(storage.path); + } + + reader = FormatFactory::instance().getInput(storage.format_name, *read_buf, storage.getSampleBlock(), context, max_block_size); + } + + String getName() const override + { + return storage.getName(); + } + + Block readImpl() override + { + return reader->read(); + } + + Block getHeader() const override { return reader->getHeader(); } + + void readPrefixImpl() override + { + reader->readPrefix(); + } + + void readSuffixImpl() override + { + reader->readSuffix(); + } + +private: + StorageS3 & storage; + Block sample_block; + std::unique_ptr read_buf; + BlockInputStreamPtr reader; + + std::shared_lock shared_lock; + std::unique_lock unique_lock; +}; + + +BlockInputStreams StorageS3::read( + const Names & /*column_names*/, + const SelectQueryInfo & /*query_info*/, + const Context & context, + QueryProcessingStage::Enum /*processed_stage*/, + size_t max_block_size, + unsigned /*num_streams*/) +{ + BlockInputStreamPtr block_input = std::make_shared(*this, context, max_block_size); + const ColumnsDescription & columns = getColumns(); + auto column_defaults = columns.getDefaults(); + if (column_defaults.empty()) + return {block_input}; + return {std::make_shared(block_input, column_defaults, context)}; +} + + +class StorageS3BlockOutputStream : public IBlockOutputStream +{ +public: + explicit StorageS3BlockOutputStream(StorageS3 & storage_) + : storage(storage_), lock(storage.rwlock) + { + if (storage.use_table_fd) + { + /** NOTE: Using real file binded to FD may be misleading: + * SELECT *; INSERT insert_data; SELECT *; last SELECT returns initil_fd_data + insert_data + * INSERT data; SELECT *; last SELECT returns only insert_data + */ + storage.table_fd_was_used = true; + write_buf = std::make_unique(storage.table_fd); + } + else + { + write_buf = std::make_unique(storage.path, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_APPEND | O_CREAT); + } + + writer = 
FormatFactory::instance().getOutput(storage.format_name, *write_buf, storage.getSampleBlock(), storage.context_global); + } + + Block getHeader() const override { return storage.getSampleBlock(); } + + void write(const Block & block) override + { + writer->write(block); + } + + void writePrefix() override + { + writer->writePrefix(); + } + + void writeSuffix() override + { + writer->writeSuffix(); + } + + void flush() override + { + writer->flush(); + } + +private: + StorageS3 & storage; + std::unique_lock lock; + std::unique_ptr write_buf; + BlockOutputStreamPtr writer; +}; + +BlockOutputStreamPtr StorageS3::write( + const ASTPtr & /*query*/, + const Context & /*context*/) +{ + return std::make_shared(*this); +} + + +void StorageS3::drop() +{ + /// Extra actions are not required. +} + + +void StorageS3::rename(const String & new_path_to_db, const String & /*new_database_name*/, const String & new_table_name) +{ + if (!is_db_table) + throw Exception("Can't rename table '" + table_name + "' binded to user-defined file (or FD)", ErrorCodes::DATABASE_ACCESS_DENIED); + + std::unique_lock lock(rwlock); + + std::string path_new = getTablePath(new_path_to_db, new_table_name, format_name); + Poco::File(Poco::Path(path_new).parent()).createDirectories(); + Poco::File(path).renameTo(path_new); + + path = std::move(path_new); +} + + +void registerStorageS3(StorageFactory & factory) +{ + factory.registerStorage("S3", [](const StorageFactory::Arguments & args) + { + ASTs & engine_args = args.engine_args; + + if (!(engine_args.size() == 1 || engine_args.size() == 2)) + throw Exception( + "Storage S3 requires 1 or 2 arguments: name of used format and source.", + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + + engine_args[0] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[0], args.local_context); + String format_name = engine_args[0]->as().value.safeGet(); + + int source_fd = -1; + String source_path; + if (engine_args.size() >= 2) + { + /// Will use FD if engine_args[1] is int literal or identifier with std* name + + if (auto opt_name = getIdentifierName(engine_args[1])) + { + if (*opt_name == "stdin") + source_fd = STDIN_FILENO; + else if (*opt_name == "stdout") + source_fd = STDOUT_FILENO; + else if (*opt_name == "stderr") + source_fd = STDERR_FILENO; + else + throw Exception("Unknown identifier '" + *opt_name + "' in second arg of File storage constructor", + ErrorCodes::UNKNOWN_IDENTIFIER); + } + else if (const auto * literal = engine_args[1]->as()) + { + auto type = literal->value.getType(); + if (type == Field::Types::Int64) + source_fd = static_cast(literal->value.get()); + else if (type == Field::Types::UInt64) + source_fd = static_cast(literal->value.get()); + else if (type == Field::Types::String) + source_path = literal->value.get(); + } + } + + return StorageS3::create( + source_path, source_fd, + args.data_path, + args.table_name, format_name, args.columns, + args.context); + }); +} + +} diff --git a/dbms/src/Storages/StorageS3.h b/dbms/src/Storages/StorageS3.h new file mode 100644 index 00000000000..9414fa70cbf --- /dev/null +++ b/dbms/src/Storages/StorageS3.h @@ -0,0 +1,89 @@ +#pragma once + +#include + +#include +#include + +#include + +#include +#include +#include + + +namespace DB +{ + +class StorageS3BlockInputStream; +class StorageS3BlockOutputStream; + +class StorageS3 : public ext::shared_ptr_helper, public IStorage +{ +public: + std::string getName() const override + { + return "S3"; + } + + std::string getTableName() const override + { + return table_name; + } + + 
BlockInputStreams read( + const Names & column_names, + const SelectQueryInfo & query_info, + const Context & context, + QueryProcessingStage::Enum processed_stage, + size_t max_block_size, + unsigned num_streams) override; + + BlockOutputStreamPtr write( + const ASTPtr & query, + const Context & context) override; + + void drop() override; + + void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name) override; + + String getDataPath() const override { return path; } + +protected: + friend class StorageS3BlockInputStream; + friend class StorageS3BlockOutputStream; + + /** there are three options (ordered by priority): + - use specified file descriptor if (fd >= 0) + - use specified table_path if it isn't empty + - create own table inside data/db/table/ + */ + StorageS3( + const std::string & table_path_, + int table_fd_, + const std::string & db_dir_path, + const std::string & table_name_, + const std::string & format_name_, + const ColumnsDescription & columns_, + Context & context_); + +private: + + std::string table_name; + std::string format_name; + Context & context_global; + + std::string path; + int table_fd = -1; + + bool is_db_table = true; /// Table is stored in real database, not user's file + bool use_table_fd = false; /// Use table_fd insted of path + std::atomic table_fd_was_used{false}; /// To detect repeating reads from stdin + off_t table_fd_init_offset = -1; /// Initial position of fd, used for repeating reads + + mutable std::shared_mutex rwlock; + + Logger * log = &Logger::get("StorageS3"); +}; + +} From 47985cf8a7a2b93b8453d43ba2582644cf80c386 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Wed, 29 May 2019 12:54:31 +0000 Subject: [PATCH 043/309] Wrong commit. --- dbms/src/Storages/StorageS3.cpp | 254 ++++++++++---------------------- dbms/src/Storages/StorageS3.h | 13 +- 2 files changed, 77 insertions(+), 190 deletions(-) diff --git a/dbms/src/Storages/StorageS3.cpp b/dbms/src/Storages/StorageS3.cpp index 6f1b62dc240..080c1c7069e 100644 --- a/dbms/src/Storages/StorageS3.cpp +++ b/dbms/src/Storages/StorageS3.cpp @@ -7,9 +7,8 @@ #include #include -#include -#include -#include +#include +#include #include #include @@ -19,10 +18,9 @@ #include #include -#include - #include #include +#include namespace DB { @@ -40,118 +38,43 @@ namespace ErrorCodes } -static std::string getTablePath(const std::string & db_dir_path, const std::string & table_name, const std::string & format_name) -{ - return db_dir_path + escapeForFileName(table_name) + "/data." + escapeForFileName(format_name); -} - -/// Both db_dir_path and table_path must be converted to absolute paths (in particular, path cannot contain '..'). 
-static void checkCreationIsAllowed(Context & context_global, const std::string & db_dir_path, const std::string & table_path, int table_fd) -{ - if (context_global.getApplicationType() != Context::ApplicationType::SERVER) - return; - - if (table_fd >= 0) - throw Exception("Using file descriptor as source of storage isn't allowed for server daemons", ErrorCodes::DATABASE_ACCESS_DENIED); - else if (!startsWith(table_path, db_dir_path)) - throw Exception("Part path " + table_path + " is not inside " + db_dir_path, ErrorCodes::DATABASE_ACCESS_DENIED); - - Poco::File table_path_poco_file = Poco::File(table_path); - if (!table_path_poco_file.exists()) - throw Exception("File " + table_path + " is not exist", ErrorCodes::FILE_DOESNT_EXIST); - else if (table_path_poco_file.isDirectory()) - throw Exception("File " + table_path + " must not be a directory", ErrorCodes::INCORRECT_FILE_NAME); -} - - StorageS3::StorageS3( - const std::string & table_path_, - int table_fd_, - const std::string & db_dir_path, + const std::string & table_uri_, const std::string & table_name_, const std::string & format_name_, const ColumnsDescription & columns_, Context & context_) - : IStorage(columns_), - table_name(table_name_), format_name(format_name_), context_global(context_), table_fd(table_fd_) + : IStorage(columns_) + , table_name(table_name_) + , format_name(format_name_) + , context_global(context_) + , uri(table_uri_) { - if (table_fd < 0) /// Will use file - { - use_table_fd = false; - - if (!table_path_.empty()) /// Is user's file - { - Poco::Path poco_path = Poco::Path(table_path_); - if (poco_path.isRelative()) - poco_path = Poco::Path(db_dir_path, poco_path); - - path = poco_path.absolute().toString(); - checkCreationIsAllowed(context_global, db_dir_path, path, table_fd); - is_db_table = false; - } - else /// Is DB's file - { - if (db_dir_path.empty()) - throw Exception("Storage " + getName() + " requires data path", ErrorCodes::INCORRECT_FILE_NAME); - - path = getTablePath(db_dir_path, table_name, format_name); - is_db_table = true; - Poco::File(Poco::Path(path).parent()).createDirectories(); - } - } - else /// Will use FD - { - checkCreationIsAllowed(context_global, db_dir_path, path, table_fd); - - is_db_table = false; - use_table_fd = true; - - /// Save initial offset, it will be used for repeating SELECTs - /// If FD isn't seekable (lseek returns -1), then the second and subsequent SELECTs will fail. - table_fd_init_offset = lseek(table_fd, 0, SEEK_CUR); - } } class StorageS3BlockInputStream : public IBlockInputStream { public: - StorageS3BlockInputStream(StorageS3 & storage_, const Context & context, UInt64 max_block_size) - : storage(storage_) + StorageS3BlockInputStream(const Poco::URI & uri, + const std::string & method, + std::function callback, + const String & format, + const String & name_, + const Block & sample_block, + const Context & context, + UInt64 max_block_size, + const ConnectionTimeouts & timeouts) + : name(name_) { - if (storage.use_table_fd) - { - unique_lock = std::unique_lock(storage.rwlock); + read_buf = std::make_unique(uri, method, callback, timeouts); - /// We could use common ReadBuffer and WriteBuffer in storage to leverage cache - /// and add ability to seek unseekable files, but cache sync isn't supported. 
- - if (storage.table_fd_was_used) /// We need seek to initial position - { - if (storage.table_fd_init_offset < 0) - throw Exception("File descriptor isn't seekable, inside " + storage.getName(), ErrorCodes::CANNOT_SEEK_THROUGH_FILE); - - /// ReadBuffer's seek() doesn't make sense, since cache is empty - if (lseek(storage.table_fd, storage.table_fd_init_offset, SEEK_SET) < 0) - throwFromErrno("Cannot seek file descriptor, inside " + storage.getName(), ErrorCodes::CANNOT_SEEK_THROUGH_FILE); - } - - storage.table_fd_was_used = true; - read_buf = std::make_unique(storage.table_fd); - } - else - { - shared_lock = std::shared_lock(storage.rwlock); - - read_buf = std::make_unique(storage.path); - } - - reader = FormatFactory::instance().getInput(storage.format_name, *read_buf, storage.getSampleBlock(), context, max_block_size); + reader = FormatFactory::instance().getInput(format, *read_buf, sample_block, context, max_block_size); } String getName() const override { - return storage.getName(); + return name; } Block readImpl() override @@ -159,7 +82,10 @@ public: return reader->read(); } - Block getHeader() const override { return reader->getHeader(); } + Block getHeader() const override + { + return reader->getHeader(); + } void readPrefixImpl() override { @@ -172,27 +98,35 @@ public: } private: - StorageS3 & storage; - Block sample_block; - std::unique_ptr read_buf; + String name; + std::unique_ptr read_buf; BlockInputStreamPtr reader; - - std::shared_lock shared_lock; - std::unique_lock unique_lock; }; BlockInputStreams StorageS3::read( - const Names & /*column_names*/, + const Names & column_names, const SelectQueryInfo & /*query_info*/, const Context & context, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, unsigned /*num_streams*/) { - BlockInputStreamPtr block_input = std::make_shared(*this, context, max_block_size); - const ColumnsDescription & columns = getColumns(); - auto column_defaults = columns.getDefaults(); + auto request_uri = uri; + + BlockInputStreamPtr block_input = std::make_shared(request_uri, + Poco::Net::HTTPRequest::HTTP_GET, + nullptr, + //getReadPOSTDataCallback(column_names, query_info, context, processed_stage, max_block_size), + format_name, + getName(), + getSampleBlockForColumns(column_names), + context, + max_block_size, + ConnectionTimeouts::getHTTPTimeouts(context)); + + + auto column_defaults = getColumns().getDefaults(); if (column_defaults.empty()) return {block_input}; return {std::make_shared(block_input, column_defaults, context)}; @@ -202,27 +136,21 @@ BlockInputStreams StorageS3::read( class StorageS3BlockOutputStream : public IBlockOutputStream { public: - explicit StorageS3BlockOutputStream(StorageS3 & storage_) - : storage(storage_), lock(storage.rwlock) + StorageS3BlockOutputStream(const Poco::URI & uri, + const String & format, + const Block & sample_block_, + const Context & context, + const ConnectionTimeouts & timeouts) + : sample_block(sample_block_) { - if (storage.use_table_fd) - { - /** NOTE: Using real file binded to FD may be misleading: - * SELECT *; INSERT insert_data; SELECT *; last SELECT returns initil_fd_data + insert_data - * INSERT data; SELECT *; last SELECT returns only insert_data - */ - storage.table_fd_was_used = true; - write_buf = std::make_unique(storage.table_fd); - } - else - { - write_buf = std::make_unique(storage.path, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_APPEND | O_CREAT); - } - - writer = FormatFactory::instance().getOutput(storage.format_name, *write_buf, storage.getSampleBlock(), 
storage.context_global); + write_buf = std::make_unique(uri, Poco::Net::HTTPRequest::HTTP_POST, timeouts); + writer = FormatFactory::instance().getOutput(format, *write_buf, sample_block, context); } - Block getHeader() const override { return storage.getSampleBlock(); } + Block getHeader() const override + { + return sample_block; + } void write(const Block & block) override { @@ -237,17 +165,13 @@ public: void writeSuffix() override { writer->writeSuffix(); - } - - void flush() override - { writer->flush(); + write_buf->finalize(); } private: - StorageS3 & storage; - std::unique_lock lock; - std::unique_ptr write_buf; + Block sample_block; + std::unique_ptr write_buf; BlockOutputStreamPtr writer; }; @@ -255,7 +179,8 @@ BlockOutputStreamPtr StorageS3::write( const ASTPtr & /*query*/, const Context & /*context*/) { - return std::make_shared(*this); + return std::make_shared( + uri, format_name, getSampleBlock(), context_global, ConnectionTimeouts::getHTTPTimeouts(context_global)); } @@ -265,19 +190,7 @@ void StorageS3::drop() } -void StorageS3::rename(const String & new_path_to_db, const String & /*new_database_name*/, const String & new_table_name) -{ - if (!is_db_table) - throw Exception("Can't rename table '" + table_name + "' binded to user-defined file (or FD)", ErrorCodes::DATABASE_ACCESS_DENIED); - - std::unique_lock lock(rwlock); - - std::string path_new = getTablePath(new_path_to_db, new_table_name, format_name); - Poco::File(Poco::Path(path_new).parent()).createDirectories(); - Poco::File(path).renameTo(path_new); - - path = std::move(path_new); -} +void StorageS3::rename(const String & /*new_path_to_db*/, const String & /*new_database_name*/, const String & /*new_table_name*/) {} void registerStorageS3(StorageFactory & factory) @@ -286,49 +199,30 @@ void registerStorageS3(StorageFactory & factory) { ASTs & engine_args = args.engine_args; - if (!(engine_args.size() == 1 || engine_args.size() == 2)) + if (!(engine_args.size() == 2)) throw Exception( - "Storage S3 requires 1 or 2 arguments: name of used format and source.", + "Storage S3 requires 2 arguments: name of used format and source.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); engine_args[0] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[0], args.local_context); String format_name = engine_args[0]->as().value.safeGet(); - int source_fd = -1; String source_path; - if (engine_args.size() >= 2) + if (const auto * literal = engine_args[1]->as()) { - /// Will use FD if engine_args[1] is int literal or identifier with std* name - - if (auto opt_name = getIdentifierName(engine_args[1])) + auto type = literal->value.getType(); + if (type == Field::Types::String) { - if (*opt_name == "stdin") - source_fd = STDIN_FILENO; - else if (*opt_name == "stdout") - source_fd = STDOUT_FILENO; - else if (*opt_name == "stderr") - source_fd = STDERR_FILENO; - else - throw Exception("Unknown identifier '" + *opt_name + "' in second arg of File storage constructor", - ErrorCodes::UNKNOWN_IDENTIFIER); - } - else if (const auto * literal = engine_args[1]->as()) - { - auto type = literal->value.getType(); - if (type == Field::Types::Int64) - source_fd = static_cast(literal->value.get()); - else if (type == Field::Types::UInt64) - source_fd = static_cast(literal->value.get()); - else if (type == Field::Types::String) - source_path = literal->value.get(); + source_path = literal->value.get(); + return StorageS3::create( + source_path, + args.table_name, format_name, args.columns, + args.context); } } - return StorageS3::create( - 
source_path, source_fd, - args.data_path, - args.table_name, format_name, args.columns, - args.context); + throw Exception("Unknown entity in first arg of S3 storage constructor, String expected.", + ErrorCodes::UNKNOWN_IDENTIFIER); }); } diff --git a/dbms/src/Storages/StorageS3.h b/dbms/src/Storages/StorageS3.h index 9414fa70cbf..263e5033962 100644 --- a/dbms/src/Storages/StorageS3.h +++ b/dbms/src/Storages/StorageS3.h @@ -4,6 +4,7 @@ #include #include +#include #include @@ -47,8 +48,6 @@ public: void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name) override; - String getDataPath() const override { return path; } - protected: friend class StorageS3BlockInputStream; friend class StorageS3BlockOutputStream; @@ -59,9 +58,7 @@ protected: - create own table inside data/db/table/ */ StorageS3( - const std::string & table_path_, - int table_fd_, - const std::string & db_dir_path, + const std::string & table_uri_, const std::string & table_name_, const std::string & format_name_, const ColumnsDescription & columns_, @@ -73,13 +70,9 @@ private: std::string format_name; Context & context_global; - std::string path; - int table_fd = -1; + Poco::URI uri; bool is_db_table = true; /// Table is stored in real database, not user's file - bool use_table_fd = false; /// Use table_fd insted of path - std::atomic table_fd_was_used{false}; /// To detect repeating reads from stdin - off_t table_fd_init_offset = -1; /// Initial position of fd, used for repeating reads mutable std::shared_mutex rwlock; From ff09934219ba99af077b4d5665fa664f290769a8 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Fri, 31 May 2019 07:27:14 +0000 Subject: [PATCH 044/309] Table function and storage. --- dbms/src/Storages/StorageS3.cpp | 292 ++++++++---------- dbms/src/Storages/StorageS3.h | 102 +++--- dbms/src/Storages/registerStorages.cpp | 2 + dbms/src/TableFunctions/TableFunctionS3.cpp | 19 ++ dbms/src/TableFunctions/TableFunctionS3.h | 25 ++ .../TableFunctions/registerTableFunctions.cpp | 2 + 6 files changed, 232 insertions(+), 210 deletions(-) create mode 100644 dbms/src/TableFunctions/TableFunctionS3.cpp create mode 100644 dbms/src/TableFunctions/TableFunctionS3.h diff --git a/dbms/src/Storages/StorageS3.cpp b/dbms/src/Storages/StorageS3.cpp index 080c1c7069e..f49cd9e7a9e 100644 --- a/dbms/src/Storages/StorageS3.cpp +++ b/dbms/src/Storages/StorageS3.cpp @@ -1,126 +1,176 @@ -#include #include +#include #include #include - #include -#include #include #include #include -#include + #include +#include #include -#include -#include - -#include -#include #include + namespace DB { - namespace ErrorCodes { - extern const int CANNOT_WRITE_TO_FILE_DESCRIPTOR; - extern const int CANNOT_SEEK_THROUGH_FILE; - extern const int DATABASE_ACCESS_DENIED; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int UNKNOWN_IDENTIFIER; - extern const int INCORRECT_FILE_NAME; - extern const int FILE_DOESNT_EXIST; - extern const int EMPTY_LIST_OF_COLUMNS_PASSED; } - -StorageS3::StorageS3( - const std::string & table_uri_, - const std::string & table_name_, - const std::string & format_name_, - const ColumnsDescription & columns_, - Context & context_) - : IStorage(columns_) - , table_name(table_name_) - , format_name(format_name_) - , context_global(context_) - , uri(table_uri_) +IStorageS3Base::IStorageS3Base(const Poco::URI & uri_, + const Context & context_, + const std::string & table_name_, + const String & format_name_, + const ColumnsDescription & columns_) + : 
IStorage(columns_), uri(uri_), context_global(context_), format_name(format_name_), table_name(table_name_) { } - -class StorageS3BlockInputStream : public IBlockInputStream +namespace { -public: - StorageS3BlockInputStream(const Poco::URI & uri, - const std::string & method, - std::function callback, - const String & format, - const String & name_, - const Block & sample_block, - const Context & context, - UInt64 max_block_size, - const ConnectionTimeouts & timeouts) - : name(name_) + class StorageS3BlockInputStream : public IBlockInputStream { - read_buf = std::make_unique(uri, method, callback, timeouts); + public: + StorageS3BlockInputStream(const Poco::URI & uri, + const std::string & method, + std::function callback, + const String & format, + const String & name_, + const Block & sample_block, + const Context & context, + UInt64 max_block_size, + const ConnectionTimeouts & timeouts) + : name(name_) + { + read_buf = std::make_unique(uri, method, callback, timeouts); - reader = FormatFactory::instance().getInput(format, *read_buf, sample_block, context, max_block_size); - } + reader = FormatFactory::instance().getInput(format, *read_buf, sample_block, context, max_block_size); + } - String getName() const override + String getName() const override + { + return name; + } + + Block readImpl() override + { + return reader->read(); + } + + Block getHeader() const override + { + return reader->getHeader(); + } + + void readPrefixImpl() override + { + reader->readPrefix(); + } + + void readSuffixImpl() override + { + reader->readSuffix(); + } + + private: + String name; + std::unique_ptr read_buf; + BlockInputStreamPtr reader; + }; + + class StorageS3BlockOutputStream : public IBlockOutputStream { - return name; - } + public: + StorageS3BlockOutputStream(const Poco::URI & uri, + const String & format, + const Block & sample_block_, + const Context & context, + const ConnectionTimeouts & timeouts) + : sample_block(sample_block_) + { + write_buf = std::make_unique(uri, Poco::Net::HTTPRequest::HTTP_POST, timeouts); + writer = FormatFactory::instance().getOutput(format, *write_buf, sample_block, context); + } - Block readImpl() override - { - return reader->read(); - } + Block getHeader() const override + { + return sample_block; + } - Block getHeader() const override - { - return reader->getHeader(); - } + void write(const Block & block) override + { + writer->write(block); + } - void readPrefixImpl() override - { - reader->readPrefix(); - } + void writePrefix() override + { + writer->writePrefix(); + } - void readSuffixImpl() override - { - reader->readSuffix(); - } + void writeSuffix() override + { + writer->writeSuffix(); + writer->flush(); + write_buf->finalize(); + } -private: - String name; - std::unique_ptr read_buf; - BlockInputStreamPtr reader; -}; + private: + Block sample_block; + std::unique_ptr write_buf; + BlockOutputStreamPtr writer; + }; +} -BlockInputStreams StorageS3::read( - const Names & column_names, +std::string IStorageS3Base::getReadMethod() const +{ + return Poco::Net::HTTPRequest::HTTP_GET; +} + +std::vector> IStorageS3Base::getReadURIParams(const Names & /*column_names*/, const SelectQueryInfo & /*query_info*/, + const Context & /*context*/, + QueryProcessingStage::Enum & /*processed_stage*/, + size_t /*max_block_size*/) const +{ + return {}; +} + +std::function IStorageS3Base::getReadPOSTDataCallback(const Names & /*column_names*/, + const SelectQueryInfo & /*query_info*/, + const Context & /*context*/, + QueryProcessingStage::Enum & /*processed_stage*/, + size_t 
/*max_block_size*/) const +{ + return nullptr; +} + + +BlockInputStreams IStorageS3Base::read(const Names & column_names, + const SelectQueryInfo & query_info, const Context & context, - QueryProcessingStage::Enum /*processed_stage*/, + QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned /*num_streams*/) { auto request_uri = uri; + auto params = getReadURIParams(column_names, query_info, context, processed_stage, max_block_size); + for (const auto & [param, value] : params) + request_uri.addQueryParameter(param, value); BlockInputStreamPtr block_input = std::make_shared(request_uri, - Poco::Net::HTTPRequest::HTTP_GET, - nullptr, - //getReadPOSTDataCallback(column_names, query_info, context, processed_stage, max_block_size), + getReadMethod(), + getReadPOSTDataCallback(column_names, query_info, context, processed_stage, max_block_size), format_name, getName(), - getSampleBlockForColumns(column_names), + getHeaderBlock(column_names), context, max_block_size, ConnectionTimeouts::getHTTPTimeouts(context)); @@ -132,98 +182,16 @@ BlockInputStreams StorageS3::read( return {std::make_shared(block_input, column_defaults, context)}; } +void IStorageS3Base::rename(const String & /*new_path_to_db*/, const String & /*new_database_name*/, const String & /*new_table_name*/) {} -class StorageS3BlockOutputStream : public IBlockOutputStream -{ -public: - StorageS3BlockOutputStream(const Poco::URI & uri, - const String & format, - const Block & sample_block_, - const Context & context, - const ConnectionTimeouts & timeouts) - : sample_block(sample_block_) - { - write_buf = std::make_unique(uri, Poco::Net::HTTPRequest::HTTP_POST, timeouts); - writer = FormatFactory::instance().getOutput(format, *write_buf, sample_block, context); - } - - Block getHeader() const override - { - return sample_block; - } - - void write(const Block & block) override - { - writer->write(block); - } - - void writePrefix() override - { - writer->writePrefix(); - } - - void writeSuffix() override - { - writer->writeSuffix(); - writer->flush(); - write_buf->finalize(); - } - -private: - Block sample_block; - std::unique_ptr write_buf; - BlockOutputStreamPtr writer; -}; - -BlockOutputStreamPtr StorageS3::write( - const ASTPtr & /*query*/, - const Context & /*context*/) +BlockOutputStreamPtr IStorageS3Base::write(const ASTPtr & /*query*/, const Context & /*context*/) { return std::make_shared( uri, format_name, getSampleBlock(), context_global, ConnectionTimeouts::getHTTPTimeouts(context_global)); } - -void StorageS3::drop() +void registerStorageS3(StorageFactory & /*factory*/) { - /// Extra actions are not required. + // TODO. See #1394. 
} - - -void StorageS3::rename(const String & /*new_path_to_db*/, const String & /*new_database_name*/, const String & /*new_table_name*/) {} - - -void registerStorageS3(StorageFactory & factory) -{ - factory.registerStorage("S3", [](const StorageFactory::Arguments & args) - { - ASTs & engine_args = args.engine_args; - - if (!(engine_args.size() == 2)) - throw Exception( - "Storage S3 requires 2 arguments: name of used format and source.", - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - - engine_args[0] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[0], args.local_context); - String format_name = engine_args[0]->as().value.safeGet(); - - String source_path; - if (const auto * literal = engine_args[1]->as()) - { - auto type = literal->value.getType(); - if (type == Field::Types::String) - { - source_path = literal->value.get(); - return StorageS3::create( - source_path, - args.table_name, format_name, args.columns, - args.context); - } - } - - throw Exception("Unknown entity in first arg of S3 storage constructor, String expected.", - ErrorCodes::UNKNOWN_IDENTIFIER); - }); -} - } diff --git a/dbms/src/Storages/StorageS3.h b/dbms/src/Storages/StorageS3.h index 263e5033962..2615563b57c 100644 --- a/dbms/src/Storages/StorageS3.h +++ b/dbms/src/Storages/StorageS3.h @@ -1,82 +1,88 @@ #pragma once #include - -#include -#include #include - #include - -#include -#include #include - namespace DB { - -class StorageS3BlockInputStream; -class StorageS3BlockOutputStream; - -class StorageS3 : public ext::shared_ptr_helper, public IStorage +/** + * This class represents table engine for external urls. + * It sends HTTP GET to server when select is called and + * HTTP POST when insert is called. In POST request the data is send + * using Chunked transfer encoding, so server have to support it. 
+ */ +class IStorageS3Base : public IStorage { public: - std::string getName() const override - { - return "S3"; - } - - std::string getTableName() const override + String getTableName() const override { return table_name; } - BlockInputStreams read( - const Names & column_names, + BlockInputStreams read(const Names & column_names, const SelectQueryInfo & query_info, const Context & context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned num_streams) override; - BlockOutputStreamPtr write( - const ASTPtr & query, - const Context & context) override; - - void drop() override; + BlockOutputStreamPtr write(const ASTPtr & query, const Context & context) override; void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name) override; protected: - friend class StorageS3BlockInputStream; - friend class StorageS3BlockOutputStream; - - /** there are three options (ordered by priority): - - use specified file descriptor if (fd >= 0) - - use specified table_path if it isn't empty - - create own table inside data/db/table/ - */ - StorageS3( - const std::string & table_uri_, + IStorageS3Base(const Poco::URI & uri_, + const Context & context_, const std::string & table_name_, - const std::string & format_name_, - const ColumnsDescription & columns_, - Context & context_); - -private: - - std::string table_name; - std::string format_name; - Context & context_global; + const String & format_name_, + const ColumnsDescription & columns_); Poco::URI uri; + const Context & context_global; - bool is_db_table = true; /// Table is stored in real database, not user's file +private: + String format_name; + String table_name; - mutable std::shared_mutex rwlock; + virtual std::string getReadMethod() const; - Logger * log = &Logger::get("StorageS3"); + virtual std::vector> getReadURIParams(const Names & column_names, + const SelectQueryInfo & query_info, + const Context & context, + QueryProcessingStage::Enum & processed_stage, + size_t max_block_size) const; + + virtual std::function getReadPOSTDataCallback(const Names & column_names, + const SelectQueryInfo & query_info, + const Context & context, + QueryProcessingStage::Enum & processed_stage, + size_t max_block_size) const; + + virtual Block getHeaderBlock(const Names & column_names) const = 0; }; +class StorageS3 : public ext::shared_ptr_helper, public IStorageS3Base +{ +public: + StorageS3(const Poco::URI & uri_, + const std::string & table_name_, + const String & format_name_, + const ColumnsDescription & columns_, + Context & context_) + : IStorageS3Base(uri_, context_, table_name_, format_name_, columns_) + { + } + + String getName() const override + { + return "S3"; + } + + Block getHeaderBlock(const Names & /*column_names*/) const override + { + return getSampleBlock(); + } +}; } diff --git a/dbms/src/Storages/registerStorages.cpp b/dbms/src/Storages/registerStorages.cpp index c21156ea44d..4c29884dfcf 100644 --- a/dbms/src/Storages/registerStorages.cpp +++ b/dbms/src/Storages/registerStorages.cpp @@ -19,6 +19,7 @@ void registerStorageDistributed(StorageFactory & factory); void registerStorageMemory(StorageFactory & factory); void registerStorageFile(StorageFactory & factory); void registerStorageURL(StorageFactory & factory); +void registerStorageS3(StorageFactory & factory); void registerStorageDictionary(StorageFactory & factory); void registerStorageSet(StorageFactory & factory); void registerStorageJoin(StorageFactory & factory); @@ -60,6 +61,7 @@ void registerStorages() 
registerStorageMemory(factory); registerStorageFile(factory); registerStorageURL(factory); + registerStorageS3(factory); registerStorageDictionary(factory); registerStorageSet(factory); registerStorageJoin(factory); diff --git a/dbms/src/TableFunctions/TableFunctionS3.cpp b/dbms/src/TableFunctions/TableFunctionS3.cpp new file mode 100644 index 00000000000..5c2c6215765 --- /dev/null +++ b/dbms/src/TableFunctions/TableFunctionS3.cpp @@ -0,0 +1,19 @@ +#include +#include +#include +#include + +namespace DB +{ +StoragePtr TableFunctionS3::getStorage( + const String & source, const String & format, const Block & sample_block, Context & global_context) const +{ + Poco::URI uri(source); + return StorageS3::create(uri, getName(), format, ColumnsDescription{sample_block.getNamesAndTypesList()}, global_context); +} + +void registerTableFunctionS3(TableFunctionFactory & factory) +{ + factory.registerFunction(); +} +} diff --git a/dbms/src/TableFunctions/TableFunctionS3.h b/dbms/src/TableFunctions/TableFunctionS3.h new file mode 100644 index 00000000000..83c49e0b8d1 --- /dev/null +++ b/dbms/src/TableFunctions/TableFunctionS3.h @@ -0,0 +1,25 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ +/* url(source, format, structure) - creates a temporary storage from url + */ +class TableFunctionS3 : public ITableFunctionFileLike +{ +public: + static constexpr auto name = "s3"; + std::string getName() const override + { + return name; + } + +private: + StoragePtr getStorage( + const String & source, const String & format, const Block & sample_block, Context & global_context) const override; +}; +} diff --git a/dbms/src/TableFunctions/registerTableFunctions.cpp b/dbms/src/TableFunctions/registerTableFunctions.cpp index 61d0ec23f7d..aad5eebe935 100644 --- a/dbms/src/TableFunctions/registerTableFunctions.cpp +++ b/dbms/src/TableFunctions/registerTableFunctions.cpp @@ -11,6 +11,7 @@ void registerTableFunctionMerge(TableFunctionFactory & factory); void registerTableFunctionRemote(TableFunctionFactory & factory); void registerTableFunctionNumbers(TableFunctionFactory & factory); void registerTableFunctionFile(TableFunctionFactory & factory); +void registerTableFunctionS3(TableFunctionFactory & factory); void registerTableFunctionURL(TableFunctionFactory & factory); void registerTableFunctionValues(TableFunctionFactory & factory); @@ -37,6 +38,7 @@ void registerTableFunctions() registerTableFunctionRemote(factory); registerTableFunctionNumbers(factory); registerTableFunctionFile(factory); + registerTableFunctionS3(factory); registerTableFunctionURL(factory); registerTableFunctionValues(factory); From 78f57c5f2a6aa1c7a4dfb87d31b7c5702e60eaec Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Fri, 31 May 2019 10:58:43 +0000 Subject: [PATCH 045/309] First attempt to make redirects. 
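
This patch teaches the new S3 buffer to follow HTTP 307 redirects by retrying the request against the `Location` header, up to a small fixed limit. As a rough illustration of that control flow (not the patch itself), here is a minimal Python sketch; the function name `fetch` and the plain-HTTP, unsigned-request assumptions are mine, and a real S3 endpoint would normally require HTTPS and signed requests.

```python
# Illustrative sketch of the redirect-following loop added in this patch
# (ReadWriteBufferFromS3): resend the request to the "Location" target while
# the server answers 307 Temporary Redirect, up to a fixed number of attempts.
import http.client
from urllib.parse import urlsplit

MAX_FOLLOW_REDIRECT = 2  # mirrors DEFAULT_S3_MAX_FOLLOW_REDIRECT

def fetch(url, method="GET", body=None):
    response = None
    for _ in range(MAX_FOLLOW_REDIRECT):
        parts = urlsplit(url)
        path = parts.path or "/"
        if parts.query:
            path += "?" + parts.query
        conn = http.client.HTTPConnection(parts.hostname, parts.port or 80)
        conn.request(method, path, body=body)
        response = conn.getresponse()
        if response.status != 307:
            break
        location = response.getheader("Location")
        if location is None:
            break
        url = location  # retry against the redirect target with a fresh connection
    return response
```

Patch 048 later moves this same loop out of the header template and into `ReadWriteBufferFromS3.cpp` without changing its behaviour.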
--- dbms/src/IO/HTTPCommon.cpp | 6 +- dbms/src/IO/HTTPCommon.h | 2 + dbms/src/IO/ReadWriteBufferFromS3.cpp | 29 +++++ dbms/src/IO/ReadWriteBufferFromS3.h | 147 ++++++++++++++++++++++++++ dbms/src/Storages/StorageS3.cpp | 11 +- 5 files changed, 188 insertions(+), 7 deletions(-) create mode 100644 dbms/src/IO/ReadWriteBufferFromS3.cpp create mode 100644 dbms/src/IO/ReadWriteBufferFromS3.h diff --git a/dbms/src/IO/HTTPCommon.cpp b/dbms/src/IO/HTTPCommon.cpp index ca5b5ab700b..53d2ac8c2dd 100644 --- a/dbms/src/IO/HTTPCommon.cpp +++ b/dbms/src/IO/HTTPCommon.cpp @@ -217,6 +217,11 @@ std::istream * receiveResponse( Poco::Net::HTTPClientSession & session, const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response) { auto istr = &session.receiveResponse(response); + assertResponseIsOk(request, response, istr); + return istr; +} + +void assertResponseIsOk(const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, std::istream * istr) { auto status = response.getStatus(); if (status != Poco::Net::HTTPResponse::HTTP_OK) @@ -229,7 +234,6 @@ std::istream * receiveResponse( status == HTTP_TOO_MANY_REQUESTS ? ErrorCodes::RECEIVED_ERROR_TOO_MANY_REQUESTS : ErrorCodes::RECEIVED_ERROR_FROM_REMOTE_IO_SERVER); } - return istr; } } diff --git a/dbms/src/IO/HTTPCommon.h b/dbms/src/IO/HTTPCommon.h index 6dc669c248e..1e7500cf230 100644 --- a/dbms/src/IO/HTTPCommon.h +++ b/dbms/src/IO/HTTPCommon.h @@ -57,4 +57,6 @@ PooledHTTPSessionPtr makePooledHTTPSession(const Poco::URI & uri, const Connecti */ std::istream * receiveResponse( Poco::Net::HTTPClientSession & session, const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response); +void assertResponseIsOk(const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, std::istream * istr); + } diff --git a/dbms/src/IO/ReadWriteBufferFromS3.cpp b/dbms/src/IO/ReadWriteBufferFromS3.cpp new file mode 100644 index 00000000000..0ead1bdf32d --- /dev/null +++ b/dbms/src/IO/ReadWriteBufferFromS3.cpp @@ -0,0 +1,29 @@ +#include + +#include + + +namespace DB +{ + +WriteBufferFromS3::WriteBufferFromS3( + const Poco::URI & uri, const std::string & method, const ConnectionTimeouts & timeouts, size_t buffer_size_) + : WriteBufferFromOStream(buffer_size_) + , session{makeHTTPSession(uri, timeouts)} + , request{method, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1} +{ + request.setHost(uri.getHost()); + request.setChunkedTransferEncoding(true); + + LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << uri.toString()); + + ostr = &session->sendRequest(request); +} + +void WriteBufferFromS3::finalize() +{ + receiveResponse(*session, request, response); + /// TODO: Response body is ignored. +} + +} diff --git a/dbms/src/IO/ReadWriteBufferFromS3.h b/dbms/src/IO/ReadWriteBufferFromS3.h new file mode 100644 index 00000000000..1257031ae2a --- /dev/null +++ b/dbms/src/IO/ReadWriteBufferFromS3.h @@ -0,0 +1,147 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define DEFAULT_S3_READ_BUFFER_TIMEOUT 1800 +#define DEFAULT_S3_READ_BUFFER_CONNECTION_TIMEOUT 1 +#define DEFAULT_S3_MAX_FOLLOW_REDIRECT 2 + +namespace DB +{ +/** Perform S3 HTTP POST request and provide response to read. 
+ */ + +namespace detail +{ + template + class ReadWriteBufferFromS3Base : public ReadBuffer + { + protected: + Poco::URI uri; + std::string method; + + SessionPtr session; + std::istream * istr; /// owned by session + std::unique_ptr impl; + + public: + using OutStreamCallback = std::function; + + explicit ReadWriteBufferFromS3Base(SessionPtr session_, + Poco::URI uri, + const std::string & method = {}, + OutStreamCallback out_stream_callback = {}, + const Poco::Net::HTTPBasicCredentials & credentials = {}, + size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE) + : ReadBuffer(nullptr, 0) + , uri {uri} + , method {!method.empty() ? method : out_stream_callback ? Poco::Net::HTTPRequest::HTTP_POST : Poco::Net::HTTPRequest::HTTP_GET} + , session {session_} + { + Poco::Net::HTTPResponse response; + std::unique_ptr request; + + for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_REDIRECT; ++i) + { + // With empty path poco will send "POST HTTP/1.1" its bug. + if (uri.getPath().empty()) + uri.setPath("/"); + + request = std::make_unique(method, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); + request->setHost(uri.getHost()); // use original, not resolved host name in header + + if (out_stream_callback) + request->setChunkedTransferEncoding(true); + + if (!credentials.getUsername().empty()) + credentials.authenticate(*request); + + LOG_TRACE((&Logger::get("ReadWriteBufferFromS3")), "Sending request to " << uri.toString()); + + auto & stream_out = session->sendRequest(*request); + + if (out_stream_callback) + out_stream_callback(stream_out); + + istr = &session->receiveResponse(response); + + if (response.getStatus() != 307) + break; + + auto location_iterator = response.find("Location"); + if (location_iterator == response.end()) + break; + + uri = location_iterator->second; + } + + assertResponseIsOk(*request, response, istr); + impl = std::make_unique(*istr, buffer_size_); + } + + + bool nextImpl() override + { + if (!impl->next()) + return false; + internal_buffer = impl->buffer(); + working_buffer = internal_buffer; + return true; + } + }; +} + +class ReadWriteBufferFromS3 : public detail::ReadWriteBufferFromS3Base +{ + using Parent = detail::ReadWriteBufferFromS3Base; + +public: + explicit ReadWriteBufferFromS3(Poco::URI uri_, + const std::string & method_ = {}, + OutStreamCallback out_stream_callback = {}, + const ConnectionTimeouts & timeouts = {}, + const Poco::Net::HTTPBasicCredentials & credentials = {}, + size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE) + : Parent(makeHTTPSession(uri_, timeouts), uri_, method_, out_stream_callback, credentials, buffer_size_) + { + } +}; + +/* Perform S3 HTTP POST/PUT request. + */ +class WriteBufferFromS3 : public WriteBufferFromOStream +{ +private: + HTTPSessionPtr session; + Poco::Net::HTTPRequest request; + Poco::Net::HTTPResponse response; + +public: + explicit WriteBufferFromS3(const Poco::URI & uri, + const std::string & method = Poco::Net::HTTPRequest::HTTP_POST, // POST or PUT only + const ConnectionTimeouts & timeouts = {}, + size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE); + + /// Receives response from the server after sending all data. 
+ void finalize(); +}; + +} diff --git a/dbms/src/Storages/StorageS3.cpp b/dbms/src/Storages/StorageS3.cpp index f49cd9e7a9e..972820a8449 100644 --- a/dbms/src/Storages/StorageS3.cpp +++ b/dbms/src/Storages/StorageS3.cpp @@ -5,8 +5,7 @@ #include #include -#include -#include +#include #include @@ -49,7 +48,7 @@ namespace const ConnectionTimeouts & timeouts) : name(name_) { - read_buf = std::make_unique(uri, method, callback, timeouts); + read_buf = std::make_unique(uri, method, callback, timeouts); reader = FormatFactory::instance().getInput(format, *read_buf, sample_block, context, max_block_size); } @@ -81,7 +80,7 @@ namespace private: String name; - std::unique_ptr read_buf; + std::unique_ptr read_buf; BlockInputStreamPtr reader; }; @@ -95,7 +94,7 @@ namespace const ConnectionTimeouts & timeouts) : sample_block(sample_block_) { - write_buf = std::make_unique(uri, Poco::Net::HTTPRequest::HTTP_POST, timeouts); + write_buf = std::make_unique(uri, Poco::Net::HTTPRequest::HTTP_POST, timeouts); writer = FormatFactory::instance().getOutput(format, *write_buf, sample_block, context); } @@ -123,7 +122,7 @@ namespace private: Block sample_block; - std::unique_ptr write_buf; + std::unique_ptr write_buf; BlockOutputStreamPtr writer; }; } From caeacafb7676ac7e6ee5d17e1b4dbf75045cc5c2 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Fri, 31 May 2019 15:50:21 +0000 Subject: [PATCH 046/309] Fixed GET redirects. --- dbms/src/IO/HTTPCommon.cpp | 2 +- dbms/src/IO/ReadWriteBufferFromS3.h | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/dbms/src/IO/HTTPCommon.cpp b/dbms/src/IO/HTTPCommon.cpp index 53d2ac8c2dd..32970276dd4 100644 --- a/dbms/src/IO/HTTPCommon.cpp +++ b/dbms/src/IO/HTTPCommon.cpp @@ -45,7 +45,7 @@ namespace ErrorCodes namespace { -void setTimeouts(Poco::Net::HTTPClientSession & session, const ConnectionTimeouts & timeouts) + void setTimeouts(Poco::Net::HTTPClientSession & session, const ConnectionTimeouts & timeouts) { #if defined(POCO_CLICKHOUSE_PATCH) || POCO_VERSION >= 0x02000000 session.setTimeout(timeouts.connection_timeout, timeouts.send_timeout, timeouts.receive_timeout); diff --git a/dbms/src/IO/ReadWriteBufferFromS3.h b/dbms/src/IO/ReadWriteBufferFromS3.h index 1257031ae2a..01fb0fb890a 100644 --- a/dbms/src/IO/ReadWriteBufferFromS3.h +++ b/dbms/src/IO/ReadWriteBufferFromS3.h @@ -31,7 +31,7 @@ namespace DB namespace detail { - template + template //FIXME Можно избавиться от template, или переделать на нормальное. class ReadWriteBufferFromS3Base : public ReadBuffer { protected: @@ -45,8 +45,8 @@ namespace detail public: using OutStreamCallback = std::function; - explicit ReadWriteBufferFromS3Base(SessionPtr session_, - Poco::URI uri, + explicit ReadWriteBufferFromS3Base(Poco::URI uri, + const ConnectionTimeouts & timeouts = {}, const std::string & method = {}, OutStreamCallback out_stream_callback = {}, const Poco::Net::HTTPBasicCredentials & credentials = {}, @@ -54,7 +54,7 @@ namespace detail : ReadBuffer(nullptr, 0) , uri {uri} , method {!method.empty() ? method : out_stream_callback ? 
Poco::Net::HTTPRequest::HTTP_POST : Poco::Net::HTTPRequest::HTTP_GET} - , session {session_} + , session(makeHTTPSession(uri, timeouts)) { Poco::Net::HTTPResponse response; std::unique_ptr request; @@ -91,6 +91,7 @@ namespace detail break; uri = location_iterator->second; + session = makeHTTPSession(uri, timeouts); } assertResponseIsOk(*request, response, istr); @@ -120,7 +121,7 @@ public: const ConnectionTimeouts & timeouts = {}, const Poco::Net::HTTPBasicCredentials & credentials = {}, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE) - : Parent(makeHTTPSession(uri_, timeouts), uri_, method_, out_stream_callback, credentials, buffer_size_) + : Parent(uri_, timeouts, method_, out_stream_callback, credentials, buffer_size_) { } }; From 531460396dcc8f059c447028f3b2c3c5b64921b3 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Fri, 31 May 2019 17:57:58 +0000 Subject: [PATCH 047/309] POST S3 requests. --- dbms/src/IO/ReadWriteBufferFromS3.cpp | 1 + dbms/src/IO/ReadWriteBufferFromS3.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dbms/src/IO/ReadWriteBufferFromS3.cpp b/dbms/src/IO/ReadWriteBufferFromS3.cpp index 0ead1bdf32d..05fa3569572 100644 --- a/dbms/src/IO/ReadWriteBufferFromS3.cpp +++ b/dbms/src/IO/ReadWriteBufferFromS3.cpp @@ -14,6 +14,7 @@ WriteBufferFromS3::WriteBufferFromS3( { request.setHost(uri.getHost()); request.setChunkedTransferEncoding(true); + request.setExpectContinue(true); LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << uri.toString()); diff --git a/dbms/src/IO/ReadWriteBufferFromS3.h b/dbms/src/IO/ReadWriteBufferFromS3.h index 01fb0fb890a..e28e51fb89c 100644 --- a/dbms/src/IO/ReadWriteBufferFromS3.h +++ b/dbms/src/IO/ReadWriteBufferFromS3.h @@ -137,7 +137,7 @@ private: public: explicit WriteBufferFromS3(const Poco::URI & uri, - const std::string & method = Poco::Net::HTTPRequest::HTTP_POST, // POST or PUT only + const std::string & method = Poco::Net::HTTPRequest::HTTP_POST, // POST for inserting, PUT for replacing. const ConnectionTimeouts & timeouts = {}, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE); From 062db0ec141e14e096c9b472f8e9a296265e8bd1 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Fri, 31 May 2019 18:14:39 +0000 Subject: [PATCH 048/309] Removed templateness of ReadWriteBufferFromS3. --- dbms/src/IO/ReadWriteBufferFromS3.cpp | 63 +++++++++++++++++ dbms/src/IO/ReadWriteBufferFromS3.h | 99 ++++----------------------- 2 files changed, 76 insertions(+), 86 deletions(-) diff --git a/dbms/src/IO/ReadWriteBufferFromS3.cpp b/dbms/src/IO/ReadWriteBufferFromS3.cpp index 05fa3569572..f4b6124267f 100644 --- a/dbms/src/IO/ReadWriteBufferFromS3.cpp +++ b/dbms/src/IO/ReadWriteBufferFromS3.cpp @@ -6,6 +6,69 @@ namespace DB { +ReadWriteBufferFromS3::ReadWriteBufferFromS3(Poco::URI uri_, + const std::string & method_, + OutStreamCallback out_stream_callback, + const ConnectionTimeouts & timeouts, + const Poco::Net::HTTPBasicCredentials & credentials, + size_t buffer_size_) + : ReadBuffer(nullptr, 0) + , uri {uri_} + , method {!method_.empty() ? method_ : out_stream_callback ? Poco::Net::HTTPRequest::HTTP_POST : Poco::Net::HTTPRequest::HTTP_GET} + , session(makeHTTPSession(uri_, timeouts)) +{ + Poco::Net::HTTPResponse response; + std::unique_ptr request; + + for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_REDIRECT; ++i) + { + // With empty path poco will send "POST HTTP/1.1" its bug. 
+ if (uri.getPath().empty()) + uri.setPath("/"); + + request = std::make_unique(method, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); + request->setHost(uri.getHost()); // use original, not resolved host name in header + + if (out_stream_callback) + request->setChunkedTransferEncoding(true); + + if (!credentials.getUsername().empty()) + credentials.authenticate(*request); + + LOG_TRACE((&Logger::get("ReadWriteBufferFromS3")), "Sending request to " << uri.toString()); + + auto & stream_out = session->sendRequest(*request); + + if (out_stream_callback) + out_stream_callback(stream_out); + + istr = &session->receiveResponse(response); + + if (response.getStatus() != 307) + break; + + auto location_iterator = response.find("Location"); + if (location_iterator == response.end()) + break; + + uri = location_iterator->second; + session = makeHTTPSession(uri, timeouts); + } + + assertResponseIsOk(*request, response, istr); + impl = std::make_unique(*istr, buffer_size_); +} + + +bool ReadWriteBufferFromS3::nextImpl() +{ + if (!impl->next()) + return false; + internal_buffer = impl->buffer(); + working_buffer = internal_buffer; + return true; +} + WriteBufferFromS3::WriteBufferFromS3( const Poco::URI & uri, const std::string & method, const ConnectionTimeouts & timeouts, size_t buffer_size_) : WriteBufferFromOStream(buffer_size_) diff --git a/dbms/src/IO/ReadWriteBufferFromS3.h b/dbms/src/IO/ReadWriteBufferFromS3.h index e28e51fb89c..e33dbe80df0 100644 --- a/dbms/src/IO/ReadWriteBufferFromS3.h +++ b/dbms/src/IO/ReadWriteBufferFromS3.h @@ -29,103 +29,30 @@ namespace DB /** Perform S3 HTTP POST request and provide response to read. */ -namespace detail +class ReadWriteBufferFromS3 : public ReadBuffer { - template //FIXME Можно избавиться от template, или переделать на нормальное. - class ReadWriteBufferFromS3Base : public ReadBuffer - { - protected: - Poco::URI uri; - std::string method; +protected: + Poco::URI uri; + std::string method; - SessionPtr session; - std::istream * istr; /// owned by session - std::unique_ptr impl; - - public: - using OutStreamCallback = std::function; - - explicit ReadWriteBufferFromS3Base(Poco::URI uri, - const ConnectionTimeouts & timeouts = {}, - const std::string & method = {}, - OutStreamCallback out_stream_callback = {}, - const Poco::Net::HTTPBasicCredentials & credentials = {}, - size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE) - : ReadBuffer(nullptr, 0) - , uri {uri} - , method {!method.empty() ? method : out_stream_callback ? Poco::Net::HTTPRequest::HTTP_POST : Poco::Net::HTTPRequest::HTTP_GET} - , session(makeHTTPSession(uri, timeouts)) - { - Poco::Net::HTTPResponse response; - std::unique_ptr request; - - for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_REDIRECT; ++i) - { - // With empty path poco will send "POST HTTP/1.1" its bug. 
- if (uri.getPath().empty()) - uri.setPath("/"); - - request = std::make_unique(method, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); - request->setHost(uri.getHost()); // use original, not resolved host name in header - - if (out_stream_callback) - request->setChunkedTransferEncoding(true); - - if (!credentials.getUsername().empty()) - credentials.authenticate(*request); - - LOG_TRACE((&Logger::get("ReadWriteBufferFromS3")), "Sending request to " << uri.toString()); - - auto & stream_out = session->sendRequest(*request); - - if (out_stream_callback) - out_stream_callback(stream_out); - - istr = &session->receiveResponse(response); - - if (response.getStatus() != 307) - break; - - auto location_iterator = response.find("Location"); - if (location_iterator == response.end()) - break; - - uri = location_iterator->second; - session = makeHTTPSession(uri, timeouts); - } - - assertResponseIsOk(*request, response, istr); - impl = std::make_unique(*istr, buffer_size_); - } - - - bool nextImpl() override - { - if (!impl->next()) - return false; - internal_buffer = impl->buffer(); - working_buffer = internal_buffer; - return true; - } - }; -} - -class ReadWriteBufferFromS3 : public detail::ReadWriteBufferFromS3Base -{ - using Parent = detail::ReadWriteBufferFromS3Base; + HTTPSessionPtr session; + std::istream * istr; /// owned by session + std::unique_ptr impl; public: + using OutStreamCallback = std::function; + explicit ReadWriteBufferFromS3(Poco::URI uri_, const std::string & method_ = {}, OutStreamCallback out_stream_callback = {}, const ConnectionTimeouts & timeouts = {}, const Poco::Net::HTTPBasicCredentials & credentials = {}, - size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE) - : Parent(uri_, timeouts, method_, out_stream_callback, credentials, buffer_size_) - { - } + size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE); + + bool nextImpl() override; }; + /* Perform S3 HTTP POST/PUT request. 
*/ class WriteBufferFromS3 : public WriteBufferFromOStream From 52f242daf056a4d6725de1b87fd7436769ede79e Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Fri, 31 May 2019 18:16:40 +0000 Subject: [PATCH 049/309] tests (WIP) --- .../clickhouse-test | 126 ++++++++++++++++++ .../00950_table_function_s3_wip/config.xml | 115 ++++++++++++++++ 2 files changed, 241 insertions(+) create mode 100755 dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test create mode 100644 dbms/tests/queries/0_stateless/00950_table_function_s3_wip/config.xml diff --git a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test new file mode 100755 index 00000000000..09876ea0df6 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 + +import http.server +import os +import subprocess +import threading +import unittest + + +format = 'column1 UInt32, column2 UInt32, column3 UInt32' +values = '(1, 2, 3), (2, 3, 1), (78, 43, 45)' +redirecting_host = '127.0.0.1' +redirecting_to_http_port = 12345 +redirecting_to_https_port = 12346 +preserving_data_port = 12347 + +queries = [ + "select *, column1*column2*column3 from file('{}', 'CSV', '{}')".format(os.path.expanduser('~/test.csv'), format), + "select *, column1*column2*column3 from url('https://storage.yandexcloud.net/milovidov/test.csv', 'CSV', '{}')".format(format), + "select *, column1*column2*column3 from s3('http://storage.yandexcloud.net/milovidov/test.csv', 'CSV', '{}')".format(format), + "select *, column1*column2*column3 from s3('https://storage.yandexcloud.net/milovidov/test.csv', 'CSV', '{}')".format(format), + "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(redirecting_host, redirecting_to_http_port, format), + "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(redirecting_host, redirecting_to_https_port, format), +] + +put_queries = [ + "insert into table function s3('http://{}:{}/', 'CSV', '{}') values {}" + .format(redirecting_host, preserving_data_port, format, values), +] + + +class RedirectingToHTTPHTTPServer(http.server.BaseHTTPRequestHandler): + def do_GET(self): + self.send_response(307) + self.send_header('Content-type', 'text/xml') + self.send_header('Location', 'http://storage.yandexcloud.net/milovidov/test.csv') + self.end_headers() + self.wfile.write(bytes(r''' + + TemporaryRedirect + Please re-send this request to the specified temporary endpoint. + Continue to use the original request endpoint for future requests. + johnsmith.s3-gztb4pa9sq.amazonaws.com +''', "utf-8")) + + +class RedirectingToHTTPSHTTPServer(http.server.BaseHTTPRequestHandler): + def do_GET(self): + self.send_response(307) + self.send_header('Content-type', 'text/xml') + self.send_header('Location', 'https://storage.yandexcloud.net/milovidov/test.csv') + self.end_headers() + self.wfile.write(bytes(r''' + + TemporaryRedirect + Please re-send this request to the specified temporary endpoint. + Continue to use the original request endpoint for future requests. 
+ johnsmith.s3-gztb4pa9sq.amazonaws.com +''', "utf-8")) + + +received_data = [] + + +class PreservingDataServer(http.server.BaseHTTPRequestHandler): + def do_POST(self): + self.send_response(200) + self.send_header('Content-type', 'text/plain') + self.end_headers() + received_data.append(self.rfile.read()) + self.wfile.flush() + + +servers = [] +def redirecting_to_https_thread(): + server = http.server.HTTPServer((redirecting_host, redirecting_to_https_port), RedirectingToHTTPSHTTPServer) + servers.append(server) + server.handle_request() + +def redirecting_to_http_thread(): + server = http.server.HTTPServer((redirecting_host, redirecting_to_http_port), RedirectingToHTTPHTTPServer) + servers.append(server) + server.handle_request() + +def preserving_thread(): + server = http.server.HTTPServer((redirecting_host, preserving_data_port), PreservingDataServer) + servers.append(server) + server.handle_request() + + +jobs = [] +jobs.append(threading.Thread(target=redirecting_to_http_thread)) +jobs.append(threading.Thread(target=redirecting_to_https_thread)) +jobs.append(threading.Thread(target=preserving_thread)) +[ job.start() for job in jobs ] + +for query in queries: + print(query) + result = subprocess.run([ + os.path.expanduser('~/ClickHouse-bin/dbms/programs/clickhouse-local'), + '-c', + os.path.expanduser('~/config.xml'), + '-q', + query + ], stdout=subprocess.PIPE, universal_newlines=True) + result.check_returncode() + unittest.TestCase().assertEqual(list(map(str.split, result.stdout.splitlines())), [ + ['1', '2', '3', '6'], + ['3', '2', '1', '6'], + ['78', '43', '45', '150930'], + ]) + +for query in put_queries: + print(query) + result = subprocess.run([ + os.path.expanduser('~/ClickHouse-bin/dbms/programs/clickhouse-local'), + '-c', + os.path.expanduser('~/config.xml'), + '-q', + query + ], stdout=subprocess.PIPE, universal_newlines=True) + result.check_returncode() + unittest.TestCase().assertEqual(received_data[-1].decode(), '15\r\n1,2,3\n2,3,1\n78,43,45\n\r\n0\r\n\r\n') + +[ server.socket.close() for server in servers ] +[ job.join() for job in jobs ] diff --git a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/config.xml b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/config.xml new file mode 100644 index 00000000000..de0dd0a7087 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/config.xml @@ -0,0 +1,115 @@ + + default + + trace + /home/excitoon/clickhouse-server.log + /home/excitoon/clickhouse-server.err.log + 1000M + 10 + + + + + + 8 + + + + + + + + + + + ::/0 + + + + default + + + default + + + + + + + a = 1 + + + + + a + b < 1 or c - d > 5 + + + + + c = 1 + + + + + + + + + + + + + + + + 3600 + + + 0 + 0 + 0 + 0 + 0 + + + + From 1944ff1a48874123d4fc316415731663b6936e16 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Sat, 1 Jun 2019 21:18:20 +0000 Subject: [PATCH 050/309] POST to PUT, refactoring. 
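This commit splits the old ReadWriteBufferFromS3 into a GET-only ReadBufferFromS3 and a PUT-only WriteBufferFromS3, dropping the template parameter and the POST callback machinery. A minimal usage sketch of how the two classes combine after the split is given below; it is not part of the patch — the helper name copyS3Object is invented, and the include paths and default arguments are assumptions taken from the headers in the diff.

    #include <IO/ReadBufferFromS3.h>
    #include <IO/WriteBufferFromS3.h>
    #include <IO/ConnectionTimeouts.h>
    #include <IO/copyData.h>
    #include <Poco/URI.h>

    /// Hypothetical helper (not in the patch): stream one S3 object into another.
    void copyS3Object(const Poco::URI & src, const Poco::URI & dst, const DB::ConnectionTimeouts & timeouts)
    {
        DB::ReadBufferFromS3 in(src, timeouts);    /// HTTP GET, follows temporary redirects
        DB::WriteBufferFromS3 out(dst, timeouts);  /// HTTP PUT of everything written to the buffer
        DB::copyData(in, out);                     /// pump the object through ClickHouse buffers
        out.finalize();                            /// required: reads and checks the server response
    }

StorageS3.cpp in the diff below wires these same two buffers into the table engine's input and output streams.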
--- ...eBufferFromS3.cpp => ReadBufferFromS3.cpp} | 39 ++-------- dbms/src/IO/ReadBufferFromS3.h | 50 +++++++++++++ dbms/src/IO/WriteBufferFromS3.cpp | 30 ++++++++ ...riteBufferFromS3.h => WriteBufferFromS3.h} | 30 +------- dbms/src/Storages/StorageS3.cpp | 63 +++------------- dbms/src/Storages/StorageS3.h | 75 +++++++------------ 6 files changed, 124 insertions(+), 163 deletions(-) rename dbms/src/IO/{ReadWriteBufferFromS3.cpp => ReadBufferFromS3.cpp} (55%) create mode 100644 dbms/src/IO/ReadBufferFromS3.h create mode 100644 dbms/src/IO/WriteBufferFromS3.cpp rename dbms/src/IO/{ReadWriteBufferFromS3.h => WriteBufferFromS3.h} (57%) diff --git a/dbms/src/IO/ReadWriteBufferFromS3.cpp b/dbms/src/IO/ReadBufferFromS3.cpp similarity index 55% rename from dbms/src/IO/ReadWriteBufferFromS3.cpp rename to dbms/src/IO/ReadBufferFromS3.cpp index f4b6124267f..aa056191988 100644 --- a/dbms/src/IO/ReadWriteBufferFromS3.cpp +++ b/dbms/src/IO/ReadBufferFromS3.cpp @@ -1,4 +1,4 @@ -#include +#include #include @@ -6,15 +6,13 @@ namespace DB { -ReadWriteBufferFromS3::ReadWriteBufferFromS3(Poco::URI uri_, - const std::string & method_, - OutStreamCallback out_stream_callback, +ReadBufferFromS3::ReadBufferFromS3(Poco::URI uri_, const ConnectionTimeouts & timeouts, const Poco::Net::HTTPBasicCredentials & credentials, size_t buffer_size_) : ReadBuffer(nullptr, 0) , uri {uri_} - , method {!method_.empty() ? method_ : out_stream_callback ? Poco::Net::HTTPRequest::HTTP_POST : Poco::Net::HTTPRequest::HTTP_GET} + , method {Poco::Net::HTTPRequest::HTTP_GET} , session(makeHTTPSession(uri_, timeouts)) { Poco::Net::HTTPResponse response; @@ -29,18 +27,12 @@ ReadWriteBufferFromS3::ReadWriteBufferFromS3(Poco::URI uri_, request = std::make_unique(method, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); request->setHost(uri.getHost()); // use original, not resolved host name in header - if (out_stream_callback) - request->setChunkedTransferEncoding(true); - if (!credentials.getUsername().empty()) credentials.authenticate(*request); LOG_TRACE((&Logger::get("ReadWriteBufferFromS3")), "Sending request to " << uri.toString()); - auto & stream_out = session->sendRequest(*request); - - if (out_stream_callback) - out_stream_callback(stream_out); + session->sendRequest(*request); istr = &session->receiveResponse(response); @@ -60,7 +52,7 @@ ReadWriteBufferFromS3::ReadWriteBufferFromS3(Poco::URI uri_, } -bool ReadWriteBufferFromS3::nextImpl() +bool ReadBufferFromS3::nextImpl() { if (!impl->next()) return false; @@ -69,25 +61,4 @@ bool ReadWriteBufferFromS3::nextImpl() return true; } -WriteBufferFromS3::WriteBufferFromS3( - const Poco::URI & uri, const std::string & method, const ConnectionTimeouts & timeouts, size_t buffer_size_) - : WriteBufferFromOStream(buffer_size_) - , session{makeHTTPSession(uri, timeouts)} - , request{method, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1} -{ - request.setHost(uri.getHost()); - request.setChunkedTransferEncoding(true); - request.setExpectContinue(true); - - LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << uri.toString()); - - ostr = &session->sendRequest(request); -} - -void WriteBufferFromS3::finalize() -{ - receiveResponse(*session, request, response); - /// TODO: Response body is ignored. 
-} - } diff --git a/dbms/src/IO/ReadBufferFromS3.h b/dbms/src/IO/ReadBufferFromS3.h new file mode 100644 index 00000000000..bada3f76252 --- /dev/null +++ b/dbms/src/IO/ReadBufferFromS3.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define DEFAULT_S3_READ_BUFFER_TIMEOUT 1800 +#define DEFAULT_S3_READ_BUFFER_CONNECTION_TIMEOUT 1 +#define DEFAULT_S3_MAX_FOLLOW_REDIRECT 2 + +namespace DB +{ +/** Perform S3 HTTP GET request and provide response to read. + */ +class ReadBufferFromS3 : public ReadBuffer +{ +protected: + Poco::URI uri; + std::string method; + + HTTPSessionPtr session; + std::istream * istr; /// owned by session + std::unique_ptr impl; + +public: + explicit ReadBufferFromS3(Poco::URI uri_, + const ConnectionTimeouts & timeouts = {}, + const Poco::Net::HTTPBasicCredentials & credentials = {}, + size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE); + + bool nextImpl() override; +}; + +} diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp new file mode 100644 index 00000000000..94b07a6a8df --- /dev/null +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -0,0 +1,30 @@ +#include + +#include + + +namespace DB +{ + +WriteBufferFromS3::WriteBufferFromS3( + const Poco::URI & uri, const ConnectionTimeouts & timeouts, size_t buffer_size_) + : WriteBufferFromOStream(buffer_size_) + , session{makeHTTPSession(uri, timeouts)} + , request{Poco::Net::HTTPRequest::HTTP_PUT, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1} +{ + request.setHost(uri.getHost()); + request.setChunkedTransferEncoding(true); + request.setExpectContinue(true); + + LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << uri.toString()); + + ostr = &session->sendRequest(request); +} + +void WriteBufferFromS3::finalize() +{ + receiveResponse(*session, request, response); + /// TODO: Response body is ignored. +} + +} diff --git a/dbms/src/IO/ReadWriteBufferFromS3.h b/dbms/src/IO/WriteBufferFromS3.h similarity index 57% rename from dbms/src/IO/ReadWriteBufferFromS3.h rename to dbms/src/IO/WriteBufferFromS3.h index e33dbe80df0..ee8b2d1dde9 100644 --- a/dbms/src/IO/ReadWriteBufferFromS3.h +++ b/dbms/src/IO/WriteBufferFromS3.h @@ -26,34 +26,7 @@ namespace DB { -/** Perform S3 HTTP POST request and provide response to read. - */ - -class ReadWriteBufferFromS3 : public ReadBuffer -{ -protected: - Poco::URI uri; - std::string method; - - HTTPSessionPtr session; - std::istream * istr; /// owned by session - std::unique_ptr impl; - -public: - using OutStreamCallback = std::function; - - explicit ReadWriteBufferFromS3(Poco::URI uri_, - const std::string & method_ = {}, - OutStreamCallback out_stream_callback = {}, - const ConnectionTimeouts & timeouts = {}, - const Poco::Net::HTTPBasicCredentials & credentials = {}, - size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE); - - bool nextImpl() override; -}; - - -/* Perform S3 HTTP POST/PUT request. +/* Perform S3 HTTP PUT request. */ class WriteBufferFromS3 : public WriteBufferFromOStream { @@ -64,7 +37,6 @@ private: public: explicit WriteBufferFromS3(const Poco::URI & uri, - const std::string & method = Poco::Net::HTTPRequest::HTTP_POST, // POST for inserting, PUT for replacing. 
const ConnectionTimeouts & timeouts = {}, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE); diff --git a/dbms/src/Storages/StorageS3.cpp b/dbms/src/Storages/StorageS3.cpp index 972820a8449..f15e19d9396 100644 --- a/dbms/src/Storages/StorageS3.cpp +++ b/dbms/src/Storages/StorageS3.cpp @@ -5,7 +5,8 @@ #include #include -#include +#include +#include #include @@ -23,23 +24,12 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } -IStorageS3Base::IStorageS3Base(const Poco::URI & uri_, - const Context & context_, - const std::string & table_name_, - const String & format_name_, - const ColumnsDescription & columns_) - : IStorage(columns_), uri(uri_), context_global(context_), format_name(format_name_), table_name(table_name_) -{ -} - namespace { class StorageS3BlockInputStream : public IBlockInputStream { public: StorageS3BlockInputStream(const Poco::URI & uri, - const std::string & method, - std::function callback, const String & format, const String & name_, const Block & sample_block, @@ -48,7 +38,7 @@ namespace const ConnectionTimeouts & timeouts) : name(name_) { - read_buf = std::make_unique(uri, method, callback, timeouts); + read_buf = std::make_unique(uri, timeouts); reader = FormatFactory::instance().getInput(format, *read_buf, sample_block, context, max_block_size); } @@ -80,7 +70,7 @@ namespace private: String name; - std::unique_ptr read_buf; + std::unique_ptr read_buf; BlockInputStreamPtr reader; }; @@ -94,7 +84,7 @@ namespace const ConnectionTimeouts & timeouts) : sample_block(sample_block_) { - write_buf = std::make_unique(uri, Poco::Net::HTTPRequest::HTTP_POST, timeouts); + write_buf = std::make_unique(uri, timeouts); writer = FormatFactory::instance().getOutput(format, *write_buf, sample_block, context); } @@ -128,45 +118,14 @@ namespace } -std::string IStorageS3Base::getReadMethod() const -{ - return Poco::Net::HTTPRequest::HTTP_GET; -} - -std::vector> IStorageS3Base::getReadURIParams(const Names & /*column_names*/, +BlockInputStreams StorageS3::read(const Names & column_names, const SelectQueryInfo & /*query_info*/, - const Context & /*context*/, - QueryProcessingStage::Enum & /*processed_stage*/, - size_t /*max_block_size*/) const -{ - return {}; -} - -std::function IStorageS3Base::getReadPOSTDataCallback(const Names & /*column_names*/, - const SelectQueryInfo & /*query_info*/, - const Context & /*context*/, - QueryProcessingStage::Enum & /*processed_stage*/, - size_t /*max_block_size*/) const -{ - return nullptr; -} - - -BlockInputStreams IStorageS3Base::read(const Names & column_names, - const SelectQueryInfo & query_info, const Context & context, - QueryProcessingStage::Enum processed_stage, + QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, unsigned /*num_streams*/) { - auto request_uri = uri; - auto params = getReadURIParams(column_names, query_info, context, processed_stage, max_block_size); - for (const auto & [param, value] : params) - request_uri.addQueryParameter(param, value); - - BlockInputStreamPtr block_input = std::make_shared(request_uri, - getReadMethod(), - getReadPOSTDataCallback(column_names, query_info, context, processed_stage, max_block_size), + BlockInputStreamPtr block_input = std::make_shared(uri, format_name, getName(), getHeaderBlock(column_names), @@ -181,9 +140,9 @@ BlockInputStreams IStorageS3Base::read(const Names & column_names, return {std::make_shared(block_input, column_defaults, context)}; } -void IStorageS3Base::rename(const String & /*new_path_to_db*/, const String & /*new_database_name*/, 
const String & /*new_table_name*/) {} +void StorageS3::rename(const String & /*new_path_to_db*/, const String & /*new_database_name*/, const String & /*new_table_name*/) {} -BlockOutputStreamPtr IStorageS3Base::write(const ASTPtr & /*query*/, const Context & /*context*/) +BlockOutputStreamPtr StorageS3::write(const ASTPtr & /*query*/, const Context & /*context*/) { return std::make_shared( uri, format_name, getSampleBlock(), context_global, ConnectionTimeouts::getHTTPTimeouts(context_global)); @@ -191,6 +150,6 @@ BlockOutputStreamPtr IStorageS3Base::write(const ASTPtr & /*query*/, const Conte void registerStorageS3(StorageFactory & /*factory*/) { - // TODO. See #1394. + // TODO. See #1394? } } diff --git a/dbms/src/Storages/StorageS3.h b/dbms/src/Storages/StorageS3.h index 2615563b57c..a38cd717e36 100644 --- a/dbms/src/Storages/StorageS3.h +++ b/dbms/src/Storages/StorageS3.h @@ -8,14 +8,37 @@ namespace DB { /** - * This class represents table engine for external urls. + * This class represents table engine for external S3 urls. * It sends HTTP GET to server when select is called and - * HTTP POST when insert is called. In POST request the data is send - * using Chunked transfer encoding, so server have to support it. + * HTTP PUT when insert is called. */ -class IStorageS3Base : public IStorage +class StorageS3 : public ext::shared_ptr_helper, public IStorage { public: + StorageS3(const Poco::URI & uri_, + const std::string & table_name_, + const String & format_name_, + const ColumnsDescription & columns_, + Context & context_ + ) + : IStorage(columns_) + , uri(uri_) + , context_global(context_) + , format_name(format_name_) + , table_name(table_name_) + { + } + + String getName() const override + { + return "S3"; + } + + Block getHeaderBlock(const Names & /*column_names*/) const + { + return getSampleBlock(); + } + String getTableName() const override { return table_name; @@ -33,56 +56,12 @@ public: void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name) override; protected: - IStorageS3Base(const Poco::URI & uri_, - const Context & context_, - const std::string & table_name_, - const String & format_name_, - const ColumnsDescription & columns_); - Poco::URI uri; const Context & context_global; private: String format_name; String table_name; - - virtual std::string getReadMethod() const; - - virtual std::vector> getReadURIParams(const Names & column_names, - const SelectQueryInfo & query_info, - const Context & context, - QueryProcessingStage::Enum & processed_stage, - size_t max_block_size) const; - - virtual std::function getReadPOSTDataCallback(const Names & column_names, - const SelectQueryInfo & query_info, - const Context & context, - QueryProcessingStage::Enum & processed_stage, - size_t max_block_size) const; - - virtual Block getHeaderBlock(const Names & column_names) const = 0; }; -class StorageS3 : public ext::shared_ptr_helper, public IStorageS3Base -{ -public: - StorageS3(const Poco::URI & uri_, - const std::string & table_name_, - const String & format_name_, - const ColumnsDescription & columns_, - Context & context_) - : IStorageS3Base(uri_, context_, table_name_, format_name_, columns_) - { - } - - String getName() const override - { - return "S3"; - } - - Block getHeaderBlock(const Names & /*column_names*/) const override - { - return getSampleBlock(); - } -}; } From efade38d7b6d1c42f847c0c7613f401b36a28521 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Sat, 1 Jun 2019 21:21:33 +0000 Subject: [PATCH 051/309] 
Fixed test, so it does not pass now. --- .../0_stateless/00950_table_function_s3_wip/clickhouse-test | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test index 09876ea0df6..8140a140474 100755 --- a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test +++ b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test @@ -63,10 +63,11 @@ received_data = [] class PreservingDataServer(http.server.BaseHTTPRequestHandler): - def do_POST(self): + def do_PUT(self): self.send_response(200) self.send_header('Content-type', 'text/plain') self.end_headers() + assert self.headers.get('Content-Length') received_data.append(self.rfile.read()) self.wfile.flush() From 19642cf822efdb6f615ec7ae343d00194a11f3c4 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 10 Jun 2019 00:51:44 +0000 Subject: [PATCH 052/309] Tests update. --- .../0_stateless/00950_table_function_s3_wip/clickhouse-test | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test index 8140a140474..b0132d5d1cd 100755 --- a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test +++ b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test @@ -3,6 +3,7 @@ import http.server import os import subprocess +import sys import threading import unittest @@ -67,6 +68,7 @@ class PreservingDataServer(http.server.BaseHTTPRequestHandler): self.send_response(200) self.send_header('Content-type', 'text/plain') self.end_headers() + print('Content-Length =', self.headers.get('Content-Length'), file=sys.stderr) assert self.headers.get('Content-Length') received_data.append(self.rfile.read()) self.wfile.flush() From 0504eb58becefcde1ce65b319268581dc395b3d6 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 10 Jun 2019 00:52:47 +0000 Subject: [PATCH 053/309] Correct S3 PUT requests in WriteBufferFromS3. --- dbms/src/IO/WriteBufferFromS3.cpp | 6 +++++- dbms/src/IO/WriteBufferFromS3.h | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 94b07a6a8df..52b28821f7f 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -18,11 +18,15 @@ WriteBufferFromS3::WriteBufferFromS3( LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << uri.toString()); - ostr = &session->sendRequest(request); + ostr = &temporary_stream; } void WriteBufferFromS3::finalize() { + const std::string & data = temporary_stream.str(); + request.setContentLength(data.size()); + ostr = &session->sendRequest(request); + *ostr << data; receiveResponse(*session, request, response); /// TODO: Response body is ignored. } diff --git a/dbms/src/IO/WriteBufferFromS3.h b/dbms/src/IO/WriteBufferFromS3.h index ee8b2d1dde9..e43ecffb2f1 100644 --- a/dbms/src/IO/WriteBufferFromS3.h +++ b/dbms/src/IO/WriteBufferFromS3.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -34,6 +35,7 @@ private: HTTPSessionPtr session; Poco::Net::HTTPRequest request; Poco::Net::HTTPResponse response; + std::ostringstream temporary_stream; /// Maybe one shall use some DB:: buffer. 
public: explicit WriteBufferFromS3(const Poco::URI & uri, From 7236ae0d8dcf60297255857759597ffc83c0fd02 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 10 Jun 2019 01:22:43 +0000 Subject: [PATCH 054/309] Fixed 411 error when putting to S3. --- dbms/src/IO/WriteBufferFromS3.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 52b28821f7f..854bffca979 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -13,7 +13,11 @@ WriteBufferFromS3::WriteBufferFromS3( , request{Poco::Net::HTTPRequest::HTTP_PUT, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1} { request.setHost(uri.getHost()); - request.setChunkedTransferEncoding(true); + + // request.setChunkedTransferEncoding(true); + // Chunked transfers require additional logic, see: + // https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-streaming.html + request.setExpectContinue(true); LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << uri.toString()); From ae40d68eb0e425b4e43bf6be1a51c31987aabb24 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 10 Jun 2019 01:22:54 +0000 Subject: [PATCH 055/309] Updated tests. --- .../clickhouse-test | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test index b0132d5d1cd..c896b9fe114 100755 --- a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test +++ b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test @@ -8,17 +8,30 @@ import threading import unittest +# 1) Run Go FakeS3 server. +# go run cmd/gofakes3/main.go -backend memory -host :9990 +# 2) Create a bucket. 
+# curl -X PUT http://localhost:9990/abc/ + format = 'column1 UInt32, column2 UInt32, column3 UInt32' -values = '(1, 2, 3), (2, 3, 1), (78, 43, 45)' +values = '(1, 2, 3), (3, 2, 1), (78, 43, 45)' redirecting_host = '127.0.0.1' redirecting_to_http_port = 12345 redirecting_to_https_port = 12346 preserving_data_port = 12347 +fakes3_port = 9990 +localhost = '127.0.0.1' +bucket = 'abc' + +prepare_put_queries = [ + "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(localhost, fakes3_port, bucket, format, values), +] queries = [ "select *, column1*column2*column3 from file('{}', 'CSV', '{}')".format(os.path.expanduser('~/test.csv'), format), "select *, column1*column2*column3 from url('https://storage.yandexcloud.net/milovidov/test.csv', 'CSV', '{}')".format(format), "select *, column1*column2*column3 from s3('http://storage.yandexcloud.net/milovidov/test.csv', 'CSV', '{}')".format(format), + "select *, column1*column2*column3 from s3('http://{}:{}/{}/test.csv', 'CSV', '{}')".format(localhost, fakes3_port, bucket, format), "select *, column1*column2*column3 from s3('https://storage.yandexcloud.net/milovidov/test.csv', 'CSV', '{}')".format(format), "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(redirecting_host, redirecting_to_http_port, format), "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(redirecting_host, redirecting_to_https_port, format), @@ -97,6 +110,17 @@ jobs.append(threading.Thread(target=redirecting_to_https_thread)) jobs.append(threading.Thread(target=preserving_thread)) [ job.start() for job in jobs ] +for query in prepare_put_queries: + print(query) + result = subprocess.run([ + os.path.expanduser('~/ClickHouse-bin/dbms/programs/clickhouse-local'), + '-c', + os.path.expanduser('~/config.xml'), + '-q', + query + ], stdout=subprocess.PIPE, universal_newlines=True) + result.check_returncode() + for query in queries: print(query) result = subprocess.run([ @@ -123,7 +147,9 @@ for query in put_queries: query ], stdout=subprocess.PIPE, universal_newlines=True) result.check_returncode() - unittest.TestCase().assertEqual(received_data[-1].decode(), '15\r\n1,2,3\n2,3,1\n78,43,45\n\r\n0\r\n\r\n') + unittest.TestCase().assertEqual(received_data[-1].decode(), '1,2,3\n3,2,1\n78,43,45\n') + # In chunked encoding: + # unittest.TestCase().assertEqual(received_data[-1].decode(), '15\r\n1,2,3\n2,3,1\n78,43,45\n\r\n0\r\n\r\n') [ server.socket.close() for server in servers ] [ job.join() for job in jobs ] From cca3a9acc7b69746d7f3d966f0809c87f454b926 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 10 Jun 2019 01:57:37 +0000 Subject: [PATCH 056/309] Added test for PUT redirect. 
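The new RedirectingPreservingDataServer in this test answers the PUT's "Expect: 100-continue" header with a 307 TemporaryRedirect, so a correct client must re-issue the PUT against the Location target without ever having streamed the body to the first endpoint. The client-side loop the test expects (implemented in the following patches) looks roughly like the sketch below; the function name putWithRedirect and the fixed retry count are illustrative assumptions, and only Poco/ClickHouse calls that appear elsewhere in this series are used.

    #include <IO/HTTPCommon.h>
    #include <IO/ConnectionTimeouts.h>
    #include <Poco/Net/HTTPRequest.h>
    #include <Poco/Net/HTTPResponse.h>
    #include <Poco/URI.h>
    #include <string>

    /// Illustrative sketch (not from the patch): PUT with Expect: 100-continue,
    /// following at most one 307 redirect without sending the body twice.
    void putWithRedirect(Poco::URI uri, const std::string & data, const DB::ConnectionTimeouts & timeouts)
    {
        Poco::Net::HTTPResponse response;
        for (int attempt = 0; attempt < 2; ++attempt)
        {
            auto session = DB::makeHTTPSession(uri, timeouts);
            Poco::Net::HTTPRequest request(Poco::Net::HTTPRequest::HTTP_PUT, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1);
            request.setHost(uri.getHost());
            request.setExpectContinue(true);
            request.setContentLength(data.size());

            std::ostream & body = session->sendRequest(request);
            if (session->peekResponse(response))    /// true only for "100 Continue"
                body << data;                       /// body is sent only once the server agrees

            session->receiveResponse(response);
            if (response.getStatus() != 307)        /// 307 = Temporary Redirect
                break;

            auto location = response.find("Location");
            if (location == response.end())
                break;
            uri = Poco::URI(location->second);      /// retry against the redirect target
        }
        /// Error handling (assertResponseIsOk) omitted in this sketch.
    }

Relying on 100-continue keeps the redirect cheap: only headers travel to the redirecting endpoint, and the payload goes out once, to the final target.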
--- .../clickhouse-test | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test index c896b9fe114..554effe027b 100755 --- a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test +++ b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test @@ -15,10 +15,12 @@ import unittest format = 'column1 UInt32, column2 UInt32, column3 UInt32' values = '(1, 2, 3), (3, 2, 1), (78, 43, 45)' +other_values = '(1, 1, 1), (1, 1, 1), (11, 11, 11)' redirecting_host = '127.0.0.1' redirecting_to_http_port = 12345 redirecting_to_https_port = 12346 preserving_data_port = 12347 +redirecting_preserving_data_port = 12348 fakes3_port = 9990 localhost = '127.0.0.1' bucket = 'abc' @@ -40,6 +42,12 @@ queries = [ put_queries = [ "insert into table function s3('http://{}:{}/', 'CSV', '{}') values {}" .format(redirecting_host, preserving_data_port, format, values), + "insert into table function s3('http://{}:{}/', 'CSV', '{}') values {}" + .format(redirecting_host, redirecting_preserving_data_port, format, other_values), +] + +check_queries = [ + "select *, column1*column2*column3 from s3('http://{}:{}/{}/test.csv', 'CSV', '{}')".format(localhost, fakes3_port, bucket, format), ] @@ -77,12 +85,49 @@ received_data = [] class PreservingDataServer(http.server.BaseHTTPRequestHandler): + protocol_version = 'HTTP/1.1' + + def handle_expect_100(self): + print('Received Expect-100', file=sys.stderr) + return True + def do_PUT(self): self.send_response(200) self.send_header('Content-type', 'text/plain') self.end_headers() print('Content-Length =', self.headers.get('Content-Length'), file=sys.stderr) assert self.headers.get('Content-Length') + assert self.headers['Expect'] == '100-continue' + received_data.append(self.rfile.read()) + self.wfile.flush() + + +class RedirectingPreservingDataServer(http.server.BaseHTTPRequestHandler): + protocol_version = 'HTTP/1.1' + + def handle_expect_100(self): + print('Received Expect-100', file=sys.stderr) + self.send_response(307) + self.send_header('Content-type', 'text/xml') + self.send_header('Location', 'http://{}:{}/{}/test.csv'.format(localhost, fakes3_port, bucket)) + self.end_headers() + self.wfile.write(bytes(r''' + + TemporaryRedirect + Please re-send this request to the specified temporary endpoint. + Continue to use the original request endpoint for future requests. 
+ johnsmith.s3-gztb4pa9sq.amazonaws.com +''', "utf-8")) + return False + + def do_PUT(self): + assert False + self.send_response(200) + self.send_header('Content-type', 'text/plain') + self.end_headers() + print('Content-Length =', self.headers.get('Content-Length'), file=sys.stderr) + assert self.headers.get('Content-Length') + assert self.headers['Expect'] == '100-continue' received_data.append(self.rfile.read()) self.wfile.flush() @@ -103,11 +148,17 @@ def preserving_thread(): servers.append(server) server.handle_request() +def redirecting_preserving_thread(): + server = http.server.HTTPServer((redirecting_host, redirecting_preserving_data_port), RedirectingPreservingDataServer) + servers.append(server) + server.handle_request() + jobs = [] jobs.append(threading.Thread(target=redirecting_to_http_thread)) jobs.append(threading.Thread(target=redirecting_to_https_thread)) jobs.append(threading.Thread(target=preserving_thread)) +jobs.append(threading.Thread(target=redirecting_preserving_thread)) [ job.start() for job in jobs ] for query in prepare_put_queries: @@ -151,5 +202,21 @@ for query in put_queries: # In chunked encoding: # unittest.TestCase().assertEqual(received_data[-1].decode(), '15\r\n1,2,3\n2,3,1\n78,43,45\n\r\n0\r\n\r\n') +for query in check_queries: + print(query) + result = subprocess.run([ + os.path.expanduser('~/ClickHouse-bin/dbms/programs/clickhouse-local'), + '-c', + os.path.expanduser('~/config.xml'), + '-q', + query + ], stdout=subprocess.PIPE, universal_newlines=True) + result.check_returncode() + unittest.TestCase().assertEqual(list(map(str.split, result.stdout.splitlines())), [ + ['1', '1', '1', '1'], + ['1', '1', '1', '1'], + ['11', '11', '11', '1331'], + ]) + [ server.socket.close() for server in servers ] [ job.join() for job in jobs ] From ff691129190233e50523c935634bb277bd5653e0 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 10 Jun 2019 02:35:33 +0000 Subject: [PATCH 057/309] Attempt to make S3 PUT redirects (wip). --- dbms/src/IO/ReadBufferFromS3.cpp | 6 ++- dbms/src/IO/ReadBufferFromS3.h | 4 -- dbms/src/IO/WriteBufferFromS3.cpp | 66 ++++++++++++++++++++++--------- dbms/src/IO/WriteBufferFromS3.h | 10 ++--- 4 files changed, 57 insertions(+), 29 deletions(-) diff --git a/dbms/src/IO/ReadBufferFromS3.cpp b/dbms/src/IO/ReadBufferFromS3.cpp index aa056191988..f6061c3a8c7 100644 --- a/dbms/src/IO/ReadBufferFromS3.cpp +++ b/dbms/src/IO/ReadBufferFromS3.cpp @@ -3,6 +3,8 @@ #include +#define DEFAULT_S3_MAX_FOLLOW_GET_REDIRECT 2 + namespace DB { @@ -13,12 +15,12 @@ ReadBufferFromS3::ReadBufferFromS3(Poco::URI uri_, : ReadBuffer(nullptr, 0) , uri {uri_} , method {Poco::Net::HTTPRequest::HTTP_GET} - , session(makeHTTPSession(uri_, timeouts)) + , session {makeHTTPSession(uri_, timeouts)} { Poco::Net::HTTPResponse response; std::unique_ptr request; - for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_REDIRECT; ++i) + for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_GET_REDIRECT; ++i) { // With empty path poco will send "POST HTTP/1.1" its bug. if (uri.getPath().empty()) diff --git a/dbms/src/IO/ReadBufferFromS3.h b/dbms/src/IO/ReadBufferFromS3.h index bada3f76252..ec53a24c5a6 100644 --- a/dbms/src/IO/ReadBufferFromS3.h +++ b/dbms/src/IO/ReadBufferFromS3.h @@ -20,10 +20,6 @@ #include -#define DEFAULT_S3_READ_BUFFER_TIMEOUT 1800 -#define DEFAULT_S3_READ_BUFFER_CONNECTION_TIMEOUT 1 -#define DEFAULT_S3_MAX_FOLLOW_REDIRECT 2 - namespace DB { /** Perform S3 HTTP GET request and provide response to read. 
diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 854bffca979..01a1c03bc05 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -3,36 +3,66 @@ #include +#define DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT 2 + namespace DB { WriteBufferFromS3::WriteBufferFromS3( - const Poco::URI & uri, const ConnectionTimeouts & timeouts, size_t buffer_size_) + const Poco::URI & uri_, const ConnectionTimeouts & timeouts_, + const Poco::Net::HTTPBasicCredentials & credentials_, size_t buffer_size_) : WriteBufferFromOStream(buffer_size_) - , session{makeHTTPSession(uri, timeouts)} - , request{Poco::Net::HTTPRequest::HTTP_PUT, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1} + , uri {uri_} + , timeouts {timeouts_} + , credentials {credentials_} + , session {makeHTTPSession(uri_, timeouts_)} { - request.setHost(uri.getHost()); - - // request.setChunkedTransferEncoding(true); - // Chunked transfers require additional logic, see: - // https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-streaming.html - - request.setExpectContinue(true); - - LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << uri.toString()); - ostr = &temporary_stream; } void WriteBufferFromS3::finalize() { const std::string & data = temporary_stream.str(); - request.setContentLength(data.size()); - ostr = &session->sendRequest(request); - *ostr << data; - receiveResponse(*session, request, response); - /// TODO: Response body is ignored. + + std::unique_ptr request; + for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT; ++i) + { + request = std::make_unique(Poco::Net::HTTPRequest::HTTP_PUT, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); + request->setHost(uri.getHost()); // use original, not resolved host name in header + + if (!credentials.getUsername().empty()) + credentials.authenticate(*request); + + // request.setChunkedTransferEncoding(true); + // Chunked transfers require additional logic, see: + // https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-streaming.html + + request->setExpectContinue(true); + + request->setContentLength(data.size()); + + LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << uri.toString()); + + ostr = &session->sendRequest(*request); + if (session->peekResponse(response)) + { + // Received 100-continue. + *ostr << data; + } + + istr = &session->receiveResponse(response); + + if (response.getStatus() != 307) + break; + + auto location_iterator = response.find("Location"); + if (location_iterator == response.end()) + break; + + uri = location_iterator->second; + session = makeHTTPSession(uri, timeouts); + } + assertResponseIsOk(*request, response, istr); } } diff --git a/dbms/src/IO/WriteBufferFromS3.h b/dbms/src/IO/WriteBufferFromS3.h index e43ecffb2f1..3c2c343b291 100644 --- a/dbms/src/IO/WriteBufferFromS3.h +++ b/dbms/src/IO/WriteBufferFromS3.h @@ -21,10 +21,6 @@ #include -#define DEFAULT_S3_READ_BUFFER_TIMEOUT 1800 -#define DEFAULT_S3_READ_BUFFER_CONNECTION_TIMEOUT 1 -#define DEFAULT_S3_MAX_FOLLOW_REDIRECT 2 - namespace DB { /* Perform S3 HTTP PUT request. @@ -32,14 +28,18 @@ namespace DB class WriteBufferFromS3 : public WriteBufferFromOStream { private: + Poco::URI uri; + ConnectionTimeouts timeouts; + const Poco::Net::HTTPBasicCredentials & credentials; HTTPSessionPtr session; - Poco::Net::HTTPRequest request; + std::istream * istr; /// owned by session Poco::Net::HTTPResponse response; std::ostringstream temporary_stream; /// Maybe one shall use some DB:: buffer. 
public: explicit WriteBufferFromS3(const Poco::URI & uri, const ConnectionTimeouts & timeouts = {}, + const Poco::Net::HTTPBasicCredentials & credentials = {}, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE); /// Receives response from the server after sending all data. From 634f82d0ad8ac9ba631ff15771598ae41075235e Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 10 Jun 2019 23:33:43 +0000 Subject: [PATCH 058/309] Minor fix. --- dbms/src/IO/WriteBufferFromS3.cpp | 8 +++++--- dbms/src/IO/WriteBufferFromS3.h | 3 --- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 01a1c03bc05..c0d891ebf6d 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -15,7 +15,6 @@ WriteBufferFromS3::WriteBufferFromS3( , uri {uri_} , timeouts {timeouts_} , credentials {credentials_} - , session {makeHTTPSession(uri_, timeouts_)} { ostr = &temporary_stream; } @@ -24,9 +23,13 @@ void WriteBufferFromS3::finalize() { const std::string & data = temporary_stream.str(); + Poco::Net::HTTPResponse response; std::unique_ptr request; + HTTPSessionPtr session; + std::istream * istr; /// owned by session for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT; ++i) { + session = makeHTTPSession(uri, timeouts); request = std::make_unique(Poco::Net::HTTPRequest::HTTP_PUT, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); request->setHost(uri.getHost()); // use original, not resolved host name in header @@ -44,7 +47,7 @@ void WriteBufferFromS3::finalize() LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << uri.toString()); ostr = &session->sendRequest(*request); - if (session->peekResponse(response)) +// if (session->peekResponse(response)) { // Received 100-continue. *ostr << data; @@ -60,7 +63,6 @@ void WriteBufferFromS3::finalize() break; uri = location_iterator->second; - session = makeHTTPSession(uri, timeouts); } assertResponseIsOk(*request, response, istr); } diff --git a/dbms/src/IO/WriteBufferFromS3.h b/dbms/src/IO/WriteBufferFromS3.h index 3c2c343b291..13815f4acb9 100644 --- a/dbms/src/IO/WriteBufferFromS3.h +++ b/dbms/src/IO/WriteBufferFromS3.h @@ -31,9 +31,6 @@ private: Poco::URI uri; ConnectionTimeouts timeouts; const Poco::Net::HTTPBasicCredentials & credentials; - HTTPSessionPtr session; - std::istream * istr; /// owned by session - Poco::Net::HTTPResponse response; std::ostringstream temporary_stream; /// Maybe one shall use some DB:: buffer. public: From 63164db01964eb5232ce218ed63d20135993747f Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Tue, 11 Jun 2019 00:07:20 +0000 Subject: [PATCH 059/309] Fixed S3 PUT redirects. 
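Besides re-resolving the target per attempt, this patch stops storing a const Poco::Net::HTTPBasicCredentials & member. A plausible reason, inferred from the diff rather than stated in it: with the defaulted credentials argument, that reference binds to a temporary which is gone by the time finalize() runs. The replacement pattern is to copy the Authorization header into an owned HTTPRequest (auth_request) once, and rebuild credentials from it for every redirect attempt. The sketch below isolates that pattern; the class name AuthKeeper is invented for illustration.

    #include <Poco/Net/HTTPBasicCredentials.h>
    #include <Poco/Net/HTTPRequest.h>

    /// Illustration only: own the Authorization header instead of referencing the
    /// caller's (possibly temporary) HTTPBasicCredentials object.
    class AuthKeeper
    {
        Poco::Net::HTTPRequest auth_request{Poco::Net::HTTPRequest::HTTP_PUT, "/", Poco::Net::HTTPRequest::HTTP_1_1};

    public:
        explicit AuthKeeper(const Poco::Net::HTTPBasicCredentials & credentials)
        {
            if (!credentials.getUsername().empty())
                credentials.authenticate(auth_request);    /// stores "Authorization: Basic ..." inside auth_request
        }

        void apply(Poco::Net::HTTPRequest & outgoing) const
        {
            if (auth_request.hasCredentials())
            {
                Poco::Net::HTTPBasicCredentials per_attempt(auth_request);    /// parse the stored header back
                per_attempt.authenticate(outgoing);
            }
        }
    };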
--- dbms/src/IO/WriteBufferFromS3.cpp | 11 ++++++++--- dbms/src/IO/WriteBufferFromS3.h | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index c0d891ebf6d..7f84edf11b9 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -10,13 +10,15 @@ namespace DB WriteBufferFromS3::WriteBufferFromS3( const Poco::URI & uri_, const ConnectionTimeouts & timeouts_, - const Poco::Net::HTTPBasicCredentials & credentials_, size_t buffer_size_) + const Poco::Net::HTTPBasicCredentials & credentials, size_t buffer_size_) : WriteBufferFromOStream(buffer_size_) , uri {uri_} , timeouts {timeouts_} - , credentials {credentials_} + , auth_request {Poco::Net::HTTPRequest::HTTP_PUT, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1} { ostr = &temporary_stream; + if (!credentials.getUsername().empty()) + credentials.authenticate(auth_request); } void WriteBufferFromS3::finalize() @@ -26,6 +28,7 @@ void WriteBufferFromS3::finalize() Poco::Net::HTTPResponse response; std::unique_ptr request; HTTPSessionPtr session; + std::istream * istr; /// owned by session for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT; ++i) { @@ -33,8 +36,10 @@ void WriteBufferFromS3::finalize() request = std::make_unique(Poco::Net::HTTPRequest::HTTP_PUT, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); request->setHost(uri.getHost()); // use original, not resolved host name in header - if (!credentials.getUsername().empty()) + if (auth_request.hasCredentials()) { + Poco::Net::HTTPBasicCredentials credentials(auth_request); credentials.authenticate(*request); + } // request.setChunkedTransferEncoding(true); // Chunked transfers require additional logic, see: diff --git a/dbms/src/IO/WriteBufferFromS3.h b/dbms/src/IO/WriteBufferFromS3.h index 13815f4acb9..3f95529d74a 100644 --- a/dbms/src/IO/WriteBufferFromS3.h +++ b/dbms/src/IO/WriteBufferFromS3.h @@ -30,7 +30,7 @@ class WriteBufferFromS3 : public WriteBufferFromOStream private: Poco::URI uri; ConnectionTimeouts timeouts; - const Poco::Net::HTTPBasicCredentials & credentials; + Poco::Net::HTTPRequest auth_request; std::ostringstream temporary_stream; /// Maybe one shall use some DB:: buffer. public: From d3db5a38902d11bdec6d7335f4bec1804c5868c9 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Fri, 14 Jun 2019 15:56:25 +0300 Subject: [PATCH 060/309] Update TableFunctionS3.h --- dbms/src/TableFunctions/TableFunctionS3.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/TableFunctions/TableFunctionS3.h b/dbms/src/TableFunctions/TableFunctionS3.h index 83c49e0b8d1..04826a01d9b 100644 --- a/dbms/src/TableFunctions/TableFunctionS3.h +++ b/dbms/src/TableFunctions/TableFunctionS3.h @@ -7,7 +7,7 @@ namespace DB { -/* url(source, format, structure) - creates a temporary storage from url +/* s3(source, format, structure) - creates a temporary storage for a file in S3 */ class TableFunctionS3 : public ITableFunctionFileLike { From 6b9397e805f056dfb64b54e188268d82d7bf045b Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 17 Jun 2019 01:10:49 +0300 Subject: [PATCH 061/309] Tests fixes. 
--- .../clickhouse-test | 18 +++++++++--------- .../00950_table_function_s3_wip/config.xml | 4 ++-- .../00950_table_function_s3_wip/test.csv | 3 +++ 3 files changed, 14 insertions(+), 11 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/00950_table_function_s3_wip/test.csv diff --git a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test index 554effe027b..39c62f835a5 100755 --- a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test +++ b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test @@ -8,11 +8,11 @@ import threading import unittest -# 1) Run Go FakeS3 server. -# go run cmd/gofakes3/main.go -backend memory -host :9990 -# 2) Create a bucket. -# curl -X PUT http://localhost:9990/abc/ +# Run Go FakeS3 server. +# go run cmd/gofakes3/main.go -backend memory -host :9990 -initialbucket abc +config = os.path.join(os.path.dirname(sys.argv[0]), 'config.xml') +test_csv = os.path.join(os.path.dirname(sys.argv[0]), 'test.csv') format = 'column1 UInt32, column2 UInt32, column3 UInt32' values = '(1, 2, 3), (3, 2, 1), (78, 43, 45)' other_values = '(1, 1, 1), (1, 1, 1), (11, 11, 11)' @@ -30,7 +30,7 @@ prepare_put_queries = [ ] queries = [ - "select *, column1*column2*column3 from file('{}', 'CSV', '{}')".format(os.path.expanduser('~/test.csv'), format), + "select *, column1*column2*column3 from file('{}', 'CSV', '{}')".format(test_csv, format), "select *, column1*column2*column3 from url('https://storage.yandexcloud.net/milovidov/test.csv', 'CSV', '{}')".format(format), "select *, column1*column2*column3 from s3('http://storage.yandexcloud.net/milovidov/test.csv', 'CSV', '{}')".format(format), "select *, column1*column2*column3 from s3('http://{}:{}/{}/test.csv', 'CSV', '{}')".format(localhost, fakes3_port, bucket, format), @@ -166,7 +166,7 @@ for query in prepare_put_queries: result = subprocess.run([ os.path.expanduser('~/ClickHouse-bin/dbms/programs/clickhouse-local'), '-c', - os.path.expanduser('~/config.xml'), + config, '-q', query ], stdout=subprocess.PIPE, universal_newlines=True) @@ -177,7 +177,7 @@ for query in queries: result = subprocess.run([ os.path.expanduser('~/ClickHouse-bin/dbms/programs/clickhouse-local'), '-c', - os.path.expanduser('~/config.xml'), + config, '-q', query ], stdout=subprocess.PIPE, universal_newlines=True) @@ -193,7 +193,7 @@ for query in put_queries: result = subprocess.run([ os.path.expanduser('~/ClickHouse-bin/dbms/programs/clickhouse-local'), '-c', - os.path.expanduser('~/config.xml'), + config, '-q', query ], stdout=subprocess.PIPE, universal_newlines=True) @@ -207,7 +207,7 @@ for query in check_queries: result = subprocess.run([ os.path.expanduser('~/ClickHouse-bin/dbms/programs/clickhouse-local'), '-c', - os.path.expanduser('~/config.xml'), + config, '-q', query ], stdout=subprocess.PIPE, universal_newlines=True) diff --git a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/config.xml b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/config.xml index de0dd0a7087..7675c696456 100644 --- a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/config.xml +++ b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/config.xml @@ -2,8 +2,8 @@ default trace - /home/excitoon/clickhouse-server.log - /home/excitoon/clickhouse-server.err.log + ~/clickhouse-server.log + ~/clickhouse-server.err.log 1000M 10 diff --git a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/test.csv 
b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/test.csv new file mode 100644 index 00000000000..a2325127dec --- /dev/null +++ b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/test.csv @@ -0,0 +1,3 @@ +1,2,3 +3,2,1 +78,43,45 From dd32c92f2a97b2f778e6a7a32610f3e23956552f Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 17 Jun 2019 03:06:14 +0300 Subject: [PATCH 062/309] Minor fixes. --- dbms/src/IO/WriteBufferFromS3.cpp | 37 ++++++++++++++++++++++++++----- dbms/src/IO/WriteBufferFromS3.h | 13 +++++++---- dbms/src/Storages/StorageS3.cpp | 23 ++++++++++++++++--- 3 files changed, 60 insertions(+), 13 deletions(-) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 7f84edf11b9..49862e8c8aa 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -11,19 +11,31 @@ namespace DB WriteBufferFromS3::WriteBufferFromS3( const Poco::URI & uri_, const ConnectionTimeouts & timeouts_, const Poco::Net::HTTPBasicCredentials & credentials, size_t buffer_size_) - : WriteBufferFromOStream(buffer_size_) + : BufferWithOwnMemory(buffer_size_, nullptr, 0) , uri {uri_} , timeouts {timeouts_} , auth_request {Poco::Net::HTTPRequest::HTTP_PUT, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1} + , temporary_buffer {buffer_string} { - ostr = &temporary_stream; if (!credentials.getUsername().empty()) credentials.authenticate(auth_request); } + +void WriteBufferFromS3::nextImpl() +{ + if (!offset()) + return; + + temporary_buffer.write(working_buffer.begin(), offset()); +} + + void WriteBufferFromS3::finalize() { - const std::string & data = temporary_stream.str(); + temporary_buffer.finish(); + + const String & data = buffer_string; Poco::Net::HTTPResponse response; std::unique_ptr request; @@ -51,11 +63,11 @@ void WriteBufferFromS3::finalize() LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << uri.toString()); - ostr = &session->sendRequest(*request); -// if (session->peekResponse(response)) + std::ostream & ostr = session->sendRequest(*request); +// if (session->peekResponse(response)) // FIXME, shall not go next if not received 100-continue { // Received 100-continue. - *ostr << data; + ostr << data; } istr = &session->receiveResponse(response); @@ -72,4 +84,17 @@ void WriteBufferFromS3::finalize() assertResponseIsOk(*request, response, istr); } + +WriteBufferFromS3::~WriteBufferFromS3() +{ + try + { + next(); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } +} + } diff --git a/dbms/src/IO/WriteBufferFromS3.h b/dbms/src/IO/WriteBufferFromS3.h index 3f95529d74a..58ffcfdd4b9 100644 --- a/dbms/src/IO/WriteBufferFromS3.h +++ b/dbms/src/IO/WriteBufferFromS3.h @@ -2,14 +2,14 @@ #include #include -#include #include #include #include +#include #include #include #include -#include +#include #include #include #include @@ -25,13 +25,14 @@ namespace DB { /* Perform S3 HTTP PUT request. */ -class WriteBufferFromS3 : public WriteBufferFromOStream +class WriteBufferFromS3 : public BufferWithOwnMemory { private: Poco::URI uri; ConnectionTimeouts timeouts; Poco::Net::HTTPRequest auth_request; - std::ostringstream temporary_stream; /// Maybe one shall use some DB:: buffer. 
+ String buffer_string; + DB::WriteBufferFromString temporary_buffer; public: explicit WriteBufferFromS3(const Poco::URI & uri, @@ -39,8 +40,12 @@ public: const Poco::Net::HTTPBasicCredentials & credentials = {}, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE); + void nextImpl() override; + /// Receives response from the server after sending all data. void finalize(); + + ~WriteBufferFromS3(); }; } diff --git a/dbms/src/Storages/StorageS3.cpp b/dbms/src/Storages/StorageS3.cpp index f15e19d9396..474d603878e 100644 --- a/dbms/src/Storages/StorageS3.cpp +++ b/dbms/src/Storages/StorageS3.cpp @@ -133,7 +133,6 @@ BlockInputStreams StorageS3::read(const Names & column_names, max_block_size, ConnectionTimeouts::getHTTPTimeouts(context)); - auto column_defaults = getColumns().getDefaults(); if (column_defaults.empty()) return {block_input}; @@ -148,8 +147,26 @@ BlockOutputStreamPtr StorageS3::write(const ASTPtr & /*query*/, const Context & uri, format_name, getSampleBlock(), context_global, ConnectionTimeouts::getHTTPTimeouts(context_global)); } -void registerStorageS3(StorageFactory & /*factory*/) +void registerStorageS3(StorageFactory & factory) { - // TODO. See #1394? + factory.registerStorage("S3", [](const StorageFactory::Arguments & args) + { + ASTs & engine_args = args.engine_args; + + if (!(engine_args.size() == 1 || engine_args.size() == 2)) + throw Exception( + "Storage S3 requires exactly 2 arguments: url and name of used format.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + + engine_args[0] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[0], args.local_context); + + String url = engine_args[0]->as().value.safeGet(); + Poco::URI uri(url); + + engine_args[1] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[1], args.local_context); + + String format_name = engine_args[1]->as().value.safeGet(); + + return StorageS3::create(uri, args.table_name, format_name, args.columns, args.context); + }); } } From 10a7f80af14562002738eb42d4b45031226818dc Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 17 Jun 2019 03:42:47 +0300 Subject: [PATCH 063/309] Stubs for multipart uploads. 
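Note: these stubs follow the standard S3 multipart upload flow referenced by the AWS links in the diff: POST <key>?uploads returns an UploadId, each part is sent with PUT <key>?partNumber=N&uploadId=..., and the upload is finished with a POST listing every (PartNumber, ETag) pair. For orientation only, a minimal sketch of that request sequence in plain Python follows; the host, bucket and key are placeholders, no S3 authentication or error handling is shown, and none of this is part of the patch itself.

    # Sketch of the multipart upload flow (illustrative; unauthenticated, placeholder endpoint).
    import http.client
    import re

    conn = http.client.HTTPConnection("s3.example.com")

    # 1. Initiate: POST <key>?uploads returns an XML body containing <UploadId>.
    conn.request("POST", "/bucket/key?uploads")
    resp = conn.getresponse()
    upload_id = re.search(r"<UploadId>(.*?)</UploadId>", resp.read().decode()).group(1)

    # 2. Upload parts: PUT <key>?partNumber=N&uploadId=...; collect the ETag of each part.
    etags = []
    for number, part in enumerate([b"first part", b"second part"], start=1):
        conn.request("PUT", "/bucket/key?partNumber={}&uploadId={}".format(number, upload_id),
                     body=part)
        resp = conn.getresponse()
        etags.append(resp.getheader("ETag"))
        resp.read()  # drain the body so the connection can be reused

    # 3. Complete: POST an XML list of (PartNumber, ETag) pairs for the upload id.
    parts = "".join("<Part><PartNumber>{}</PartNumber><ETag>{}</ETag></Part>".format(n, t)
                    for n, t in enumerate(etags, start=1))
    conn.request("POST", "/bucket/key?uploadId={}".format(upload_id),
                 body="<CompleteMultipartUpload>{}</CompleteMultipartUpload>".format(parts))
    conn.getresponse().read()
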
--- dbms/src/IO/WriteBufferFromS3.cpp | 66 ++++++++++++++++++++++++------- dbms/src/IO/WriteBufferFromS3.h | 11 +++++- 2 files changed, 62 insertions(+), 15 deletions(-) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 49862e8c8aa..0d48d239e83 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -4,6 +4,7 @@ #define DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT 2 +#define DEFAULT_S3_MINIMUM_PART_SIZE 100'000'000 namespace DB { @@ -15,10 +16,14 @@ WriteBufferFromS3::WriteBufferFromS3( , uri {uri_} , timeouts {timeouts_} , auth_request {Poco::Net::HTTPRequest::HTTP_PUT, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1} - , temporary_buffer {buffer_string} + , temporary_buffer {std::make_unique(buffer_string)} + , part_number {0} + , last_part_size {0} { if (!credentials.getUsername().empty()) credentials.authenticate(auth_request); + + initiate(); } @@ -27,16 +32,55 @@ void WriteBufferFromS3::nextImpl() if (!offset()) return; - temporary_buffer.write(working_buffer.begin(), offset()); + temporary_buffer->write(working_buffer.begin(), offset()); + + last_part_size += offset(); + + if (last_part_size > DEFAULT_S3_MINIMUM_PART_SIZE) + { + temporary_buffer->finish(); + writePart(buffer_string); + last_part_size = 0; + temporary_buffer = std::make_unique(buffer_string); + } } void WriteBufferFromS3::finalize() { - temporary_buffer.finish(); + temporary_buffer->finish(); + if (!buffer_string.empty()) + { + writePart(buffer_string); + } - const String & data = buffer_string; + complete(); +} + +WriteBufferFromS3::~WriteBufferFromS3() +{ + try + { + next(); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } +} + + +void WriteBufferFromS3::initiate() +{ + // FIXME POST ?uploads + // https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadInitiate.html +} + +void WriteBufferFromS3::writePart(const String & data) +{ + // FIXME PUT ?partNumber=PartNumber&uploadId=UploadId + // https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadUploadPart.html Poco::Net::HTTPResponse response; std::unique_ptr request; HTTPSessionPtr session; @@ -44,7 +88,7 @@ void WriteBufferFromS3::finalize() std::istream * istr; /// owned by session for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT; ++i) { - session = makeHTTPSession(uri, timeouts); + session = makeHTTPSession(uri, timeouts); // FIXME apply part number to URI request = std::make_unique(Poco::Net::HTTPRequest::HTTP_PUT, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); request->setHost(uri.getHost()); // use original, not resolved host name in header @@ -85,16 +129,10 @@ void WriteBufferFromS3::finalize() } -WriteBufferFromS3::~WriteBufferFromS3() +void WriteBufferFromS3::complete() { - try - { - next(); - } - catch (...) 
- { - tryLogCurrentException(__PRETTY_FUNCTION__); - } + // FIXME POST ?uploads + // https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadComplete.html } } diff --git a/dbms/src/IO/WriteBufferFromS3.h b/dbms/src/IO/WriteBufferFromS3.h index 58ffcfdd4b9..513ab167be5 100644 --- a/dbms/src/IO/WriteBufferFromS3.h +++ b/dbms/src/IO/WriteBufferFromS3.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -32,7 +33,10 @@ private: ConnectionTimeouts timeouts; Poco::Net::HTTPRequest auth_request; String buffer_string; - DB::WriteBufferFromString temporary_buffer; + std::unique_ptr temporary_buffer; + size_t part_number; + size_t last_part_size; + std::vector part_tags; public: explicit WriteBufferFromS3(const Poco::URI & uri, @@ -46,6 +50,11 @@ public: void finalize(); ~WriteBufferFromS3(); + +private: + void initiate(); + void writePart(const String & data); + void complete(); }; } From 248e26d59f383b98adb35aa736e53c0bfd20fddf Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 17 Jun 2019 10:16:43 +0300 Subject: [PATCH 064/309] Attempt to make multipart uploads. --- dbms/src/IO/WriteBufferFromS3.cpp | 163 ++++++++++++++++++++++++++---- dbms/src/IO/WriteBufferFromS3.h | 1 + 2 files changed, 145 insertions(+), 19 deletions(-) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 0d48d239e83..39f840f14c1 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -1,5 +1,13 @@ #include +#include + +#include +#include +#include +#include +#include + #include @@ -9,6 +17,12 @@ namespace DB { +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; +} + + WriteBufferFromS3::WriteBufferFromS3( const Poco::URI & uri_, const ConnectionTimeouts & timeouts_, const Poco::Net::HTTPBasicCredentials & credentials, size_t buffer_size_) @@ -17,7 +31,7 @@ WriteBufferFromS3::WriteBufferFromS3( , timeouts {timeouts_} , auth_request {Poco::Net::HTTPRequest::HTTP_PUT, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1} , temporary_buffer {std::make_unique(buffer_string)} - , part_number {0} + , part_number {1} , last_part_size {0} { if (!credentials.getUsername().empty()) @@ -41,7 +55,7 @@ void WriteBufferFromS3::nextImpl() temporary_buffer->finish(); writePart(buffer_string); last_part_size = 0; - temporary_buffer = std::make_unique(buffer_string); + temporary_buffer = std::make_unique(buffer_string); } } @@ -52,6 +66,7 @@ void WriteBufferFromS3::finalize() if (!buffer_string.empty()) { writePart(buffer_string); + ++part_number; } complete(); @@ -73,39 +88,87 @@ WriteBufferFromS3::~WriteBufferFromS3() void WriteBufferFromS3::initiate() { - // FIXME POST ?uploads // https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadInitiate.html -} - -void WriteBufferFromS3::writePart(const String & data) -{ - // FIXME PUT ?partNumber=PartNumber&uploadId=UploadId - // https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadUploadPart.html Poco::Net::HTTPResponse response; std::unique_ptr request; HTTPSessionPtr session; - std::istream * istr; /// owned by session + Poco::URI initiate_uri = uri; + initiate_uri.setRawQuery("uploads"); // FIXME find how to leave user params as is + for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT; ++i) { - session = makeHTTPSession(uri, timeouts); // FIXME apply part number to URI - request = std::make_unique(Poco::Net::HTTPRequest::HTTP_PUT, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); - request->setHost(uri.getHost()); // use original, not resolved host name in header + 
session = makeHTTPSession(initiate_uri, timeouts); + request = std::make_unique(Poco::Net::HTTPRequest::HTTP_POST, initiate_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); + request->setHost(initiate_uri.getHost()); // use original, not resolved host name in header if (auth_request.hasCredentials()) { Poco::Net::HTTPBasicCredentials credentials(auth_request); credentials.authenticate(*request); } - // request.setChunkedTransferEncoding(true); - // Chunked transfers require additional logic, see: - // https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-streaming.html + request->setContentLength(0); + + LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << initiate_uri.toString()); + + session->sendRequest(*request); + + istr = &session->receiveResponse(response); + + if (response.getStatus() != 307) + break; + + auto location_iterator = response.find("Location"); + if (location_iterator == response.end()) + break; + + initiate_uri = location_iterator->second; + } + assertResponseIsOk(*request, response, istr); + + Poco::XML::InputSource src(*istr); + Poco::XML::DOMParser parser; + Poco::AutoPtr document = parser.parse(&src); + Poco::AutoPtr nodes = document->getElementsByTagName("UploadId"); + if (nodes->length() != 1) + { + throw Exception("Incorrect XML in response, no upload id", ErrorCodes::INCORRECT_DATA); + } + upload_id = nodes->item(0)->innerText(); + if (upload_id.empty()) + { + throw Exception("Incorrect XML in response, empty upload id", ErrorCodes::INCORRECT_DATA); + } +} + + +void WriteBufferFromS3::writePart(const String & data) +{ + // https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadUploadPart.html + Poco::Net::HTTPResponse response; + std::unique_ptr request; + HTTPSessionPtr session; + std::istream * istr; /// owned by session + Poco::URI part_uri = uri; + part_uri.addQueryParameter("partNumber", std::to_string(part_number)); + part_uri.addQueryParameter("uploadId", upload_id); + + for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT; ++i) + { + session = makeHTTPSession(part_uri, timeouts); + request = std::make_unique(Poco::Net::HTTPRequest::HTTP_PUT, part_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); + request->setHost(part_uri.getHost()); // use original, not resolved host name in header + + if (auth_request.hasCredentials()) { + Poco::Net::HTTPBasicCredentials credentials(auth_request); + credentials.authenticate(*request); + } request->setExpectContinue(true); request->setContentLength(data.size()); - LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << uri.toString()); + LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << part_uri.toString()); std::ostream & ostr = session->sendRequest(*request); // if (session->peekResponse(response)) // FIXME, shall not go next if not received 100-continue @@ -123,16 +186,78 @@ void WriteBufferFromS3::writePart(const String & data) if (location_iterator == response.end()) break; - uri = location_iterator->second; + part_uri = location_iterator->second; } assertResponseIsOk(*request, response, istr); + + auto etag_iterator = response.find("ETag"); + if (etag_iterator == response.end()) + { + throw Exception("Incorrect response, no ETag", ErrorCodes::INCORRECT_DATA); + } + part_tags.push_back(etag_iterator->second); } void WriteBufferFromS3::complete() { - // FIXME POST ?uploads // https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadComplete.html + Poco::Net::HTTPResponse response; + std::unique_ptr request; + HTTPSessionPtr session; + 
std::istream * istr; /// owned by session + Poco::URI complete_uri = uri; + complete_uri.addQueryParameter("uploadId", upload_id); + + String data; + WriteBufferFromString buffer(data); + writeString("", buffer); // FIXME move to Poco::XML maybe?? + for (size_t i = 0; i < part_tags.size(); ++i) { + writeString("", buffer); + writeIntText(i + 1, buffer); + writeString("", buffer); + writeString(part_tags[i], buffer); + writeString("", buffer); + } + writeString("", buffer); + buffer.finish(); + + for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT; ++i) + { + session = makeHTTPSession(complete_uri, timeouts); + request = std::make_unique(Poco::Net::HTTPRequest::HTTP_POST, complete_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); + request->setHost(complete_uri.getHost()); // use original, not resolved host name in header + + if (auth_request.hasCredentials()) { + Poco::Net::HTTPBasicCredentials credentials(auth_request); + credentials.authenticate(*request); + } + + request->setExpectContinue(true); + + request->setContentLength(data.size()); + + LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << complete_uri.toString()); + + std::ostream & ostr = session->sendRequest(*request); +// if (session->peekResponse(response)) // FIXME, shall not go next if not received 100-continue + { + // Received 100-continue. + ostr << data; + } + + istr = &session->receiveResponse(response); + + if (response.getStatus() != 307) + break; + + auto location_iterator = response.find("Location"); + if (location_iterator == response.end()) + break; + + complete_uri = location_iterator->second; + } + assertResponseIsOk(*request, response, istr); } } diff --git a/dbms/src/IO/WriteBufferFromS3.h b/dbms/src/IO/WriteBufferFromS3.h index 513ab167be5..6aabfc593cf 100644 --- a/dbms/src/IO/WriteBufferFromS3.h +++ b/dbms/src/IO/WriteBufferFromS3.h @@ -36,6 +36,7 @@ private: std::unique_ptr temporary_buffer; size_t part_number; size_t last_part_size; + String upload_id; std::vector part_tags; public: From 96d093f9ad7fa13005a845ce3ef357a048e2c567 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 17 Jun 2019 17:32:57 +0000 Subject: [PATCH 065/309] Clang compatibility fixes. --- dbms/src/IO/HTTPCommon.cpp | 8 ++++---- dbms/src/IO/HTTPCommon.h | 2 +- dbms/src/IO/ReadBufferFromS3.cpp | 2 +- dbms/src/IO/WriteBufferFromS3.cpp | 12 ++++++------ dbms/src/IO/WriteBufferFromS3.h | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/dbms/src/IO/HTTPCommon.cpp b/dbms/src/IO/HTTPCommon.cpp index 32970276dd4..9caad1fcbff 100644 --- a/dbms/src/IO/HTTPCommon.cpp +++ b/dbms/src/IO/HTTPCommon.cpp @@ -216,19 +216,19 @@ PooledHTTPSessionPtr makePooledHTTPSession(const Poco::URI & uri, const Connecti std::istream * receiveResponse( Poco::Net::HTTPClientSession & session, const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response) { - auto istr = &session.receiveResponse(response); + auto & istr = session.receiveResponse(response); assertResponseIsOk(request, response, istr); - return istr; + return &istr; } -void assertResponseIsOk(const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, std::istream * istr) { +void assertResponseIsOk(const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, std::istream & istr) { auto status = response.getStatus(); if (status != Poco::Net::HTTPResponse::HTTP_OK) { std::stringstream error_message; error_message << "Received error from remote server " << request.getURI() << ". 
HTTP status code: " << status << " " - << response.getReason() << ", body: " << istr->rdbuf(); + << response.getReason() << ", body: " << istr.rdbuf(); throw Exception(error_message.str(), status == HTTP_TOO_MANY_REQUESTS ? ErrorCodes::RECEIVED_ERROR_TOO_MANY_REQUESTS diff --git a/dbms/src/IO/HTTPCommon.h b/dbms/src/IO/HTTPCommon.h index 1e7500cf230..412429e59d1 100644 --- a/dbms/src/IO/HTTPCommon.h +++ b/dbms/src/IO/HTTPCommon.h @@ -57,6 +57,6 @@ PooledHTTPSessionPtr makePooledHTTPSession(const Poco::URI & uri, const Connecti */ std::istream * receiveResponse( Poco::Net::HTTPClientSession & session, const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response); -void assertResponseIsOk(const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, std::istream * istr); +void assertResponseIsOk(const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, std::istream & istr); } diff --git a/dbms/src/IO/ReadBufferFromS3.cpp b/dbms/src/IO/ReadBufferFromS3.cpp index f6061c3a8c7..e26f683cdd4 100644 --- a/dbms/src/IO/ReadBufferFromS3.cpp +++ b/dbms/src/IO/ReadBufferFromS3.cpp @@ -49,7 +49,7 @@ ReadBufferFromS3::ReadBufferFromS3(Poco::URI uri_, session = makeHTTPSession(uri, timeouts); } - assertResponseIsOk(*request, response, istr); + assertResponseIsOk(*request, response, *istr); impl = std::make_unique(*istr, buffer_size_); } diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 39f840f14c1..ac2a2617397 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -92,7 +92,7 @@ void WriteBufferFromS3::initiate() Poco::Net::HTTPResponse response; std::unique_ptr request; HTTPSessionPtr session; - std::istream * istr; /// owned by session + std::istream * istr = nullptr; /// owned by session Poco::URI initiate_uri = uri; initiate_uri.setRawQuery("uploads"); // FIXME find how to leave user params as is @@ -124,7 +124,7 @@ void WriteBufferFromS3::initiate() initiate_uri = location_iterator->second; } - assertResponseIsOk(*request, response, istr); + assertResponseIsOk(*request, response, *istr); Poco::XML::InputSource src(*istr); Poco::XML::DOMParser parser; @@ -148,7 +148,7 @@ void WriteBufferFromS3::writePart(const String & data) Poco::Net::HTTPResponse response; std::unique_ptr request; HTTPSessionPtr session; - std::istream * istr; /// owned by session + std::istream * istr = nullptr; /// owned by session Poco::URI part_uri = uri; part_uri.addQueryParameter("partNumber", std::to_string(part_number)); part_uri.addQueryParameter("uploadId", upload_id); @@ -188,7 +188,7 @@ void WriteBufferFromS3::writePart(const String & data) part_uri = location_iterator->second; } - assertResponseIsOk(*request, response, istr); + assertResponseIsOk(*request, response, *istr); auto etag_iterator = response.find("ETag"); if (etag_iterator == response.end()) @@ -205,7 +205,7 @@ void WriteBufferFromS3::complete() Poco::Net::HTTPResponse response; std::unique_ptr request; HTTPSessionPtr session; - std::istream * istr; /// owned by session + std::istream * istr = nullptr; /// owned by session Poco::URI complete_uri = uri; complete_uri.addQueryParameter("uploadId", upload_id); @@ -257,7 +257,7 @@ void WriteBufferFromS3::complete() complete_uri = location_iterator->second; } - assertResponseIsOk(*request, response, istr); + assertResponseIsOk(*request, response, *istr); } } diff --git a/dbms/src/IO/WriteBufferFromS3.h b/dbms/src/IO/WriteBufferFromS3.h index 6aabfc593cf..23edbbe5fc0 100644 --- 
a/dbms/src/IO/WriteBufferFromS3.h +++ b/dbms/src/IO/WriteBufferFromS3.h @@ -50,7 +50,7 @@ public: /// Receives response from the server after sending all data. void finalize(); - ~WriteBufferFromS3(); + ~WriteBufferFromS3() override; private: void initiate(); From af45849ce0090148c99a6102647596f6f3f29a4e Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 17 Jun 2019 17:33:20 +0000 Subject: [PATCH 066/309] Minor test fixes. --- .../clickhouse-test | 176 +++++++++++------- 1 file changed, 108 insertions(+), 68 deletions(-) diff --git a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test index 39c62f835a5..62980b3e2ac 100755 --- a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test +++ b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test @@ -4,13 +4,13 @@ import http.server import os import subprocess import sys +import tempfile import threading +import time import unittest +import urllib -# Run Go FakeS3 server. -# go run cmd/gofakes3/main.go -backend memory -host :9990 -initialbucket abc - config = os.path.join(os.path.dirname(sys.argv[0]), 'config.xml') test_csv = os.path.join(os.path.dirname(sys.argv[0]), 'test.csv') format = 'column1 UInt32, column2 UInt32, column3 UInt32' @@ -25,6 +25,7 @@ fakes3_port = 9990 localhost = '127.0.0.1' bucket = 'abc' + prepare_put_queries = [ "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(localhost, fakes3_port, bucket, format, values), ] @@ -39,31 +40,36 @@ queries = [ "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(redirecting_host, redirecting_to_https_port, format), ] -put_queries = [ - "insert into table function s3('http://{}:{}/', 'CSV', '{}') values {}" - .format(redirecting_host, preserving_data_port, format, values), - "insert into table function s3('http://{}:{}/', 'CSV', '{}') values {}" - .format(redirecting_host, redirecting_preserving_data_port, format, other_values), -] +put_query = "insert into table function s3('http://{}:{}/', 'CSV', '{}') values {}".format(redirecting_host, preserving_data_port, format, values) + +redirect_put_query = "insert into table function s3('http://{}:{}/', 'CSV', '{}') values {}".format(redirecting_host, redirecting_preserving_data_port, format, other_values) check_queries = [ "select *, column1*column2*column3 from s3('http://{}:{}/{}/test.csv', 'CSV', '{}')".format(localhost, fakes3_port, bucket, format), ] +def run_query(query): + result = subprocess.run([os.path.expanduser('~/ClickHouse-bin/dbms/programs/clickhouse-local'), '-c', config, '-q', query] + , stdout=subprocess.PIPE + , universal_newlines=True) + result.check_returncode() + return result.stdout + + class RedirectingToHTTPHTTPServer(http.server.BaseHTTPRequestHandler): def do_GET(self): self.send_response(307) self.send_header('Content-type', 'text/xml') self.send_header('Location', 'http://storage.yandexcloud.net/milovidov/test.csv') self.end_headers() - self.wfile.write(bytes(r''' + self.wfile.write(r''' TemporaryRedirect Please re-send this request to the specified temporary endpoint. Continue to use the original request endpoint for future requests. 
- johnsmith.s3-gztb4pa9sq.amazonaws.com -''', "utf-8")) + storage.yandexcloud.net +'''.encode()) class RedirectingToHTTPSHTTPServer(http.server.BaseHTTPRequestHandler): @@ -72,16 +78,17 @@ class RedirectingToHTTPSHTTPServer(http.server.BaseHTTPRequestHandler): self.send_header('Content-type', 'text/xml') self.send_header('Location', 'https://storage.yandexcloud.net/milovidov/test.csv') self.end_headers() - self.wfile.write(bytes(r''' + self.wfile.write(r''' TemporaryRedirect Please re-send this request to the specified temporary endpoint. Continue to use the original request endpoint for future requests. - johnsmith.s3-gztb4pa9sq.amazonaws.com -''', "utf-8")) + storage.yandexcloud.net +'''.encode()) received_data = [] +received_data_completed = False class PreservingDataServer(http.server.BaseHTTPRequestHandler): @@ -91,11 +98,34 @@ class PreservingDataServer(http.server.BaseHTTPRequestHandler): print('Received Expect-100', file=sys.stderr) return True + def do_POST(self): + self.send_response(200) + query = urllib.parse.urlparse(self.path).query + print('POST', query) + if query == 'uploads': + data = r''' +TEST'''.encode() + self.send_header('Content-length', str(len(data))) + self.send_header('Content-type', 'text/plain') + self.end_headers() + self.wfile.write(data) + else: + data = self.rfile.read(int(self.headers.get('Content-Length'))) + assert query == 'uploadId=TEST' + assert data == b'1hello-etag' + self.send_header('Content-type', 'text/plain') + self.end_headers() + global received_data_completed + received_data_completed = True + def do_PUT(self): self.send_response(200) self.send_header('Content-type', 'text/plain') + self.send_header('ETag', 'hello-etag') self.end_headers() + query = urllib.parse.urlparse(self.path).query print('Content-Length =', self.headers.get('Content-Length'), file=sys.stderr) + print('PUT', query) assert self.headers.get('Content-Length') assert self.headers['Expect'] == '100-continue' received_data.append(self.rfile.read()) @@ -107,29 +137,27 @@ class RedirectingPreservingDataServer(http.server.BaseHTTPRequestHandler): def handle_expect_100(self): print('Received Expect-100', file=sys.stderr) + query = urllib.parse.urlparse(self.path).query + if query: + query = '?{}'.format(query) self.send_response(307) self.send_header('Content-type', 'text/xml') - self.send_header('Location', 'http://{}:{}/{}/test.csv'.format(localhost, fakes3_port, bucket)) + self.send_header('Location', 'http://{host}:{port}/{bucket}/test.csv{query}'.format(host=localhost, port=fakes3_port, bucket=bucket, query=query)) self.end_headers() - self.wfile.write(bytes(r''' + self.wfile.write(r''' TemporaryRedirect Please re-send this request to the specified temporary endpoint. Continue to use the original request endpoint for future requests. 
- johnsmith.s3-gztb4pa9sq.amazonaws.com -''', "utf-8")) + {host}:{port} +'''.encode().format(host=localhost, port=fakes3_port)) return False + def do_POST(self): + assert False + def do_PUT(self): assert False - self.send_response(200) - self.send_header('Content-type', 'text/plain') - self.end_headers() - print('Content-Length =', self.headers.get('Content-Length'), file=sys.stderr) - assert self.headers.get('Content-Length') - assert self.headers['Expect'] == '100-continue' - received_data.append(self.rfile.read()) - self.wfile.flush() servers = [] @@ -146,14 +174,48 @@ def redirecting_to_http_thread(): def preserving_thread(): server = http.server.HTTPServer((redirecting_host, preserving_data_port), PreservingDataServer) servers.append(server) - server.handle_request() + while True: + server.handle_request() def redirecting_preserving_thread(): server = http.server.HTTPServer((redirecting_host, redirecting_preserving_data_port), RedirectingPreservingDataServer) servers.append(server) - server.handle_request() + while True: + server.handle_request() +def run_gofakes3(): + l = threading.Lock() + l.acquire() + + def gofakes3_thread(): + with tempfile.TemporaryDirectory() as d: + subprocess.run(['git', 'clone', 'https://github.com/johannesboyne/gofakes3'], cwd=d).check_returncode() + repo = os.path.join(d, 'gofakes3') + subprocess.run(['git', 'checkout', 'd419e1bd286f47170a4f87851a81f5c30107551a'], cwd=repo).check_returncode() + tool = os.path.join(repo, 'cmd', 'gofakes3', 'main.go') + subprocess.run(['go', 'build', tool], cwd=d).check_returncode() + l.release() + subprocess.run(['./main', '-backend', 'memory', '-host', ':{}'.format(fakes3_port), '-initialbucket', bucket], cwd=d).check_returncode() + + thread = threading.Thread(target=gofakes3_thread) + thread.start() + l.acquire() + time.sleep(0.5) + l.release() + return thread + + +def stop_subprocesses(): + pid = os.getpid() + result = subprocess.run(['pgrep', '-P', str(pid)], stdout=subprocess.PIPE) + result.check_returncode() + for child_pid in result.stdout.splitlines(): + subprocess.run(['kill', child_pid]).check_returncode() + + +run_gofakes3() + jobs = [] jobs.append(threading.Thread(target=redirecting_to_http_thread)) jobs.append(threading.Thread(target=redirecting_to_https_thread)) @@ -163,60 +225,38 @@ jobs.append(threading.Thread(target=redirecting_preserving_thread)) for query in prepare_put_queries: print(query) - result = subprocess.run([ - os.path.expanduser('~/ClickHouse-bin/dbms/programs/clickhouse-local'), - '-c', - config, - '-q', - query - ], stdout=subprocess.PIPE, universal_newlines=True) - result.check_returncode() + run_query(query) for query in queries: print(query) - result = subprocess.run([ - os.path.expanduser('~/ClickHouse-bin/dbms/programs/clickhouse-local'), - '-c', - config, - '-q', - query - ], stdout=subprocess.PIPE, universal_newlines=True) - result.check_returncode() - unittest.TestCase().assertEqual(list(map(str.split, result.stdout.splitlines())), [ + stdout = run_query(query) + unittest.TestCase().assertEqual(list(map(str.split, stdout.splitlines())), [ ['1', '2', '3', '6'], ['3', '2', '1', '6'], ['78', '43', '45', '150930'], ]) -for query in put_queries: - print(query) - result = subprocess.run([ - os.path.expanduser('~/ClickHouse-bin/dbms/programs/clickhouse-local'), - '-c', - config, - '-q', - query - ], stdout=subprocess.PIPE, universal_newlines=True) - result.check_returncode() - unittest.TestCase().assertEqual(received_data[-1].decode(), '1,2,3\n3,2,1\n78,43,45\n') - # In chunked encoding: - # 
unittest.TestCase().assertEqual(received_data[-1].decode(), '15\r\n1,2,3\n2,3,1\n78,43,45\n\r\n0\r\n\r\n') +query = put_query +print(query) +received_data_completed = False +run_query(query) +unittest.TestCase().assertEqual(received_data[-1].decode(), '1,2,3\n3,2,1\n78,43,45\n') +unittest.TestCase().assertTrue(received_data_completed) + +query = redirect_put_query +print(query) +run_query(query) for query in check_queries: print(query) - result = subprocess.run([ - os.path.expanduser('~/ClickHouse-bin/dbms/programs/clickhouse-local'), - '-c', - config, - '-q', - query - ], stdout=subprocess.PIPE, universal_newlines=True) - result.check_returncode() - unittest.TestCase().assertEqual(list(map(str.split, result.stdout.splitlines())), [ + stdout = run_query(query) + unittest.TestCase().assertEqual(list(map(str.split, stdout.splitlines())), [ ['1', '1', '1', '1'], ['1', '1', '1', '1'], ['11', '11', '11', '1331'], ]) +stop_subprocesses() + [ server.socket.close() for server in servers ] [ job.join() for job in jobs ] From a9153b2fb36109b0f192c4b17acda82cf78469e8 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 17 Jun 2019 18:06:28 +0000 Subject: [PATCH 067/309] Style fixes. --- dbms/src/IO/HTTPCommon.cpp | 3 ++- dbms/src/IO/WriteBufferFromS3.cpp | 16 ++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/dbms/src/IO/HTTPCommon.cpp b/dbms/src/IO/HTTPCommon.cpp index 9caad1fcbff..0a7c7e7af66 100644 --- a/dbms/src/IO/HTTPCommon.cpp +++ b/dbms/src/IO/HTTPCommon.cpp @@ -221,7 +221,8 @@ std::istream * receiveResponse( return &istr; } -void assertResponseIsOk(const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, std::istream & istr) { +void assertResponseIsOk(const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, std::istream & istr) +{ auto status = response.getStatus(); if (status != Poco::Net::HTTPResponse::HTTP_OK) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index ac2a2617397..69d216b88af 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -24,7 +24,7 @@ namespace ErrorCodes WriteBufferFromS3::WriteBufferFromS3( - const Poco::URI & uri_, const ConnectionTimeouts & timeouts_, + const Poco::URI & uri_, const ConnectionTimeouts & timeouts_, const Poco::Net::HTTPBasicCredentials & credentials, size_t buffer_size_) : BufferWithOwnMemory(buffer_size_, nullptr, 0) , uri {uri_} @@ -102,7 +102,8 @@ void WriteBufferFromS3::initiate() request = std::make_unique(Poco::Net::HTTPRequest::HTTP_POST, initiate_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); request->setHost(initiate_uri.getHost()); // use original, not resolved host name in header - if (auth_request.hasCredentials()) { + if (auth_request.hasCredentials()) + { Poco::Net::HTTPBasicCredentials credentials(auth_request); credentials.authenticate(*request); } @@ -140,7 +141,7 @@ void WriteBufferFromS3::initiate() throw Exception("Incorrect XML in response, empty upload id", ErrorCodes::INCORRECT_DATA); } } - + void WriteBufferFromS3::writePart(const String & data) { @@ -159,7 +160,8 @@ void WriteBufferFromS3::writePart(const String & data) request = std::make_unique(Poco::Net::HTTPRequest::HTTP_PUT, part_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); request->setHost(part_uri.getHost()); // use original, not resolved host name in header - if (auth_request.hasCredentials()) { + if (auth_request.hasCredentials()) + { Poco::Net::HTTPBasicCredentials credentials(auth_request); 
credentials.authenticate(*request); } @@ -212,7 +214,8 @@ void WriteBufferFromS3::complete() String data; WriteBufferFromString buffer(data); writeString("", buffer); // FIXME move to Poco::XML maybe?? - for (size_t i = 0; i < part_tags.size(); ++i) { + for (size_t i = 0; i < part_tags.size(); ++i) + { writeString("", buffer); writeIntText(i + 1, buffer); writeString("", buffer); @@ -228,7 +231,8 @@ void WriteBufferFromS3::complete() request = std::make_unique(Poco::Net::HTTPRequest::HTTP_POST, complete_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); request->setHost(complete_uri.getHost()); // use original, not resolved host name in header - if (auth_request.hasCredentials()) { + if (auth_request.hasCredentials()) + { Poco::Net::HTTPBasicCredentials credentials(auth_request); credentials.authenticate(*request); } From fa0de006d06a5981beca624f69f1f65d53ee66c3 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 17 Jun 2019 18:17:51 +0000 Subject: [PATCH 068/309] More style fixes. --- dbms/src/IO/WriteBufferFromS3.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 69d216b88af..70ac6136650 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -161,7 +161,7 @@ void WriteBufferFromS3::writePart(const String & data) request->setHost(part_uri.getHost()); // use original, not resolved host name in header if (auth_request.hasCredentials()) - { + { Poco::Net::HTTPBasicCredentials credentials(auth_request); credentials.authenticate(*request); } @@ -232,7 +232,7 @@ void WriteBufferFromS3::complete() request->setHost(complete_uri.getHost()); // use original, not resolved host name in header if (auth_request.hasCredentials()) - { + { Poco::Net::HTTPBasicCredentials credentials(auth_request); credentials.authenticate(*request); } From c6136c2b16373e4e296dd326507d42bba02f893f Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Tue, 18 Jun 2019 09:16:09 +0300 Subject: [PATCH 069/309] Test improvement. 
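This commit hardens the gofakes3 bootstrap in the test: the build now runs inside the cloned repository so the resulting binary has a known path, and the lock is released in a finally block so the main thread is not left blocked if the clone or build fails. A more idiomatic primitive for this kind of one-shot hand-off is threading.Event; the sketch below is only an illustrative alternative, not part of this patch.

    # Illustrative alternative: signal "build finished (or failed)" with an Event.
    import threading

    built = threading.Event()

    def gofakes3_thread():
        try:
            pass  # clone and build gofakes3 here; may raise
        finally:
            built.set()          # always unblock the waiter, even on failure
        # start the fake S3 server here

    threading.Thread(target=gofakes3_thread).start()
    built.wait()                 # proceed once the build step has completed
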
--- .../00950_table_function_s3_wip/clickhouse-test | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test index 62980b3e2ac..9000ea7567f 100755 --- a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test +++ b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test @@ -190,13 +190,16 @@ def run_gofakes3(): def gofakes3_thread(): with tempfile.TemporaryDirectory() as d: - subprocess.run(['git', 'clone', 'https://github.com/johannesboyne/gofakes3'], cwd=d).check_returncode() - repo = os.path.join(d, 'gofakes3') - subprocess.run(['git', 'checkout', 'd419e1bd286f47170a4f87851a81f5c30107551a'], cwd=repo).check_returncode() - tool = os.path.join(repo, 'cmd', 'gofakes3', 'main.go') - subprocess.run(['go', 'build', tool], cwd=d).check_returncode() - l.release() - subprocess.run(['./main', '-backend', 'memory', '-host', ':{}'.format(fakes3_port), '-initialbucket', bucket], cwd=d).check_returncode() + try: + subprocess.run(['git', 'clone', 'https://github.com/johannesboyne/gofakes3'], cwd=d).check_returncode() + repo = os.path.join(d, 'gofakes3') + subprocess.run(['git', 'checkout', 'd419e1bd286f47170a4f87851a81f5c30107551a'], cwd=repo).check_returncode() + tool = os.path.join(repo, 'cmd', 'gofakes3', 'main.go') + subprocess.run(['go', 'build', tool], cwd=repo).check_returncode() + finally: + l.release() + binary = os.path.join(repo, 'main') + subprocess.run([binary, '-backend', 'memory', '-host', ':{}'.format(fakes3_port), '-initialbucket', bucket]).check_returncode() thread = threading.Thread(target=gofakes3_thread) thread.start() From 395560df1b5916702ebf7aa25daaa9b3c57c73c7 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Fri, 21 Jun 2019 01:16:31 +0300 Subject: [PATCH 070/309] Fixed multipart uploads and 100-continue. --- dbms/src/IO/WriteBufferFromS3.cpp | 4 +-- .../clickhouse-test | 29 ++++++++++++++----- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 70ac6136650..c8406e00ce8 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -173,7 +173,7 @@ void WriteBufferFromS3::writePart(const String & data) LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << part_uri.toString()); std::ostream & ostr = session->sendRequest(*request); -// if (session->peekResponse(response)) // FIXME, shall not go next if not received 100-continue + if (session->peekResponse(response)) { // Received 100-continue. ostr << data; @@ -244,7 +244,7 @@ void WriteBufferFromS3::complete() LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << complete_uri.toString()); std::ostream & ostr = session->sendRequest(*request); -// if (session->peekResponse(response)) // FIXME, shall not go next if not received 100-continue + if (session->peekResponse(response)) { // Received 100-continue. 
ostr << data; diff --git a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test index 9000ea7567f..148570feede 100755 --- a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test +++ b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test @@ -96,6 +96,8 @@ class PreservingDataServer(http.server.BaseHTTPRequestHandler): def handle_expect_100(self): print('Received Expect-100', file=sys.stderr) + self.send_response_only(100) + self.end_headers() return True def do_POST(self): @@ -137,6 +139,9 @@ class RedirectingPreservingDataServer(http.server.BaseHTTPRequestHandler): def handle_expect_100(self): print('Received Expect-100', file=sys.stderr) + return True + + def do_POST(self): query = urllib.parse.urlparse(self.path).query if query: query = '?{}'.format(query) @@ -150,14 +155,23 @@ class RedirectingPreservingDataServer(http.server.BaseHTTPRequestHandler): Please re-send this request to the specified temporary endpoint. Continue to use the original request endpoint for future requests. {host}:{port} -'''.encode().format(host=localhost, port=fakes3_port)) - return False - - def do_POST(self): - assert False +'''.format(host=localhost, port=fakes3_port).encode()) def do_PUT(self): - assert False + query = urllib.parse.urlparse(self.path).query + if query: + query = '?{}'.format(query) + self.send_response(307) + self.send_header('Content-type', 'text/xml') + self.send_header('Location', 'http://{host}:{port}/{bucket}/test.csv{query}'.format(host=localhost, port=fakes3_port, bucket=bucket, query=query)) + self.end_headers() + self.wfile.write(r''' + + TemporaryRedirect + Please re-send this request to the specified temporary endpoint. + Continue to use the original request endpoint for future requests. + {host}:{port} +'''.format(host=localhost, port=fakes3_port).encode()) servers = [] @@ -199,7 +213,7 @@ def run_gofakes3(): finally: l.release() binary = os.path.join(repo, 'main') - subprocess.run([binary, '-backend', 'memory', '-host', ':{}'.format(fakes3_port), '-initialbucket', bucket]).check_returncode() + subprocess.run([binary, '-backend', 'memory', '-host', ':{}'.format(fakes3_port), '-initialbucket', bucket]) thread = threading.Thread(target=gofakes3_thread) thread.start() @@ -262,4 +276,5 @@ for query in check_queries: stop_subprocesses() [ server.socket.close() for server in servers ] +os._exit(0) [ job.join() for job in jobs ] From 97b7635c8a7be42467bb90aef5f4773cb7a06d3d Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Fri, 21 Jun 2019 08:24:01 +0300 Subject: [PATCH 071/309] Minimum block size to configuration. 
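The hard-coded 100 MB threshold becomes a server setting, s3_minimum_upload_part_size (default 512000000 bytes), read from the configuration in StorageS3. Output is buffered in memory until the accumulated size exceeds this value and is then flushed as a single multipart part, so raising the value means fewer, larger parts at the cost of more memory per writer; S3 also requires every part except the last one to be at least 5 MiB, so the setting should not be pushed below that. A minimal sketch of the flush-on-threshold policy that nextImpl() and finalize() implement; this is illustrative Python, not the actual buffer classes used here.

    # Flush-on-threshold policy behind minimum_upload_part_size (illustrative).
    class PartBuffer:
        def __init__(self, minimum_part_size, write_part):
            self.minimum_part_size = minimum_part_size
            self.write_part = write_part      # callback that uploads one part
            self.chunks, self.size = [], 0

        def write(self, data):                # called for every block of output
            self.chunks.append(data)
            self.size += len(data)
            if self.size > self.minimum_part_size:
                self.flush()

        def flush(self):                      # also called once at finalize()
            if self.chunks:
                self.write_part(b"".join(self.chunks))
                self.chunks, self.size = [], 0
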
--- dbms/src/IO/WriteBufferFromS3.cpp | 11 +++++++---- dbms/src/IO/WriteBufferFromS3.h | 2 ++ dbms/src/Storages/StorageS3.cpp | 3 ++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index c8406e00ce8..b44170417b1 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -12,7 +12,6 @@ #define DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT 2 -#define DEFAULT_S3_MINIMUM_PART_SIZE 100'000'000 namespace DB { @@ -24,10 +23,14 @@ namespace ErrorCodes WriteBufferFromS3::WriteBufferFromS3( - const Poco::URI & uri_, const ConnectionTimeouts & timeouts_, - const Poco::Net::HTTPBasicCredentials & credentials, size_t buffer_size_) + const Poco::URI & uri_, + size_t minimum_upload_part_size_, + const ConnectionTimeouts & timeouts_, + const Poco::Net::HTTPBasicCredentials & credentials, size_t buffer_size_ +) : BufferWithOwnMemory(buffer_size_, nullptr, 0) , uri {uri_} + , minimum_upload_part_size {minimum_upload_part_size_} , timeouts {timeouts_} , auth_request {Poco::Net::HTTPRequest::HTTP_PUT, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1} , temporary_buffer {std::make_unique(buffer_string)} @@ -50,7 +53,7 @@ void WriteBufferFromS3::nextImpl() last_part_size += offset(); - if (last_part_size > DEFAULT_S3_MINIMUM_PART_SIZE) + if (last_part_size > minimum_upload_part_size) { temporary_buffer->finish(); writePart(buffer_string); diff --git a/dbms/src/IO/WriteBufferFromS3.h b/dbms/src/IO/WriteBufferFromS3.h index 23edbbe5fc0..9e4d8c3be2a 100644 --- a/dbms/src/IO/WriteBufferFromS3.h +++ b/dbms/src/IO/WriteBufferFromS3.h @@ -30,6 +30,7 @@ class WriteBufferFromS3 : public BufferWithOwnMemory { private: Poco::URI uri; + size_t minimum_upload_part_size; ConnectionTimeouts timeouts; Poco::Net::HTTPRequest auth_request; String buffer_string; @@ -41,6 +42,7 @@ private: public: explicit WriteBufferFromS3(const Poco::URI & uri, + size_t minimum_upload_part_size_, const ConnectionTimeouts & timeouts = {}, const Poco::Net::HTTPBasicCredentials & credentials = {}, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE); diff --git a/dbms/src/Storages/StorageS3.cpp b/dbms/src/Storages/StorageS3.cpp index 474d603878e..1de1bdbccfa 100644 --- a/dbms/src/Storages/StorageS3.cpp +++ b/dbms/src/Storages/StorageS3.cpp @@ -84,7 +84,8 @@ namespace const ConnectionTimeouts & timeouts) : sample_block(sample_block_) { - write_buf = std::make_unique(uri, timeouts); + auto minimum_upload_part_size = context.getConfigRef().getUInt64("s3_minimum_upload_part_size", 512'000'000); + write_buf = std::make_unique(uri, minimum_upload_part_size, timeouts); writer = FormatFactory::instance().getOutput(format, *write_buf, sample_block, context); } From ab456262d9df22f6cca76223ed16a805de9556d3 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Sat, 22 Jun 2019 08:58:05 +0300 Subject: [PATCH 072/309] Fixed multipart mechanism and added a warning about 10k parts. 
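Part numbers are now derived from the number of parts already uploaded (part_tags.size() + 1), which keeps them consecutive and 1-based as the protocol requires, and a warning is logged once the soft limit of 10000 parts is reached, since S3 does not accept more parts than that in one upload. A quick back-of-the-envelope check of when that warning can fire with the default settings, illustrative only:

    # Rough capacity check for the 10k-part warning (illustrative).
    max_parts = 10_000                       # S3 limit on parts per multipart upload
    minimum_upload_part_size = 512_000_000   # default from the previous commit, in bytes
    print(max_parts * minimum_upload_part_size / 1e12)   # ~5.12 TB: rough upper bound for one upload
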
--- dbms/src/IO/WriteBufferFromS3.cpp | 10 +++++++--- dbms/src/IO/WriteBufferFromS3.h | 1 - 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index b44170417b1..5e0714b6b7a 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -12,6 +12,7 @@ #define DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT 2 +#define S3_SOFT_MAX_PARTS 10000 namespace DB { @@ -34,7 +35,6 @@ WriteBufferFromS3::WriteBufferFromS3( , timeouts {timeouts_} , auth_request {Poco::Net::HTTPRequest::HTTP_PUT, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1} , temporary_buffer {std::make_unique(buffer_string)} - , part_number {1} , last_part_size {0} { if (!credentials.getUsername().empty()) @@ -69,7 +69,6 @@ void WriteBufferFromS3::finalize() if (!buffer_string.empty()) { writePart(buffer_string); - ++part_number; } complete(); @@ -154,9 +153,14 @@ void WriteBufferFromS3::writePart(const String & data) HTTPSessionPtr session; std::istream * istr = nullptr; /// owned by session Poco::URI part_uri = uri; - part_uri.addQueryParameter("partNumber", std::to_string(part_number)); + part_uri.addQueryParameter("partNumber", std::to_string(part_tags.size() + 1)); part_uri.addQueryParameter("uploadId", upload_id); + if (part_tags.size() == S3_SOFT_MAX_PARTS) + { + LOG_WARNING(&Logger::get("WriteBufferFromS3"), "Maximum part number in S3 protocol has reached."); + } + for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT; ++i) { session = makeHTTPSession(part_uri, timeouts); diff --git a/dbms/src/IO/WriteBufferFromS3.h b/dbms/src/IO/WriteBufferFromS3.h index 9e4d8c3be2a..0eb689e468f 100644 --- a/dbms/src/IO/WriteBufferFromS3.h +++ b/dbms/src/IO/WriteBufferFromS3.h @@ -35,7 +35,6 @@ private: Poco::Net::HTTPRequest auth_request; String buffer_string; std::unique_ptr temporary_buffer; - size_t part_number; size_t last_part_size; String upload_id; std::vector part_tags; From c891590709743bc354c2a1141b847d1d0780fb5d Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Sat, 22 Jun 2019 08:59:13 +0300 Subject: [PATCH 073/309] Added even better warning. --- dbms/src/IO/WriteBufferFromS3.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 5e0714b6b7a..e48081d5609 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -158,7 +158,7 @@ void WriteBufferFromS3::writePart(const String & data) if (part_tags.size() == S3_SOFT_MAX_PARTS) { - LOG_WARNING(&Logger::get("WriteBufferFromS3"), "Maximum part number in S3 protocol has reached."); + LOG_WARNING(&Logger::get("WriteBufferFromS3"), "Maximum part number in S3 protocol has reached (too much parts). Server may not accept this whole upload."); } for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT; ++i) From 3e4af7b844c17914e8b323515a1c0dce77ae7af2 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Wed, 26 Jun 2019 00:41:14 +0000 Subject: [PATCH 074/309] Attempt to make integration tests. 
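The ad-hoc clickhouse-local script is replaced with an integration test built on the ClickHouseCluster helper: small HTTP handlers running in threads inside the test process stand in for S3 (an HTTP redirect, an HTTPS redirect, a data-preserving endpoint that also fakes the multipart POST/PUT responses, and a redirecting variant), and free ports for them are probed with GetFreeTCPPorts. One caveat of that probing approach is that the probe sockets are closed before the servers bind, so another process could in principle grab a port in between; an illustrative, race-free alternative (not used in this patch) is to let each server bind port 0 and read back the port it actually got:

    # Illustrative alternative to GetFreeTCPPorts: bind port 0 and ask the server.
    from http.server import BaseHTTPRequestHandler, HTTPServer

    class Handler(BaseHTTPRequestHandler):
        def do_GET(self):
            self.send_response(200)
            self.end_headers()

    server = HTTPServer(('127.0.0.1', 0), Handler)   # port 0 = any free port
    port = server.server_address[1]                  # the port actually bound
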
--- .../integration/test_storage_s3/__init__.py | 0 .../tests/integration/test_storage_s3/test.py | 275 +++++++++++++++++ .../clickhouse-test | 280 ------------------ .../00950_table_function_s3_wip/config.xml | 115 ------- .../00950_table_function_s3_wip/test.csv | 3 - 5 files changed, 275 insertions(+), 398 deletions(-) create mode 100644 dbms/tests/integration/test_storage_s3/__init__.py create mode 100644 dbms/tests/integration/test_storage_s3/test.py delete mode 100755 dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test delete mode 100644 dbms/tests/queries/0_stateless/00950_table_function_s3_wip/config.xml delete mode 100644 dbms/tests/queries/0_stateless/00950_table_function_s3_wip/test.csv diff --git a/dbms/tests/integration/test_storage_s3/__init__.py b/dbms/tests/integration/test_storage_s3/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/integration/test_storage_s3/test.py b/dbms/tests/integration/test_storage_s3/test.py new file mode 100644 index 00000000000..155b502bb15 --- /dev/null +++ b/dbms/tests/integration/test_storage_s3/test.py @@ -0,0 +1,275 @@ +import pytest + +from helpers.cluster import ClickHouseCluster + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster = ClickHouseCluster(__file__) + instance = cluster.add_instance('dummy') + cluster.start() + yield cluster + + finally: + cluster.shutdown() + + +import os +import socket +import subprocess +import sys +import tempfile +import threading +import time +import unittest + + +try: + import urllib.parse as urlparse +except ImportError: + import urlparse + +try: + from BaseHTTPServer import BaseHTTPRequestHandler +except ImportError: + from http.server import BaseHTTPRequestHandler + +try: + from BaseHTTPServer import HTTPServer +except ImportError: + from http.server import HTTPServer + + +localhost = '127.0.0.1' + +def GetFreeTCPPorts(n): + result = [] + sockets = [] + for i in range(n): + tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + tcp.bind((localhost, 0)) + addr, port = tcp.getsockname() + result.append(port) + sockets.append(tcp) + [ s.close() for s in sockets ] + return result + +test_csv = os.path.join(os.path.dirname(sys.argv[0]), 'test.csv') +format = 'column1 UInt32, column2 UInt32, column3 UInt32' +values = '(1, 2, 3), (3, 2, 1), (78, 43, 45)' +other_values = '(1, 1, 1), (1, 1, 1), (11, 11, 11)' +redirecting_host = localhost +redirecting_to_http_port, redirecting_to_https_port, preserving_data_port, redirecting_preserving_data_port = GetFreeTCPPorts(4) +bucket = 'abc' + + +def test_sophisticated_default(started_cluster): + instance = started_cluster.instances['dummy'] + def run_query(query): + return instance.query(query) + + + prepare_put_queries = [ + "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(localhost, preserving_data_port, bucket, format, values), + ] + + queries = [ + "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(redirecting_host, redirecting_to_http_port, format), + "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(redirecting_host, redirecting_to_https_port, format), + ] + + put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(redirecting_host, preserving_data_port, bucket, format, values) + + redirect_put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(redirecting_host, 
redirecting_preserving_data_port, bucket, format, other_values) + + check_queries = [ + "select *, column1*column2*column3 from s3('http://{}:{}/{}/test.csv', 'CSV', '{}')".format(localhost, preserving_data_port, bucket, format), + ] + + + class RedirectingToHTTPHandler(BaseHTTPRequestHandler): + def do_GET(self): + self.send_response(307) + self.send_header('Content-type', 'text/xml') + self.send_header('Location', 'http://storage.yandexcloud.net/milovidov/test.csv') + self.end_headers() + self.wfile.write(r''' + + TemporaryRedirect + Please re-send this request to the specified temporary endpoint. + Continue to use the original request endpoint for future requests. + storage.yandexcloud.net + '''.encode()) + self.finish() + + + class RedirectingToHTTPSHandler(BaseHTTPRequestHandler): + def do_GET(self): + self.send_response(307) + self.send_header('Content-type', 'text/xml') + self.send_header('Location', 'https://storage.yandexcloud.net/milovidov/test.csv') + self.end_headers() + self.wfile.write(r''' + + TemporaryRedirect + Please re-send this request to the specified temporary endpoint. + Continue to use the original request endpoint for future requests. + storage.yandexcloud.net + '''.encode()) + self.finish() + + + received_data = [] + received_data_completed = False + + + class PreservingDataHandler(BaseHTTPRequestHandler): + protocol_version = 'HTTP/1.1' + + def handle_expect_100(self): + # FIXME it does not work in Python 2. :( + print('Received Expect-100') + self.send_response_only(100) + self.end_headers() + return True + + def do_POST(self): + self.send_response(200) + query = urlparse.urlparse(self.path).query + print('POST', query) + if query == 'uploads': + data = r''' + TEST'''.encode() + self.send_header('Content-length', str(len(data))) + self.send_header('Content-type', 'text/plain') + self.end_headers() + self.wfile.write(data) + else: + data = self.rfile.read(int(self.headers.get('Content-Length'))) + assert query == 'uploadId=TEST' + assert data == b'1hello-etag' + self.send_header('Content-type', 'text/plain') + self.end_headers() + global received_data_completed + received_data_completed = True + self.finish() + + def do_PUT(self): + self.send_response(200) + self.send_header('Content-type', 'text/plain') + self.send_header('ETag', 'hello-etag') + self.end_headers() + query = urlparse.urlparse(self.path).query + path = urlparse.urlparse(self.path).path + print('Content-Length =', self.headers.get('Content-Length')) + print('PUT', query) + assert self.headers.get('Content-Length') + assert self.headers['Expect'] == '100-continue' + data = self.rfile.read() + received_data.append(data) + print('PUT to {}'.format(path)) + self.server.storage[path] = data + self.finish() + + def do_GET(self): + path = urlparse.urlparse(self.path).path + if path in self.server.storage: + self.send_response(200) + self.send_header('Content-type', 'text/plain') + self.send_header('Content-length', str(len(self.server.storage[path]))) + self.end_headers() + self.wfile.write(self.server.storage[path]) + else: + self.send_response(404) + self.end_headers() + self.finish() + + + class RedirectingPreservingDataHandler(BaseHTTPRequestHandler): + protocol_version = 'HTTP/1.1' + + def handle_expect_100(self): + print('Received Expect-100') + return True + + def do_POST(self): + query = urlparse.urlparse(self.path).query + if query: + query = '?{}'.format(query) + self.send_response(307) + self.send_header('Content-type', 'text/xml') + self.send_header('Location', 
'http://{host}:{port}/{bucket}/test.csv{query}'.format(host=localhost, port=preserving_data_port, bucket=bucket, query=query)) + self.end_headers() + self.wfile.write(r''' + + TemporaryRedirect + Please re-send this request to the specified temporary endpoint. + Continue to use the original request endpoint for future requests. + {host}:{port} + '''.format(host=localhost, port=preserving_data_port).encode()) + self.finish() + + def do_PUT(self): + query = urlparse.urlparse(self.path).query + if query: + query = '?{}'.format(query) + self.send_response(307) + self.send_header('Content-type', 'text/xml') + self.send_header('Location', 'http://{host}:{port}/{bucket}/test.csv{query}'.format(host=localhost, port=preserving_data_port, bucket=bucket, query=query)) + self.end_headers() + self.wfile.write(r''' + + TemporaryRedirect + Please re-send this request to the specified temporary endpoint. + Continue to use the original request endpoint for future requests. + {host}:{port} + '''.format(host=localhost, port=preserving_data_port).encode()) + self.finish() + + + servers = [] + servers.append(HTTPServer((redirecting_host, redirecting_to_https_port), RedirectingToHTTPSHandler)) + servers.append(HTTPServer((redirecting_host, redirecting_to_http_port), RedirectingToHTTPHandler)) + servers.append(HTTPServer((redirecting_host, preserving_data_port), PreservingDataHandler)) + servers[-1].storage = {} + servers.append(HTTPServer((redirecting_host, redirecting_preserving_data_port), RedirectingPreservingDataHandler)) + jobs = [ threading.Thread(target=server.serve_forever) for server in servers ] + [ job.start() for job in jobs ] + + try: + for query in prepare_put_queries: + print(query) + run_query(query) + + for query in queries: + print(query) + stdout = run_query(query) + unittest.TestCase().assertEqual(list(map(str.split, stdout.splitlines())), [ + ['1', '2', '3', '6'], + ['3', '2', '1', '6'], + ['78', '43', '45', '150930'], + ]) + + query = put_query + print(query) + received_data_completed = False + run_query(query) + unittest.TestCase().assertEqual(received_data[-1].decode(), '1,2,3\n3,2,1\n78,43,45\n') + unittest.TestCase().assertTrue(received_data_completed) + + query = redirect_put_query + print(query) + run_query(query) + + for query in check_queries: + print(query) + stdout = run_query(query) + unittest.TestCase().assertEqual(list(map(str.split, stdout.splitlines())), [ + ['1', '1', '1', '1'], + ['1', '1', '1', '1'], + ['11', '11', '11', '1331'], + ]) + + finally: + [ server.shutdown() for server in servers ] + [ job.join() for job in jobs ] diff --git a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test deleted file mode 100755 index 148570feede..00000000000 --- a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/clickhouse-test +++ /dev/null @@ -1,280 +0,0 @@ -#!/usr/bin/env python3 - -import http.server -import os -import subprocess -import sys -import tempfile -import threading -import time -import unittest -import urllib - - -config = os.path.join(os.path.dirname(sys.argv[0]), 'config.xml') -test_csv = os.path.join(os.path.dirname(sys.argv[0]), 'test.csv') -format = 'column1 UInt32, column2 UInt32, column3 UInt32' -values = '(1, 2, 3), (3, 2, 1), (78, 43, 45)' -other_values = '(1, 1, 1), (1, 1, 1), (11, 11, 11)' -redirecting_host = '127.0.0.1' -redirecting_to_http_port = 12345 -redirecting_to_https_port = 12346 -preserving_data_port = 12347 -redirecting_preserving_data_port = 
12348 -fakes3_port = 9990 -localhost = '127.0.0.1' -bucket = 'abc' - - -prepare_put_queries = [ - "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(localhost, fakes3_port, bucket, format, values), -] - -queries = [ - "select *, column1*column2*column3 from file('{}', 'CSV', '{}')".format(test_csv, format), - "select *, column1*column2*column3 from url('https://storage.yandexcloud.net/milovidov/test.csv', 'CSV', '{}')".format(format), - "select *, column1*column2*column3 from s3('http://storage.yandexcloud.net/milovidov/test.csv', 'CSV', '{}')".format(format), - "select *, column1*column2*column3 from s3('http://{}:{}/{}/test.csv', 'CSV', '{}')".format(localhost, fakes3_port, bucket, format), - "select *, column1*column2*column3 from s3('https://storage.yandexcloud.net/milovidov/test.csv', 'CSV', '{}')".format(format), - "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(redirecting_host, redirecting_to_http_port, format), - "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(redirecting_host, redirecting_to_https_port, format), -] - -put_query = "insert into table function s3('http://{}:{}/', 'CSV', '{}') values {}".format(redirecting_host, preserving_data_port, format, values) - -redirect_put_query = "insert into table function s3('http://{}:{}/', 'CSV', '{}') values {}".format(redirecting_host, redirecting_preserving_data_port, format, other_values) - -check_queries = [ - "select *, column1*column2*column3 from s3('http://{}:{}/{}/test.csv', 'CSV', '{}')".format(localhost, fakes3_port, bucket, format), -] - - -def run_query(query): - result = subprocess.run([os.path.expanduser('~/ClickHouse-bin/dbms/programs/clickhouse-local'), '-c', config, '-q', query] - , stdout=subprocess.PIPE - , universal_newlines=True) - result.check_returncode() - return result.stdout - - -class RedirectingToHTTPHTTPServer(http.server.BaseHTTPRequestHandler): - def do_GET(self): - self.send_response(307) - self.send_header('Content-type', 'text/xml') - self.send_header('Location', 'http://storage.yandexcloud.net/milovidov/test.csv') - self.end_headers() - self.wfile.write(r''' - - TemporaryRedirect - Please re-send this request to the specified temporary endpoint. - Continue to use the original request endpoint for future requests. - storage.yandexcloud.net -'''.encode()) - - -class RedirectingToHTTPSHTTPServer(http.server.BaseHTTPRequestHandler): - def do_GET(self): - self.send_response(307) - self.send_header('Content-type', 'text/xml') - self.send_header('Location', 'https://storage.yandexcloud.net/milovidov/test.csv') - self.end_headers() - self.wfile.write(r''' - - TemporaryRedirect - Please re-send this request to the specified temporary endpoint. - Continue to use the original request endpoint for future requests. 
- storage.yandexcloud.net -'''.encode()) - - -received_data = [] -received_data_completed = False - - -class PreservingDataServer(http.server.BaseHTTPRequestHandler): - protocol_version = 'HTTP/1.1' - - def handle_expect_100(self): - print('Received Expect-100', file=sys.stderr) - self.send_response_only(100) - self.end_headers() - return True - - def do_POST(self): - self.send_response(200) - query = urllib.parse.urlparse(self.path).query - print('POST', query) - if query == 'uploads': - data = r''' -TEST'''.encode() - self.send_header('Content-length', str(len(data))) - self.send_header('Content-type', 'text/plain') - self.end_headers() - self.wfile.write(data) - else: - data = self.rfile.read(int(self.headers.get('Content-Length'))) - assert query == 'uploadId=TEST' - assert data == b'1hello-etag' - self.send_header('Content-type', 'text/plain') - self.end_headers() - global received_data_completed - received_data_completed = True - - def do_PUT(self): - self.send_response(200) - self.send_header('Content-type', 'text/plain') - self.send_header('ETag', 'hello-etag') - self.end_headers() - query = urllib.parse.urlparse(self.path).query - print('Content-Length =', self.headers.get('Content-Length'), file=sys.stderr) - print('PUT', query) - assert self.headers.get('Content-Length') - assert self.headers['Expect'] == '100-continue' - received_data.append(self.rfile.read()) - self.wfile.flush() - - -class RedirectingPreservingDataServer(http.server.BaseHTTPRequestHandler): - protocol_version = 'HTTP/1.1' - - def handle_expect_100(self): - print('Received Expect-100', file=sys.stderr) - return True - - def do_POST(self): - query = urllib.parse.urlparse(self.path).query - if query: - query = '?{}'.format(query) - self.send_response(307) - self.send_header('Content-type', 'text/xml') - self.send_header('Location', 'http://{host}:{port}/{bucket}/test.csv{query}'.format(host=localhost, port=fakes3_port, bucket=bucket, query=query)) - self.end_headers() - self.wfile.write(r''' - - TemporaryRedirect - Please re-send this request to the specified temporary endpoint. - Continue to use the original request endpoint for future requests. - {host}:{port} -'''.format(host=localhost, port=fakes3_port).encode()) - - def do_PUT(self): - query = urllib.parse.urlparse(self.path).query - if query: - query = '?{}'.format(query) - self.send_response(307) - self.send_header('Content-type', 'text/xml') - self.send_header('Location', 'http://{host}:{port}/{bucket}/test.csv{query}'.format(host=localhost, port=fakes3_port, bucket=bucket, query=query)) - self.end_headers() - self.wfile.write(r''' - - TemporaryRedirect - Please re-send this request to the specified temporary endpoint. - Continue to use the original request endpoint for future requests. 
- {host}:{port} -'''.format(host=localhost, port=fakes3_port).encode()) - - -servers = [] -def redirecting_to_https_thread(): - server = http.server.HTTPServer((redirecting_host, redirecting_to_https_port), RedirectingToHTTPSHTTPServer) - servers.append(server) - server.handle_request() - -def redirecting_to_http_thread(): - server = http.server.HTTPServer((redirecting_host, redirecting_to_http_port), RedirectingToHTTPHTTPServer) - servers.append(server) - server.handle_request() - -def preserving_thread(): - server = http.server.HTTPServer((redirecting_host, preserving_data_port), PreservingDataServer) - servers.append(server) - while True: - server.handle_request() - -def redirecting_preserving_thread(): - server = http.server.HTTPServer((redirecting_host, redirecting_preserving_data_port), RedirectingPreservingDataServer) - servers.append(server) - while True: - server.handle_request() - - -def run_gofakes3(): - l = threading.Lock() - l.acquire() - - def gofakes3_thread(): - with tempfile.TemporaryDirectory() as d: - try: - subprocess.run(['git', 'clone', 'https://github.com/johannesboyne/gofakes3'], cwd=d).check_returncode() - repo = os.path.join(d, 'gofakes3') - subprocess.run(['git', 'checkout', 'd419e1bd286f47170a4f87851a81f5c30107551a'], cwd=repo).check_returncode() - tool = os.path.join(repo, 'cmd', 'gofakes3', 'main.go') - subprocess.run(['go', 'build', tool], cwd=repo).check_returncode() - finally: - l.release() - binary = os.path.join(repo, 'main') - subprocess.run([binary, '-backend', 'memory', '-host', ':{}'.format(fakes3_port), '-initialbucket', bucket]) - - thread = threading.Thread(target=gofakes3_thread) - thread.start() - l.acquire() - time.sleep(0.5) - l.release() - return thread - - -def stop_subprocesses(): - pid = os.getpid() - result = subprocess.run(['pgrep', '-P', str(pid)], stdout=subprocess.PIPE) - result.check_returncode() - for child_pid in result.stdout.splitlines(): - subprocess.run(['kill', child_pid]).check_returncode() - - -run_gofakes3() - -jobs = [] -jobs.append(threading.Thread(target=redirecting_to_http_thread)) -jobs.append(threading.Thread(target=redirecting_to_https_thread)) -jobs.append(threading.Thread(target=preserving_thread)) -jobs.append(threading.Thread(target=redirecting_preserving_thread)) -[ job.start() for job in jobs ] - -for query in prepare_put_queries: - print(query) - run_query(query) - -for query in queries: - print(query) - stdout = run_query(query) - unittest.TestCase().assertEqual(list(map(str.split, stdout.splitlines())), [ - ['1', '2', '3', '6'], - ['3', '2', '1', '6'], - ['78', '43', '45', '150930'], - ]) - -query = put_query -print(query) -received_data_completed = False -run_query(query) -unittest.TestCase().assertEqual(received_data[-1].decode(), '1,2,3\n3,2,1\n78,43,45\n') -unittest.TestCase().assertTrue(received_data_completed) - -query = redirect_put_query -print(query) -run_query(query) - -for query in check_queries: - print(query) - stdout = run_query(query) - unittest.TestCase().assertEqual(list(map(str.split, stdout.splitlines())), [ - ['1', '1', '1', '1'], - ['1', '1', '1', '1'], - ['11', '11', '11', '1331'], - ]) - -stop_subprocesses() - -[ server.socket.close() for server in servers ] -os._exit(0) -[ job.join() for job in jobs ] diff --git a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/config.xml b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/config.xml deleted file mode 100644 index 7675c696456..00000000000 --- a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/config.xml 
+++ /dev/null @@ -1,115 +0,0 @@ - - default - - trace - ~/clickhouse-server.log - ~/clickhouse-server.err.log - 1000M - 10 - - - - - - 8 - - - - - - - - - - - ::/0 - - - - default - - - default - - - - - - - a = 1 - - - - - a + b < 1 or c - d > 5 - - - - - c = 1 - - - - - - - - - - - - - - - - 3600 - - - 0 - 0 - 0 - 0 - 0 - - - - diff --git a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/test.csv b/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/test.csv deleted file mode 100644 index a2325127dec..00000000000 --- a/dbms/tests/queries/0_stateless/00950_table_function_s3_wip/test.csv +++ /dev/null @@ -1,3 +0,0 @@ -1,2,3 -3,2,1 -78,43,45 From 8c4eb13be6f59677349b707303b8f65bb0737d0e Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Tue, 2 Jul 2019 23:17:00 +0000 Subject: [PATCH 075/309] Fixed unavailable test servers issue in test_storage_s3. --- .../tests/integration/test_storage_s3/test.py | 53 +++++++++++-------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/dbms/tests/integration/test_storage_s3/test.py b/dbms/tests/integration/test_storage_s3/test.py index 155b502bb15..43de0e142ef 100644 --- a/dbms/tests/integration/test_storage_s3/test.py +++ b/dbms/tests/integration/test_storage_s3/test.py @@ -40,31 +40,37 @@ except ImportError: from http.server import HTTPServer -localhost = '127.0.0.1' - -def GetFreeTCPPorts(n): - result = [] - sockets = [] - for i in range(n): - tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - tcp.bind((localhost, 0)) - addr, port = tcp.getsockname() - result.append(port) - sockets.append(tcp) - [ s.close() for s in sockets ] - return result - -test_csv = os.path.join(os.path.dirname(sys.argv[0]), 'test.csv') -format = 'column1 UInt32, column2 UInt32, column3 UInt32' -values = '(1, 2, 3), (3, 2, 1), (78, 43, 45)' -other_values = '(1, 1, 1), (1, 1, 1), (11, 11, 11)' -redirecting_host = localhost -redirecting_to_http_port, redirecting_to_https_port, preserving_data_port, redirecting_preserving_data_port = GetFreeTCPPorts(4) -bucket = 'abc' - - def test_sophisticated_default(started_cluster): instance = started_cluster.instances['dummy'] + + def GetCurrentIP(): + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.connect(("8.8.8.8", 80)) + ip = s.getsockname()[0] + s.close() + return ip + + localhost = GetCurrentIP() + + def GetFreeTCPPorts(n): + result = [] + sockets = [] + for i in range(n): + tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + tcp.bind((localhost, 0)) + addr, port = tcp.getsockname() + result.append(port) + sockets.append(tcp) + [ s.close() for s in sockets ] + return result + + format = 'column1 UInt32, column2 UInt32, column3 UInt32' + values = '(1, 2, 3), (3, 2, 1), (78, 43, 45)' + other_values = '(1, 1, 1), (1, 1, 1), (11, 11, 11)' + redirecting_host = localhost + redirecting_to_http_port, redirecting_to_https_port, preserving_data_port, redirecting_preserving_data_port = GetFreeTCPPorts(4) + bucket = 'abc' + def run_query(query): return instance.query(query) @@ -237,6 +243,7 @@ def test_sophisticated_default(started_cluster): [ job.start() for job in jobs ] try: + subprocess.check_call(['ss', '-an']) for query in prepare_put_queries: print(query) run_query(query) From e9300ffbb55faff295650a8dc8e3411e5afce6cd Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Sun, 21 Jul 2019 11:45:01 +0000 Subject: [PATCH 076/309] Tests finally works! 
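Note for context: the mock HTTP servers in this test listen on ports discovered at runtime. The GetFreeTCPPorts helper introduced in the previous commit binds throwaway sockets to port 0 so the OS hands out unused ports, records the numbers, and closes the sockets before the mock servers re-bind them. A minimal standalone sketch of that idea, using only the Python standard library (the function name is ours, and there is a small race window between closing a probe socket and re-binding the port, which is acceptable for a test):

    import socket

    def get_free_tcp_ports(n):
        # Bind n probe sockets to port 0 so the OS assigns unused ports,
        # remember the numbers, then release the sockets for the mock servers.
        probes = []
        for _ in range(n):
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.bind(('127.0.0.1', 0))
            probes.append(s)
        ports = [s.getsockname()[1] for s in probes]
        for s in probes:
            s.close()
        return ports
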
--- dbms/tests/integration/helpers/cluster.py | 4 +- dbms/tests/integration/runner | 2 +- .../tests/integration/test_storage_s3/test.py | 85 ++++++++++++++----- 3 files changed, 68 insertions(+), 23 deletions(-) diff --git a/dbms/tests/integration/helpers/cluster.py b/dbms/tests/integration/helpers/cluster.py index aadd2e70a52..4131ee08653 100644 --- a/dbms/tests/integration/helpers/cluster.py +++ b/dbms/tests/integration/helpers/cluster.py @@ -225,12 +225,12 @@ class ClickHouseCluster: def restart_instance_with_ip_change(self, node, new_ip): if '::' in new_ip: if node.ipv6_address is None: - raise Exception("You shoud specity ipv6_address in add_node method") + raise Exception("You should specity ipv6_address in add_node method") self._replace(node.docker_compose_path, node.ipv6_address, new_ip) node.ipv6_address = new_ip else: if node.ipv4_address is None: - raise Exception("You shoud specity ipv4_address in add_node method") + raise Exception("You should specity ipv4_address in add_node method") self._replace(node.docker_compose_path, node.ipv4_address, new_ip) node.ipv4_address = new_ip subprocess.check_call(self.base_cmd + ["stop", node.name]) diff --git a/dbms/tests/integration/runner b/dbms/tests/integration/runner index 0d0ec929b96..071df8b1fd0 100755 --- a/dbms/tests/integration/runner +++ b/dbms/tests/integration/runner @@ -107,4 +107,4 @@ if __name__ == "__main__": ) #print(cmd) - subprocess.check_call(cmd, shell=True) \ No newline at end of file + subprocess.check_call(cmd, shell=True) diff --git a/dbms/tests/integration/test_storage_s3/test.py b/dbms/tests/integration/test_storage_s3/test.py index 43de0e142ef..8dba1e9a440 100644 --- a/dbms/tests/integration/test_storage_s3/test.py +++ b/dbms/tests/integration/test_storage_s3/test.py @@ -21,7 +21,6 @@ import sys import tempfile import threading import time -import unittest try: @@ -40,6 +39,10 @@ except ImportError: from http.server import HTTPServer +received_data = [] +received_data_completed = False + + def test_sophisticated_default(started_cluster): instance = started_cluster.instances['dummy'] @@ -72,7 +75,10 @@ def test_sophisticated_default(started_cluster): bucket = 'abc' def run_query(query): - return instance.query(query) + print('Running query "{}"...'.format(query)) + result = instance.query(query) + print('Query finished') + return result prepare_put_queries = [ @@ -81,7 +87,8 @@ def test_sophisticated_default(started_cluster): queries = [ "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(redirecting_host, redirecting_to_http_port, format), - "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(redirecting_host, redirecting_to_https_port, format), +# "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(redirecting_host, redirecting_to_https_port, format), +# FIXME ] put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(redirecting_host, preserving_data_port, bucket, format, values) @@ -125,13 +132,28 @@ def test_sophisticated_default(started_cluster): self.finish() - received_data = [] - received_data_completed = False - - class PreservingDataHandler(BaseHTTPRequestHandler): protocol_version = 'HTTP/1.1' - + + def parse_request(self): + result = BaseHTTPRequestHandler.parse_request(self) + # Adaptation to Python 3. 
+ if sys.version_info.major == 2 and result == True: + expect = self.headers.get('Expect', "") + if (expect.lower() == "100-continue" and self.protocol_version >= "HTTP/1.1" and self.request_version >= "HTTP/1.1"): + if not self.handle_expect_100(): + return False + return result + + def send_response_only(self, code, message=None): + if message is None: + if code in self.responses: + message = self.responses[code][0] + else: + message = '' + if self.request_version != 'HTTP/0.9': + self.wfile.write("%s %d %s\r\n" % (self.protocol_version, code, message)) + def handle_expect_100(self): # FIXME it does not work in Python 2. :( print('Received Expect-100') @@ -154,6 +176,7 @@ def test_sophisticated_default(started_cluster): data = self.rfile.read(int(self.headers.get('Content-Length'))) assert query == 'uploadId=TEST' assert data == b'1hello-etag' + self.send_header('Content-length', '0') # FIXME on python2 somehow connection does not close without this self.send_header('Content-type', 'text/plain') self.end_headers() global received_data_completed @@ -193,7 +216,26 @@ def test_sophisticated_default(started_cluster): class RedirectingPreservingDataHandler(BaseHTTPRequestHandler): protocol_version = 'HTTP/1.1' - + + def parse_request(self): + result = BaseHTTPRequestHandler.parse_request(self) + # Adaptation to Python 3. + if sys.version_info.major == 2 and result == True: + expect = self.headers.get('Expect', "") + if (expect.lower() == "100-continue" and self.protocol_version >= "HTTP/1.1" and self.request_version >= "HTTP/1.1"): + if not self.handle_expect_100(): + return False + return result + + def send_response_only(self, code, message=None): + if message is None: + if code in self.responses: + message = self.responses[code][0] + else: + message = '' + if self.request_version != 'HTTP/0.9': + self.wfile.write("%s %d %s\r\n" % (self.protocol_version, code, message)) + def handle_expect_100(self): print('Received Expect-100') return True @@ -243,40 +285,43 @@ def test_sophisticated_default(started_cluster): [ job.start() for job in jobs ] try: - subprocess.check_call(['ss', '-an']) + print('Phase 1') for query in prepare_put_queries: - print(query) run_query(query) + print('Phase 2') for query in queries: - print(query) stdout = run_query(query) - unittest.TestCase().assertEqual(list(map(str.split, stdout.splitlines())), [ + assert list(map(str.split, stdout.splitlines())) == [ ['1', '2', '3', '6'], ['3', '2', '1', '6'], ['78', '43', '45', '150930'], - ]) + ] + print('Phase 3') query = put_query - print(query) + global received_data_completed received_data_completed = False run_query(query) - unittest.TestCase().assertEqual(received_data[-1].decode(), '1,2,3\n3,2,1\n78,43,45\n') - unittest.TestCase().assertTrue(received_data_completed) + assert received_data[-1].decode() == '1,2,3\n3,2,1\n78,43,45\n' + assert received_data_completed + print('Phase 4') query = redirect_put_query - print(query) run_query(query) for query in check_queries: print(query) stdout = run_query(query) - unittest.TestCase().assertEqual(list(map(str.split, stdout.splitlines())), [ + assert list(map(str.split, stdout.splitlines())) == [ ['1', '1', '1', '1'], ['1', '1', '1', '1'], ['11', '11', '11', '1331'], - ]) + ] finally: + print('Shutting down') [ server.shutdown() for server in servers ] + print('Joining threads') [ job.join() for job in jobs ] + print('Done') From 81f49d97bd3484ebeabe56bc84c5555824ff87f2 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Wed, 28 Aug 2019 13:30:26 +0000 Subject: [PATCH 
077/309] Minor fix. --- dbms/tests/integration/test_storage_s3/test.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dbms/tests/integration/test_storage_s3/test.py b/dbms/tests/integration/test_storage_s3/test.py index 8dba1e9a440..57b86e74c64 100644 --- a/dbms/tests/integration/test_storage_s3/test.py +++ b/dbms/tests/integration/test_storage_s3/test.py @@ -87,8 +87,6 @@ def test_sophisticated_default(started_cluster): queries = [ "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(redirecting_host, redirecting_to_http_port, format), -# "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(redirecting_host, redirecting_to_https_port, format), -# FIXME ] put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(redirecting_host, preserving_data_port, bucket, format, values) @@ -155,7 +153,6 @@ def test_sophisticated_default(started_cluster): self.wfile.write("%s %d %s\r\n" % (self.protocol_version, code, message)) def handle_expect_100(self): - # FIXME it does not work in Python 2. :( print('Received Expect-100') self.send_response_only(100) self.end_headers() @@ -176,7 +173,6 @@ def test_sophisticated_default(started_cluster): data = self.rfile.read(int(self.headers.get('Content-Length'))) assert query == 'uploadId=TEST' assert data == b'1hello-etag' - self.send_header('Content-length', '0') # FIXME on python2 somehow connection does not close without this self.send_header('Content-type', 'text/plain') self.end_headers() global received_data_completed From fa01cc162c5ea98f075519be24237f36a854089e Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Wed, 4 Sep 2019 16:10:25 +0000 Subject: [PATCH 078/309] Merge fix. --- dbms/src/Storages/StorageS3.cpp | 5 ++++- dbms/src/Storages/StorageS3.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/dbms/src/Storages/StorageS3.cpp b/dbms/src/Storages/StorageS3.cpp index 1de1bdbccfa..23ad2b35f13 100644 --- a/dbms/src/Storages/StorageS3.cpp +++ b/dbms/src/Storages/StorageS3.cpp @@ -140,7 +140,10 @@ BlockInputStreams StorageS3::read(const Names & column_names, return {std::make_shared(block_input, column_defaults, context)}; } -void StorageS3::rename(const String & /*new_path_to_db*/, const String & /*new_database_name*/, const String & /*new_table_name*/) {} +void StorageS3::rename(const String & /*new_path_to_db*/, const String & /*new_database_name*/, const String & /*new_table_name*/, TableStructureWriteLockHolder &) +{ + // FIXME +} BlockOutputStreamPtr StorageS3::write(const ASTPtr & /*query*/, const Context & /*context*/) { diff --git a/dbms/src/Storages/StorageS3.h b/dbms/src/Storages/StorageS3.h index a38cd717e36..ecc93f0f616 100644 --- a/dbms/src/Storages/StorageS3.h +++ b/dbms/src/Storages/StorageS3.h @@ -53,7 +53,7 @@ public: BlockOutputStreamPtr write(const ASTPtr & query, const Context & context) override; - void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name) override; + void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name, TableStructureWriteLockHolder &) override; protected: Poco::URI uri; From cdbcfc2c908130a1fd3b8840df2b462cead43fe4 Mon Sep 17 00:00:00 2001 From: l Date: Wed, 11 Sep 2019 18:39:30 +0200 Subject: [PATCH 079/309] + Redirect --- dbms/src/Common/ErrorCodes.cpp | 4 +++ dbms/src/Core/Settings.h | 2 ++ dbms/src/IO/HTTPCommon.cpp | 12 ++++++-- dbms/src/IO/HTTPCommon.h | 2 +- 
dbms/src/IO/ReadWriteBufferFromHTTP.cpp | 39 +++++++++++++++++++++++++ dbms/src/IO/ReadWriteBufferFromHTTP.h | 6 ++++ dbms/src/Storages/StorageURL.cpp | 3 +- 7 files changed, 63 insertions(+), 5 deletions(-) diff --git a/dbms/src/Common/ErrorCodes.cpp b/dbms/src/Common/ErrorCodes.cpp index c4aa1449e0f..ec70b99305c 100644 --- a/dbms/src/Common/ErrorCodes.cpp +++ b/dbms/src/Common/ErrorCodes.cpp @@ -452,6 +452,10 @@ namespace ErrorCodes extern const int INVALID_WITH_FILL_EXPRESSION = 475; extern const int WITH_TIES_WITHOUT_ORDER_BY = 476; extern const int INVALID_USAGE_OF_INPUT = 477; + extern const int TOO_MANY_REDIRECTS = 478; + + extern const int KEEPER_EXCEPTION = 999; + extern const int POCO_EXCEPTION = 1000; extern const int KEEPER_EXCEPTION = 999; extern const int POCO_EXCEPTION = 1000; diff --git a/dbms/src/Core/Settings.h b/dbms/src/Core/Settings.h index 0678aaeedc6..f1e427040d4 100644 --- a/dbms/src/Core/Settings.h +++ b/dbms/src/Core/Settings.h @@ -169,6 +169,8 @@ struct Settings : public SettingsCollection \ M(SettingBool, add_http_cors_header, false, "Write add http CORS header.") \ \ + M(SettingUInt64, max_http_get_redirects, 0, "Max number of http GET redirects hops allowed.") \ + \ M(SettingBool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, CSVWithNames, TSVWithNames and TSKV formats).") \ M(SettingBool, input_format_with_names_use_header, false, "For TSVWithNames and CSVWithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.") \ M(SettingBool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).") \ diff --git a/dbms/src/IO/HTTPCommon.cpp b/dbms/src/IO/HTTPCommon.cpp index 946819c5f63..46857744c85 100644 --- a/dbms/src/IO/HTTPCommon.cpp +++ b/dbms/src/IO/HTTPCommon.cpp @@ -40,6 +40,7 @@ namespace ErrorCodes extern const int RECEIVED_ERROR_TOO_MANY_REQUESTS; extern const int FEATURE_IS_NOT_ENABLED_AT_BUILD_TIME; extern const int UNSUPPORTED_URI_SCHEME; + extern const int TOO_MANY_REDIRECTS; } @@ -223,8 +224,15 @@ std::istream * receiveResponse( auto istr = &session.receiveResponse(response); auto status = response.getStatus(); - if (status != Poco::Net::HTTPResponse::HTTP_OK) - { + if ( + ( request.getMethod() == Poco::Net::HTTPRequest::HTTP_GET) && // we only accepts redirects on GET requests. + (status == Poco::Net::HTTPResponse::HTTP_MOVED_PERMANENTLY || // 301 + status == Poco::Net::HTTPResponse::HTTP_FOUND || // 302 + status == Poco::Net::HTTPResponse::HTTP_SEE_OTHER || // 303 + status == Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT) // 307 + ) { + throw Poco::URIRedirection(response.get("Location")); + } else if (status != Poco::Net::HTTPResponse::HTTP_OK) { std::stringstream error_message; error_message << "Received error from remote server " << request.getURI() << ". 
HTTP status code: " << status << " " << response.getReason() << ", body: " << istr->rdbuf(); diff --git a/dbms/src/IO/HTTPCommon.h b/dbms/src/IO/HTTPCommon.h index 6dc669c248e..dda8d2aac7e 100644 --- a/dbms/src/IO/HTTPCommon.h +++ b/dbms/src/IO/HTTPCommon.h @@ -9,7 +9,7 @@ #include #include #include - +#include #include diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.cpp b/dbms/src/IO/ReadWriteBufferFromHTTP.cpp index 4d046bfe2c6..4ace7624ac5 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.cpp +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.cpp @@ -1 +1,40 @@ #include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int TOO_MANY_REDIRECTS; +} + + +std::unique_ptr makeReadWriteBufferFromHTTP(const Poco::URI & uri, + const std::string & method, + std::function callback, + const DB::ConnectionTimeouts & timeouts, + const DB::SettingUInt64 max_redirects) + { + auto actual_uri =uri; + UInt64 redirects = 0; + + do + { + try + { + return std::make_unique(actual_uri, method, callback, timeouts); + } + catch (Poco::URIRedirection & exc) { + redirects++; + actual_uri = exc.uri(); + } + } while(max_redirects>redirects); + + // too many redirects.... + std::stringstream error_message; + error_message << "Too many redirects while trying to access " << uri.toString() ; + + throw Exception(error_message.str(), ErrorCodes::TOO_MANY_REDIRECTS); + } +} \ No newline at end of file diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.h b/dbms/src/IO/ReadWriteBufferFromHTTP.h index d36633220b4..9abcd0edf03 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.h +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.h @@ -16,6 +16,7 @@ #include #include #include +#include #define DEFAULT_HTTP_READ_BUFFER_TIMEOUT 1800 @@ -139,5 +140,10 @@ public: } }; +std::unique_ptr makeReadWriteBufferFromHTTP(const Poco::URI & uri, + const std::string & method, + std::function callback, + const ConnectionTimeouts & timeouts, + const SettingUInt64 max_redirects); } diff --git a/dbms/src/Storages/StorageURL.cpp b/dbms/src/Storages/StorageURL.cpp index 4f3d41604f5..ebbffd20675 100644 --- a/dbms/src/Storages/StorageURL.cpp +++ b/dbms/src/Storages/StorageURL.cpp @@ -54,8 +54,7 @@ namespace const ConnectionTimeouts & timeouts) : name(name_) { - read_buf = std::make_unique(uri, method, callback, timeouts); - + read_buf = makeReadWriteBufferFromHTTP(uri, method, callback, timeouts,context.getSettingsRef().max_http_get_redirects); reader = FormatFactory::instance().getInput(format, *read_buf, sample_block, context, max_block_size); } From adb4a580922ea73435c3b3fdbc19c427a429aad3 Mon Sep 17 00:00:00 2001 From: l Date: Wed, 11 Sep 2019 22:18:22 +0200 Subject: [PATCH 080/309] Style checks & security considerations added to setting description. 
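For reference, the intended behaviour of max_http_get_redirects in the loop below: follow the Location header of a 301/302/303/307 answer to a GET request until a non-redirect response arrives, and fail with TOO_MANY_REDIRECTS once the hop budget is spent. A rough Python illustration of the same control flow, not the actual implementation (plain HTTP only, helper name is ours):

    import http.client
    import urllib.parse

    REDIRECT_CODES = {301, 302, 303, 307}

    def get_with_redirect_cap(url, max_redirects):
        # Follow Location headers by hand so the number of hops can be capped.
        redirects = 0
        while True:
            parts = urllib.parse.urlsplit(url)
            conn = http.client.HTTPConnection(parts.hostname, parts.port or 80)
            path = (parts.path or '/') + ('?' + parts.query if parts.query else '')
            conn.request('GET', path)
            response = conn.getresponse()
            if response.status not in REDIRECT_CODES:
                body = response.read()
                conn.close()
                return body
            location = response.getheader('Location')
            conn.close()
            redirects += 1
            if redirects > max_redirects:
                raise RuntimeError('Too many redirects while trying to access ' + url)
            url = location
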
--- dbms/src/Common/ErrorCodes.cpp | 3 -- dbms/src/Core/Settings.h | 2 +- dbms/src/IO/HTTPCommon.cpp | 12 ++---- dbms/src/IO/ReadWriteBufferFromHTTP.cpp | 55 +++++++++++++------------ 4 files changed, 33 insertions(+), 39 deletions(-) diff --git a/dbms/src/Common/ErrorCodes.cpp b/dbms/src/Common/ErrorCodes.cpp index ec70b99305c..45e817e4795 100644 --- a/dbms/src/Common/ErrorCodes.cpp +++ b/dbms/src/Common/ErrorCodes.cpp @@ -453,9 +453,6 @@ namespace ErrorCodes extern const int WITH_TIES_WITHOUT_ORDER_BY = 476; extern const int INVALID_USAGE_OF_INPUT = 477; extern const int TOO_MANY_REDIRECTS = 478; - - extern const int KEEPER_EXCEPTION = 999; - extern const int POCO_EXCEPTION = 1000; extern const int KEEPER_EXCEPTION = 999; extern const int POCO_EXCEPTION = 1000; diff --git a/dbms/src/Core/Settings.h b/dbms/src/Core/Settings.h index f1e427040d4..3620654abd3 100644 --- a/dbms/src/Core/Settings.h +++ b/dbms/src/Core/Settings.h @@ -169,7 +169,7 @@ struct Settings : public SettingsCollection \ M(SettingBool, add_http_cors_header, false, "Write add http CORS header.") \ \ - M(SettingUInt64, max_http_get_redirects, 0, "Max number of http GET redirects hops allowed.") \ + M(SettingUInt64, max_http_get_redirects, 0, "Max number of http GET redirects hops allowed. Make sure additional security measures are in place to prevent a malicious server to redirect your requests to unexpected services.") \ \ M(SettingBool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, CSVWithNames, TSVWithNames and TSKV formats).") \ M(SettingBool, input_format_with_names_use_header, false, "For TSVWithNames and CSVWithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.") \ diff --git a/dbms/src/IO/HTTPCommon.cpp b/dbms/src/IO/HTTPCommon.cpp index 46857744c85..a47800ee021 100644 --- a/dbms/src/IO/HTTPCommon.cpp +++ b/dbms/src/IO/HTTPCommon.cpp @@ -224,15 +224,11 @@ std::istream * receiveResponse( auto istr = &session.receiveResponse(response); auto status = response.getStatus(); - if ( - ( request.getMethod() == Poco::Net::HTTPRequest::HTTP_GET) && // we only accepts redirects on GET requests. - (status == Poco::Net::HTTPResponse::HTTP_MOVED_PERMANENTLY || // 301 - status == Poco::Net::HTTPResponse::HTTP_FOUND || // 302 - status == Poco::Net::HTTPResponse::HTTP_SEE_OTHER || // 303 - status == Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT) // 307 - ) { + if ( ( request.getMethod() == Poco::Net::HTTPRequest::HTTP_GET ) && (status == Poco::Net::HTTPResponse::HTTP_MOVED_PERMANENTLY || status == Poco::Net::HTTPResponse::HTTP_FOUND || status == Poco::Net::HTTPResponse::HTTP_SEE_OTHER || status == Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT) ) throw Poco::URIRedirection(response.get("Location")); - } else if (status != Poco::Net::HTTPResponse::HTTP_OK) { + + if (status != Poco::Net::HTTPResponse::HTTP_OK) + { std::stringstream error_message; error_message << "Received error from remote server " << request.getURI() << ". 
HTTP status code: " << status << " " << response.getReason() << ", body: " << istr->rdbuf(); diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.cpp b/dbms/src/IO/ReadWriteBufferFromHTTP.cpp index 4ace7624ac5..f20adafed22 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.cpp +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.cpp @@ -6,35 +6,36 @@ namespace DB namespace ErrorCodes { - extern const int TOO_MANY_REDIRECTS; + extern const int TOO_MANY_REDIRECTS; } std::unique_ptr makeReadWriteBufferFromHTTP(const Poco::URI & uri, - const std::string & method, - std::function callback, - const DB::ConnectionTimeouts & timeouts, - const DB::SettingUInt64 max_redirects) - { - auto actual_uri =uri; - UInt64 redirects = 0; - - do - { - try - { - return std::make_unique(actual_uri, method, callback, timeouts); - } - catch (Poco::URIRedirection & exc) { - redirects++; - actual_uri = exc.uri(); - } - } while(max_redirects>redirects); - - // too many redirects.... - std::stringstream error_message; - error_message << "Too many redirects while trying to access " << uri.toString() ; - - throw Exception(error_message.str(), ErrorCodes::TOO_MANY_REDIRECTS); - } + const std::string & method, + std::function callback, + const DB::ConnectionTimeouts & timeouts, + const DB::SettingUInt64 max_redirects) + { + auto actual_uri =uri; + UInt64 redirects = 0; + + do + { + try + { + return std::make_unique(actual_uri, method, callback, timeouts); + } + catch (Poco::URIRedirection & exc) + { + redirects++; + actual_uri = exc.uri(); + } + } while(max_redirects>redirects); + + // too many redirects.... + std::stringstream error_message; + error_message << "Too many redirects while trying to access " << uri.toString() ; + + throw Exception(error_message.str(), ErrorCodes::TOO_MANY_REDIRECTS); + } } \ No newline at end of file From 688b0d91a0157a990c3737868b97ef378c0c7879 Mon Sep 17 00:00:00 2001 From: l Date: Wed, 11 Sep 2019 22:53:53 +0200 Subject: [PATCH 081/309] Style checks --- dbms/src/IO/HTTPCommon.cpp | 2 +- dbms/src/IO/ReadWriteBufferFromHTTP.cpp | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dbms/src/IO/HTTPCommon.cpp b/dbms/src/IO/HTTPCommon.cpp index a47800ee021..299a080136d 100644 --- a/dbms/src/IO/HTTPCommon.cpp +++ b/dbms/src/IO/HTTPCommon.cpp @@ -224,7 +224,7 @@ std::istream * receiveResponse( auto istr = &session.receiveResponse(response); auto status = response.getStatus(); - if ( ( request.getMethod() == Poco::Net::HTTPRequest::HTTP_GET ) && (status == Poco::Net::HTTPResponse::HTTP_MOVED_PERMANENTLY || status == Poco::Net::HTTPResponse::HTTP_FOUND || status == Poco::Net::HTTPResponse::HTTP_SEE_OTHER || status == Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT) ) + if ((request.getMethod() == Poco::Net::HTTPRequest::HTTP_GET) && (status == Poco::Net::HTTPResponse::HTTP_MOVED_PERMANENTLY || status == Poco::Net::HTTPResponse::HTTP_FOUND || status == Poco::Net::HTTPResponse::HTTP_SEE_OTHER || status == Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT) ) throw Poco::URIRedirection(response.get("Location")); if (status != Poco::Net::HTTPResponse::HTTP_OK) diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.cpp b/dbms/src/IO/ReadWriteBufferFromHTTP.cpp index f20adafed22..8e6d278d816 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.cpp +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.cpp @@ -6,7 +6,7 @@ namespace DB namespace ErrorCodes { - extern const int TOO_MANY_REDIRECTS; + extern const int TOO_MANY_REDIRECTS; } @@ -18,24 +18,24 @@ std::unique_ptr makeReadWriteBufferFromHTTP(const P { auto 
actual_uri =uri; UInt64 redirects = 0; - + do { - try + try { return std::make_unique(actual_uri, method, callback, timeouts); - } - catch (Poco::URIRedirection & exc) + } + catch (Poco::URIRedirection & exc) { redirects++; actual_uri = exc.uri(); } } while(max_redirects>redirects); - + // too many redirects.... std::stringstream error_message; error_message << "Too many redirects while trying to access " << uri.toString() ; - + throw Exception(error_message.str(), ErrorCodes::TOO_MANY_REDIRECTS); } } \ No newline at end of file From dda57fe0f7860783a9046367f9f255aaecdffca5 Mon Sep 17 00:00:00 2001 From: l Date: Wed, 11 Sep 2019 23:08:54 +0200 Subject: [PATCH 082/309] Style checks --- dbms/src/IO/HTTPCommon.cpp | 4 ++-- dbms/src/IO/ReadWriteBufferFromHTTP.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dbms/src/IO/HTTPCommon.cpp b/dbms/src/IO/HTTPCommon.cpp index 299a080136d..eb48e544190 100644 --- a/dbms/src/IO/HTTPCommon.cpp +++ b/dbms/src/IO/HTTPCommon.cpp @@ -224,10 +224,10 @@ std::istream * receiveResponse( auto istr = &session.receiveResponse(response); auto status = response.getStatus(); - if ((request.getMethod() == Poco::Net::HTTPRequest::HTTP_GET) && (status == Poco::Net::HTTPResponse::HTTP_MOVED_PERMANENTLY || status == Poco::Net::HTTPResponse::HTTP_FOUND || status == Poco::Net::HTTPResponse::HTTP_SEE_OTHER || status == Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT) ) + if ((request.getMethod() == Poco::Net::HTTPRequest::HTTP_GET) && (status == Poco::Net::HTTPResponse::HTTP_MOVED_PERMANENTLY || status == Poco::Net::HTTPResponse::HTTP_FOUND || status == Poco::Net::HTTPResponse::HTTP_SEE_OTHER || status == Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT)) throw Poco::URIRedirection(response.get("Location")); - if (status != Poco::Net::HTTPResponse::HTTP_OK) + if (status != Poco::Net::HTTPResponse::HTTP_OK) { std::stringstream error_message; error_message << "Received error from remote server " << request.getURI() << ". HTTP status code: " << status << " " diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.cpp b/dbms/src/IO/ReadWriteBufferFromHTTP.cpp index 8e6d278d816..a53ad9957b0 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.cpp +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.cpp @@ -19,7 +19,7 @@ std::unique_ptr makeReadWriteBufferFromHTTP(const P auto actual_uri =uri; UInt64 redirects = 0; - do + do { try { From 2b5a420f49b353b886c6963e999a4134280cac58 Mon Sep 17 00:00:00 2001 From: l Date: Thu, 12 Sep 2019 07:41:16 +0200 Subject: [PATCH 083/309] error: no newline at end of file [-Werror,-Wnewline-eof] --- dbms/src/IO/ReadWriteBufferFromHTTP.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.cpp b/dbms/src/IO/ReadWriteBufferFromHTTP.cpp index a53ad9957b0..ec04de7802a 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.cpp +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.cpp @@ -38,4 +38,4 @@ std::unique_ptr makeReadWriteBufferFromHTTP(const P throw Exception(error_message.str(), ErrorCodes::TOO_MANY_REDIRECTS); } -} \ No newline at end of file +} From 599ff389f7c15a583ffd74eee1ea896a44413729 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Thu, 12 Sep 2019 11:57:55 +0000 Subject: [PATCH 084/309] Merge fix. 
--- dbms/src/Storages/StorageS3.cpp | 2 +- dbms/src/Storages/StorageS3.h | 3 +++ dbms/src/TableFunctions/TableFunctionS3.cpp | 4 ++-- dbms/src/TableFunctions/TableFunctionS3.h | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/dbms/src/Storages/StorageS3.cpp b/dbms/src/Storages/StorageS3.cpp index 23ad2b35f13..bbc478dc151 100644 --- a/dbms/src/Storages/StorageS3.cpp +++ b/dbms/src/Storages/StorageS3.cpp @@ -170,7 +170,7 @@ void registerStorageS3(StorageFactory & factory) String format_name = engine_args[1]->as().value.safeGet(); - return StorageS3::create(uri, args.table_name, format_name, args.columns, args.context); + return StorageS3::create(uri, args.database_name, args.table_name, format_name, args.columns, args.context); }); } } diff --git a/dbms/src/Storages/StorageS3.h b/dbms/src/Storages/StorageS3.h index ecc93f0f616..12f223fe0ca 100644 --- a/dbms/src/Storages/StorageS3.h +++ b/dbms/src/Storages/StorageS3.h @@ -16,6 +16,7 @@ class StorageS3 : public ext::shared_ptr_helper, public IStorage { public: StorageS3(const Poco::URI & uri_, + const std::string & database_name_, const std::string & table_name_, const String & format_name_, const ColumnsDescription & columns_, @@ -25,6 +26,7 @@ public: , uri(uri_) , context_global(context_) , format_name(format_name_) + , database_name(database_name_) , table_name(table_name_) { } @@ -61,6 +63,7 @@ protected: private: String format_name; + String database_name; String table_name; }; diff --git a/dbms/src/TableFunctions/TableFunctionS3.cpp b/dbms/src/TableFunctions/TableFunctionS3.cpp index 5c2c6215765..38ca0830e5b 100644 --- a/dbms/src/TableFunctions/TableFunctionS3.cpp +++ b/dbms/src/TableFunctions/TableFunctionS3.cpp @@ -6,10 +6,10 @@ namespace DB { StoragePtr TableFunctionS3::getStorage( - const String & source, const String & format, const Block & sample_block, Context & global_context) const + const String & source, const String & format, const ColumnsDescription & columns, Context & global_context, const std::string & table_name) const { Poco::URI uri(source); - return StorageS3::create(uri, getName(), format, ColumnsDescription{sample_block.getNamesAndTypesList()}, global_context); + return StorageS3::create(uri, getDatabaseName(), table_name, format, columns, global_context); } void registerTableFunctionS3(TableFunctionFactory & factory) diff --git a/dbms/src/TableFunctions/TableFunctionS3.h b/dbms/src/TableFunctions/TableFunctionS3.h index 04826a01d9b..a4966be13c7 100644 --- a/dbms/src/TableFunctions/TableFunctionS3.h +++ b/dbms/src/TableFunctions/TableFunctionS3.h @@ -20,6 +20,6 @@ public: private: StoragePtr getStorage( - const String & source, const String & format, const Block & sample_block, Context & global_context) const override; + const String & source, const String & format, const ColumnsDescription & columns, Context & global_context, const std::string & table_name) const override; }; } From 1b715069fb832c26267a7ff37c486171df0a43c3 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Thu, 12 Sep 2019 14:38:53 +0000 Subject: [PATCH 085/309] Merge fix. 
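For context, WriteBufferFromS3 talks the S3 multipart upload protocol that the mock server in the integration test emulates: initiate the upload, send the parts, then complete the upload with the collected ETags. A rough client-side sketch of that exchange in Python; the endpoint, bucket, payload and the fixed UploadId are made up for illustration, and a real server returns the UploadId and ETags inside its XML replies:

    import http.client

    conn = http.client.HTTPConnection('127.0.0.1', 12347)  # hypothetical endpoint

    # 1. Initiate the multipart upload; the reply normally carries an UploadId.
    conn.request('POST', '/abc/test.csv?uploads')
    conn.getresponse().read()
    upload_id = 'TEST'  # the test mock always answers with this id

    # 2. Upload a part; the ETag of the response is needed to finish the upload.
    conn.request('PUT', '/abc/test.csv?partNumber=1&uploadId=' + upload_id,
                 body=b'1,2,3\n3,2,1\n78,43,45\n')
    response = conn.getresponse()
    etag = response.getheader('ETag')
    response.read()

    # 3. Complete the upload by listing every (part number, ETag) pair.
    complete = ('<CompleteMultipartUpload><Part><PartNumber>1</PartNumber>'
                '<ETag>{}</ETag></Part></CompleteMultipartUpload>'.format(etag))
    conn.request('POST', '/abc/test.csv?uploadId=' + upload_id, body=complete)
    conn.getresponse().read()
    conn.close()
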
--- dbms/src/IO/WriteBufferFromS3.cpp | 5 +++++ dbms/src/Storages/StorageS3.cpp | 5 +++-- dbms/src/Storages/StorageS3.h | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index e48081d5609..4e574a11c0b 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -49,6 +49,9 @@ void WriteBufferFromS3::nextImpl() if (!offset()) return; + + LOG_TRACE((&Logger::get("WriteBufferFromS3")), "nextImpl(), offset() == " << offset()); + temporary_buffer->write(working_buffer.begin(), offset()); last_part_size += offset(); @@ -65,9 +68,11 @@ void WriteBufferFromS3::nextImpl() void WriteBufferFromS3::finalize() { + LOG_TRACE((&Logger::get("WriteBufferFromS3")), "finalize()"); temporary_buffer->finish(); if (!buffer_string.empty()) { + LOG_TRACE((&Logger::get("WriteBufferFromS3")), "finalize(), writing last part"); writePart(buffer_string); } diff --git a/dbms/src/Storages/StorageS3.cpp b/dbms/src/Storages/StorageS3.cpp index bbc478dc151..59b2ef589a9 100644 --- a/dbms/src/Storages/StorageS3.cpp +++ b/dbms/src/Storages/StorageS3.cpp @@ -140,9 +140,10 @@ BlockInputStreams StorageS3::read(const Names & column_names, return {std::make_shared(block_input, column_defaults, context)}; } -void StorageS3::rename(const String & /*new_path_to_db*/, const String & /*new_database_name*/, const String & /*new_table_name*/, TableStructureWriteLockHolder &) +void StorageS3::rename(const String & /*new_path_to_db*/, const String & new_database_name, const String & new_table_name, TableStructureWriteLockHolder &) { - // FIXME + table_name = new_table_name; + database_name = new_database_name; } BlockOutputStreamPtr StorageS3::write(const ASTPtr & /*query*/, const Context & /*context*/) diff --git a/dbms/src/Storages/StorageS3.h b/dbms/src/Storages/StorageS3.h index 12f223fe0ca..ad073aaa14c 100644 --- a/dbms/src/Storages/StorageS3.h +++ b/dbms/src/Storages/StorageS3.h @@ -29,6 +29,7 @@ public: , database_name(database_name_) , table_name(table_name_) { + setColumns(columns_); } String getName() const override From fa416dc94149cb891bc1ff2ac47a2a97d782f1ee Mon Sep 17 00:00:00 2001 From: CurtizJ Date: Thu, 12 Sep 2019 17:48:28 +0300 Subject: [PATCH 086/309] fix Redis dictionary --- dbms/src/Common/config.h.in | 12 +---- dbms/src/Core/config_core.h.in | 1 + .../Dictionaries/RedisBlockInputStream.cpp | 38 +++++++------ .../Dictionaries/RedisDictionarySource.cpp | 53 ++++++++++--------- 4 files changed, 52 insertions(+), 52 deletions(-) diff --git a/dbms/src/Common/config.h.in b/dbms/src/Common/config.h.in index 8630954f205..ad017d3bf6b 100644 --- a/dbms/src/Common/config.h.in +++ b/dbms/src/Common/config.h.in @@ -3,17 +3,6 @@ // .h autogenerated by cmake! 
#cmakedefine01 USE_RE2_ST -<<<<<<< HEAD -======= -#cmakedefine01 USE_VECTORCLASS -#cmakedefine01 USE_RDKAFKA -#cmakedefine01 USE_CAPNP -#cmakedefine01 USE_EMBEDDED_COMPILER -#cmakedefine01 USE_POCO_SQLODBC -#cmakedefine01 USE_POCO_DATAODBC -#cmakedefine01 USE_POCO_MONGODB -#cmakedefine01 USE_POCO_REDIS ->>>>>>> 102967015e8813129384dcd0f6e377e5b730f167 #cmakedefine01 USE_POCO_NETSSL #cmakedefine01 USE_HDFS #cmakedefine01 USE_CPUID @@ -21,3 +10,4 @@ #cmakedefine01 USE_BROTLI #cmakedefine01 USE_UNWIND #cmakedefine01 CLICKHOUSE_SPLIT_BINARY +#cmakedefine01 USE_POCO_REDIS diff --git a/dbms/src/Core/config_core.h.in b/dbms/src/Core/config_core.h.in index 840a96413df..15402294f83 100644 --- a/dbms/src/Core/config_core.h.in +++ b/dbms/src/Core/config_core.h.in @@ -9,6 +9,7 @@ #cmakedefine01 USE_POCO_SQLODBC #cmakedefine01 USE_POCO_DATAODBC #cmakedefine01 USE_POCO_MONGODB +#cmakedefine01 USE_POCO_REDIS #cmakedefine01 USE_INTERNAL_LLVM_LIBRARY #cmakedefine01 USE_SSL diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index 7f23a421989..31ae9162141 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -35,8 +35,8 @@ namespace DB const std::shared_ptr & client_, const Poco::Redis::Array & keys_, const DB::Block & sample_block, - const size_t max_block_size) - : client(client_), keys(keys_), max_block_size{max_block_size} + const size_t max_block_size_) + : client(client_), keys(keys_), max_block_size{max_block_size_} { description.init(sample_block); } @@ -87,46 +87,46 @@ namespace DB switch (type) { - case ValueType::UInt8: + case ValueType::vtUInt8: insert(column, stringValue); break; - case ValueType::UInt16: + case ValueType::vtUInt16: insert(column, stringValue); break; - case ValueType::UInt32: + case ValueType::vtUInt32: insert(column, stringValue); break; - case ValueType::UInt64: + case ValueType::vtUInt64: insert(column, stringValue); break; - case ValueType::Int8: + case ValueType::vtInt8: insert(column, stringValue); break; - case ValueType::Int16: + case ValueType::vtInt16: insert(column, stringValue); break; - case ValueType::Int32: + case ValueType::vtInt32: insert(column, stringValue); break; - case ValueType::Int64: + case ValueType::vtInt64: insert(column, stringValue); break; - case ValueType::Float32: + case ValueType::vtFloat32: insert(column, stringValue); break; - case ValueType::Float64: + case ValueType::vtFloat64: insert(column, stringValue); break; - case ValueType::String: + case ValueType::vtString: static_cast(column).insert(parse(stringValue)); break; - case ValueType::Date: + case ValueType::vtDate: static_cast(column).insertValue(parse(stringValue).getDayNum()); break; - case ValueType::DateTime: + case ValueType::vtDateTime: static_cast(column).insertValue(static_cast(parse(stringValue))); break; - case ValueType::UUID: + case ValueType::vtUUID: static_cast(column).insertValue(parse(stringValue)); break; } @@ -138,7 +138,7 @@ namespace DB Block RedisBlockInputStream::readImpl() { - if (description.sample_block.rows() == 0 || keys.size() == 0) + if (keys.isNull() || description.sample_block.rows() == 0 || keys.size() == 0) all_read = true; if (all_read) @@ -163,6 +163,8 @@ namespace DB insertValue(*columns[idx], description.types[idx].first, value, name); }; + std::cerr << "keys: " << keys.toString() << "\n"; + if (keys.begin()->get()->isArray()) { size_t num_rows = 0; @@ -199,6 +201,8 @@ namespace DB } ++cursor; + std::cerr << "Redis command: " << 
commandForValues.toString() << "\n"; + Poco::Redis::Array values = client->execute(commandForValues); if (keys_array.size() != values.size() + 1) // 'HMGET' primary_key secondary_keys throw Exception{"Inconsistent sizes of keys and values in Redis request", diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index 8def8abcf0e..fc1593b339e 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -61,18 +61,18 @@ namespace DB RedisDictionarySource::RedisDictionarySource( - const DictionaryStructure & dict_struct, - const std::string & host, - UInt16 port, - UInt8 db_index, - RedisStorageType::Id storage_type, - const Block & sample_block) - : dict_struct{dict_struct} - , host{host} - , port{port} - , db_index{db_index} - , storage_type{storage_type} - , sample_block{sample_block} + const DictionaryStructure & dict_struct_, + const std::string & host_, + UInt16 port_, + UInt8 db_index_, + RedisStorageType::Id storage_type_, + const Block & sample_block_) + : dict_struct{dict_struct_} + , host{host_} + , port{port_} + , db_index{db_index_} + , storage_type{storage_type_} + , sample_block{sample_block_} , client{std::make_shared(host, port)} { if (dict_struct.attributes.size() != 1) @@ -80,8 +80,8 @@ namespace DB DB::toString(dict_struct.attributes.size()) + ", expected 1", ErrorCodes::INVALID_CONFIG_PARAMETER}; - if (storage_type == RedisStorageType::HASH_MAP) { + if (storage_type == RedisStorageType::HASH_MAP) if (!dict_struct.key.has_value()) throw Exception{"Redis source with storage type \'hash_map\' must have key", ErrorCodes::INVALID_CONFIG_PARAMETER}; @@ -104,17 +104,17 @@ namespace DB RedisDictionarySource::RedisDictionarySource( - const DictionaryStructure & dict_struct, - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - Block & sample_block) + const DictionaryStructure & dict_struct_, + const Poco::Util::AbstractConfiguration & config_, + const std::string & config_prefix_, + Block & sample_block_) : RedisDictionarySource( - dict_struct, - config.getString(config_prefix + ".host"), - config.getUInt(config_prefix + ".port"), - config.getUInt(config_prefix + ".db_index", 0), - parseStorageType(config.getString(config_prefix + ".storage_type", "")), - sample_block) + dict_struct_, + config_.getString(config_prefix_ + ".host"), + config_.getUInt(config_prefix_ + ".port"), + config_.getUInt(config_prefix_ + ".db_index", 0), + parseStorageType(config_.getString(config_prefix_ + ".storage_type", "")), + sample_block_) { } @@ -140,11 +140,16 @@ namespace DB Poco::Redis::Array keys = client->execute(command_for_keys); - if (storage_type == RedisStorageType::HASH_MAP && dict_struct.key->size() == 2) + if (storage_type == RedisStorageType::HASH_MAP && !keys.isNull()) { Poco::Redis::Array hkeys; for (const auto & key : keys) { + Poco::Redis::Command command_for_type("TYPE"); + auto type_reply = client->execute(command_for_type.addRedisType(key)); + if (type_reply != "hash") + continue; + Poco::Redis::Command command_for_secondary_keys("HKEYS"); command_for_secondary_keys.addRedisType(key); From a54b43cd01952698f5786d807fd74138ca66ba1f Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Fri, 13 Sep 2019 13:04:21 +0300 Subject: [PATCH 087/309] Better test, minor fix. 
--- dbms/tests/integration/test_storage_s3/test.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/dbms/tests/integration/test_storage_s3/test.py b/dbms/tests/integration/test_storage_s3/test.py index 57b86e74c64..73ad752a5a8 100644 --- a/dbms/tests/integration/test_storage_s3/test.py +++ b/dbms/tests/integration/test_storage_s3/test.py @@ -46,14 +46,7 @@ received_data_completed = False def test_sophisticated_default(started_cluster): instance = started_cluster.instances['dummy'] - def GetCurrentIP(): - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - s.connect(("8.8.8.8", 80)) - ip = s.getsockname()[0] - s.close() - return ip - - localhost = GetCurrentIP() + localhost = 'localhost' def GetFreeTCPPorts(n): result = [] From b3242612854cef4d1efe28371b1cd72792ddc1c8 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Fri, 13 Sep 2019 10:18:09 +0000 Subject: [PATCH 088/309] Minor test fix. --- dbms/tests/integration/test_storage_s3/test.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/dbms/tests/integration/test_storage_s3/test.py b/dbms/tests/integration/test_storage_s3/test.py index 73ad752a5a8..68765f4a6df 100644 --- a/dbms/tests/integration/test_storage_s3/test.py +++ b/dbms/tests/integration/test_storage_s3/test.py @@ -46,25 +46,23 @@ received_data_completed = False def test_sophisticated_default(started_cluster): instance = started_cluster.instances['dummy'] - localhost = 'localhost' - - def GetFreeTCPPorts(n): + def GetFreeTCPPortsAndIP(n): result = [] sockets = [] for i in range(n): tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - tcp.bind((localhost, 0)) + tcp.bind(('localhost', 0)) addr, port = tcp.getsockname() result.append(port) sockets.append(tcp) [ s.close() for s in sockets ] - return result + return result, addr format = 'column1 UInt32, column2 UInt32, column3 UInt32' values = '(1, 2, 3), (3, 2, 1), (78, 43, 45)' other_values = '(1, 1, 1), (1, 1, 1), (11, 11, 11)' + (redirecting_to_http_port, redirecting_to_https_port, preserving_data_port, redirecting_preserving_data_port), localhost = GetFreeTCPPortsAndIP(4) redirecting_host = localhost - redirecting_to_http_port, redirecting_to_https_port, preserving_data_port, redirecting_preserving_data_port = GetFreeTCPPorts(4) bucket = 'abc' def run_query(query): From 2cddcebc312c100d179d842bcbbb0dfde34bbfd6 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Fri, 13 Sep 2019 13:17:58 +0000 Subject: [PATCH 089/309] Moved server to clickhouse instance in test. 
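The test and the in-container mock server now exchange configuration through a small JSON file: test_server.py writes the ports it picked for the mock endpoints, and the test polls that file until the content parses. The inline retry loops added below could later be folded into a helper along these lines (a sketch built on the existing exec_in_container call; the helper name and defaults are ours):

    import json
    import time

    def wait_for_mock_config(instance, path, attempts=10, delay=0.5):
        # Poll the JSON file written by test_server.py inside the container
        # until it parses; give up after the configured number of attempts.
        for _ in range(attempts):
            try:
                return json.loads(instance.exec_in_container(['cat', path]))
            except Exception:
                time.sleep(delay)
        raise AssertionError('Could not initialize mock server')
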
--- .../tests/integration/test_storage_s3/test.py | 262 +++--------------- .../test_storage_s3/test_server.py | 250 +++++++++++++++++ 2 files changed, 285 insertions(+), 227 deletions(-) create mode 100644 dbms/tests/integration/test_storage_s3/test_server.py diff --git a/dbms/tests/integration/test_storage_s3/test.py b/dbms/tests/integration/test_storage_s3/test.py index 68765f4a6df..9cedc5b8b4f 100644 --- a/dbms/tests/integration/test_storage_s3/test.py +++ b/dbms/tests/integration/test_storage_s3/test.py @@ -14,54 +14,36 @@ def started_cluster(): cluster.shutdown() +import json import os -import socket -import subprocess -import sys -import tempfile -import threading import time -try: - import urllib.parse as urlparse -except ImportError: - import urlparse - -try: - from BaseHTTPServer import BaseHTTPRequestHandler -except ImportError: - from http.server import BaseHTTPRequestHandler - -try: - from BaseHTTPServer import HTTPServer -except ImportError: - from http.server import HTTPServer - - -received_data = [] -received_data_completed = False - - def test_sophisticated_default(started_cluster): instance = started_cluster.instances['dummy'] - - def GetFreeTCPPortsAndIP(n): - result = [] - sockets = [] - for i in range(n): - tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - tcp.bind(('localhost', 0)) - addr, port = tcp.getsockname() - result.append(port) - sockets.append(tcp) - [ s.close() for s in sockets ] - return result, addr + instance.copy_file_to_container(os.path.join(os.path.dirname(__file__), 'test_server.py'), 'test_server.py') + communication_path = '/test_sophisticated_default' + instance.exec_in_container(['python', 'test_server.py', communication_path], detach=True) format = 'column1 UInt32, column2 UInt32, column3 UInt32' values = '(1, 2, 3), (3, 2, 1), (78, 43, 45)' other_values = '(1, 1, 1), (1, 1, 1), (11, 11, 11)' - (redirecting_to_http_port, redirecting_to_https_port, preserving_data_port, redirecting_preserving_data_port), localhost = GetFreeTCPPortsAndIP(4) + for i in range(10): + try: + raw = instance.exec_in_container(['cat', communication_path]) + data = json.loads(instance.exec_in_container(['cat', communication_path])) + redirecting_to_http_port = data['redirecting_to_http_port'] + redirecting_to_https_port = data['redirecting_to_https_port'] + preserving_data_port = data['preserving_data_port'] + redirecting_preserving_data_port = data['redirecting_preserving_data_port'] + localhost = data['localhost'] + except: + time.sleep(0.5) + else: + break + else: + assert False, 'Could not initialize mock server' + str(raw) + redirecting_host = localhost bucket = 'abc' @@ -88,189 +70,6 @@ def test_sophisticated_default(started_cluster): "select *, column1*column2*column3 from s3('http://{}:{}/{}/test.csv', 'CSV', '{}')".format(localhost, preserving_data_port, bucket, format), ] - - class RedirectingToHTTPHandler(BaseHTTPRequestHandler): - def do_GET(self): - self.send_response(307) - self.send_header('Content-type', 'text/xml') - self.send_header('Location', 'http://storage.yandexcloud.net/milovidov/test.csv') - self.end_headers() - self.wfile.write(r''' - - TemporaryRedirect - Please re-send this request to the specified temporary endpoint. - Continue to use the original request endpoint for future requests. 
- storage.yandexcloud.net - '''.encode()) - self.finish() - - - class RedirectingToHTTPSHandler(BaseHTTPRequestHandler): - def do_GET(self): - self.send_response(307) - self.send_header('Content-type', 'text/xml') - self.send_header('Location', 'https://storage.yandexcloud.net/milovidov/test.csv') - self.end_headers() - self.wfile.write(r''' - - TemporaryRedirect - Please re-send this request to the specified temporary endpoint. - Continue to use the original request endpoint for future requests. - storage.yandexcloud.net - '''.encode()) - self.finish() - - - class PreservingDataHandler(BaseHTTPRequestHandler): - protocol_version = 'HTTP/1.1' - - def parse_request(self): - result = BaseHTTPRequestHandler.parse_request(self) - # Adaptation to Python 3. - if sys.version_info.major == 2 and result == True: - expect = self.headers.get('Expect', "") - if (expect.lower() == "100-continue" and self.protocol_version >= "HTTP/1.1" and self.request_version >= "HTTP/1.1"): - if not self.handle_expect_100(): - return False - return result - - def send_response_only(self, code, message=None): - if message is None: - if code in self.responses: - message = self.responses[code][0] - else: - message = '' - if self.request_version != 'HTTP/0.9': - self.wfile.write("%s %d %s\r\n" % (self.protocol_version, code, message)) - - def handle_expect_100(self): - print('Received Expect-100') - self.send_response_only(100) - self.end_headers() - return True - - def do_POST(self): - self.send_response(200) - query = urlparse.urlparse(self.path).query - print('POST', query) - if query == 'uploads': - data = r''' - TEST'''.encode() - self.send_header('Content-length', str(len(data))) - self.send_header('Content-type', 'text/plain') - self.end_headers() - self.wfile.write(data) - else: - data = self.rfile.read(int(self.headers.get('Content-Length'))) - assert query == 'uploadId=TEST' - assert data == b'1hello-etag' - self.send_header('Content-type', 'text/plain') - self.end_headers() - global received_data_completed - received_data_completed = True - self.finish() - - def do_PUT(self): - self.send_response(200) - self.send_header('Content-type', 'text/plain') - self.send_header('ETag', 'hello-etag') - self.end_headers() - query = urlparse.urlparse(self.path).query - path = urlparse.urlparse(self.path).path - print('Content-Length =', self.headers.get('Content-Length')) - print('PUT', query) - assert self.headers.get('Content-Length') - assert self.headers['Expect'] == '100-continue' - data = self.rfile.read() - received_data.append(data) - print('PUT to {}'.format(path)) - self.server.storage[path] = data - self.finish() - - def do_GET(self): - path = urlparse.urlparse(self.path).path - if path in self.server.storage: - self.send_response(200) - self.send_header('Content-type', 'text/plain') - self.send_header('Content-length', str(len(self.server.storage[path]))) - self.end_headers() - self.wfile.write(self.server.storage[path]) - else: - self.send_response(404) - self.end_headers() - self.finish() - - - class RedirectingPreservingDataHandler(BaseHTTPRequestHandler): - protocol_version = 'HTTP/1.1' - - def parse_request(self): - result = BaseHTTPRequestHandler.parse_request(self) - # Adaptation to Python 3. 
- if sys.version_info.major == 2 and result == True: - expect = self.headers.get('Expect', "") - if (expect.lower() == "100-continue" and self.protocol_version >= "HTTP/1.1" and self.request_version >= "HTTP/1.1"): - if not self.handle_expect_100(): - return False - return result - - def send_response_only(self, code, message=None): - if message is None: - if code in self.responses: - message = self.responses[code][0] - else: - message = '' - if self.request_version != 'HTTP/0.9': - self.wfile.write("%s %d %s\r\n" % (self.protocol_version, code, message)) - - def handle_expect_100(self): - print('Received Expect-100') - return True - - def do_POST(self): - query = urlparse.urlparse(self.path).query - if query: - query = '?{}'.format(query) - self.send_response(307) - self.send_header('Content-type', 'text/xml') - self.send_header('Location', 'http://{host}:{port}/{bucket}/test.csv{query}'.format(host=localhost, port=preserving_data_port, bucket=bucket, query=query)) - self.end_headers() - self.wfile.write(r''' - - TemporaryRedirect - Please re-send this request to the specified temporary endpoint. - Continue to use the original request endpoint for future requests. - {host}:{port} - '''.format(host=localhost, port=preserving_data_port).encode()) - self.finish() - - def do_PUT(self): - query = urlparse.urlparse(self.path).query - if query: - query = '?{}'.format(query) - self.send_response(307) - self.send_header('Content-type', 'text/xml') - self.send_header('Location', 'http://{host}:{port}/{bucket}/test.csv{query}'.format(host=localhost, port=preserving_data_port, bucket=bucket, query=query)) - self.end_headers() - self.wfile.write(r''' - - TemporaryRedirect - Please re-send this request to the specified temporary endpoint. - Continue to use the original request endpoint for future requests. 
- {host}:{port} - '''.format(host=localhost, port=preserving_data_port).encode()) - self.finish() - - - servers = [] - servers.append(HTTPServer((redirecting_host, redirecting_to_https_port), RedirectingToHTTPSHandler)) - servers.append(HTTPServer((redirecting_host, redirecting_to_http_port), RedirectingToHTTPHandler)) - servers.append(HTTPServer((redirecting_host, preserving_data_port), PreservingDataHandler)) - servers[-1].storage = {} - servers.append(HTTPServer((redirecting_host, redirecting_preserving_data_port), RedirectingPreservingDataHandler)) - jobs = [ threading.Thread(target=server.serve_forever) for server in servers ] - [ job.start() for job in jobs ] - try: print('Phase 1') for query in prepare_put_queries: @@ -287,11 +86,24 @@ def test_sophisticated_default(started_cluster): print('Phase 3') query = put_query - global received_data_completed - received_data_completed = False run_query(query) + for i in range(10): + try: + data = json.loads(instance.exec_in_container(['cat', communication_path])) + received_data_completed = data['received_data_completed'] + received_data = data['received_data'] + finalize_data = data['finalize_data'] + finalize_data_query = data['finalize_data_query'] + except: + time.sleep(0.5) + else: + break + else: + assert False, 'Could not read data from mock server'+str(data) assert received_data[-1].decode() == '1,2,3\n3,2,1\n78,43,45\n' assert received_data_completed + assert finalize_data == '1hello-etag' + assert finalize_data_query == 'uploadId=TEST' print('Phase 4') query = redirect_put_query @@ -307,8 +119,4 @@ def test_sophisticated_default(started_cluster): ] finally: - print('Shutting down') - [ server.shutdown() for server in servers ] - print('Joining threads') - [ job.join() for job in jobs ] print('Done') diff --git a/dbms/tests/integration/test_storage_s3/test_server.py b/dbms/tests/integration/test_storage_s3/test_server.py new file mode 100644 index 00000000000..9b2ac3bdb60 --- /dev/null +++ b/dbms/tests/integration/test_storage_s3/test_server.py @@ -0,0 +1,250 @@ +try: + from BaseHTTPServer import BaseHTTPRequestHandler +except ImportError: + from http.server import BaseHTTPRequestHandler + +try: + from BaseHTTPServer import HTTPServer +except ImportError: + from http.server import HTTPServer + +try: + import urllib.parse as urlparse +except ImportError: + import urlparse + +import json +import logging +import os +import socket +import sys +import threading +import time + + +logging.getLogger().setLevel(logging.INFO) +file_handler = logging.FileHandler('/var/log/clickhouse-server/test-server.log', 'a', encoding='utf-8') +file_handler.setFormatter(logging.Formatter('%(asctime)s %(message)s')) +logging.getLogger().addHandler(file_handler) +logging.getLogger().addHandler(logging.StreamHandler()) + +comm_path = sys.argv[1] + +def GetFreeTCPPortsAndIP(n): + result = [] + sockets = [] + for i in range(n): + tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + tcp.bind((socket.gethostname(), 0)) + addr, port = tcp.getsockname() + result.append(port) + sockets.append(tcp) + [ s.close() for s in sockets ] + return result, addr + +(redirecting_to_http_port, redirecting_to_https_port, preserving_data_port, redirecting_preserving_data_port), localhost = GetFreeTCPPortsAndIP(4) +data = { + 'redirecting_to_http_port': redirecting_to_http_port, + 'redirecting_to_https_port': redirecting_to_https_port, + 'preserving_data_port': preserving_data_port, + 'redirecting_preserving_data_port': redirecting_preserving_data_port, + 'localhost': localhost 
+} +redirecting_host = localhost + +with open(comm_path, 'w') as f: + f.write(json.dumps(data)) + + +class RedirectingToHTTPHandler(BaseHTTPRequestHandler): + def do_GET(self): + self.send_response(307) + self.send_header('Content-type', 'text/xml') + self.send_header('Location', 'http://storage.yandexcloud.net/milovidov/test.csv') + self.end_headers() + self.wfile.write(r''' + + TemporaryRedirect + Please re-send this request to the specified temporary endpoint. + Continue to use the original request endpoint for future requests. + storage.yandexcloud.net +'''.encode()) + self.finish() + + +class RedirectingToHTTPSHandler(BaseHTTPRequestHandler): + def do_GET(self): + self.send_response(307) + self.send_header('Content-type', 'text/xml') + self.send_header('Location', 'https://storage.yandexcloud.net/milovidov/test.csv') + self.end_headers() + self.wfile.write(r''' + + TemporaryRedirect + Please re-send this request to the specified temporary endpoint. + Continue to use the original request endpoint for future requests. + storage.yandexcloud.net +'''.encode()) + self.finish() + + +class PreservingDataHandler(BaseHTTPRequestHandler): + protocol_version = 'HTTP/1.1' + + def parse_request(self): + result = BaseHTTPRequestHandler.parse_request(self) + # Adaptation to Python 3. + if sys.version_info.major == 2 and result == True: + expect = self.headers.get('Expect', "") + if (expect.lower() == "100-continue" and self.protocol_version >= "HTTP/1.1" and self.request_version >= "HTTP/1.1"): + if not self.handle_expect_100(): + return False + return result + + def send_response_only(self, code, message=None): + if message is None: + if code in self.responses: + message = self.responses[code][0] + else: + message = '' + if self.request_version != 'HTTP/0.9': + self.wfile.write("%s %d %s\r\n" % (self.protocol_version, code, message)) + + def handle_expect_100(self): + logging.info('Received Expect-100') + self.send_response_only(100) + self.end_headers() + return True + + def do_POST(self): + self.send_response(200) + query = urlparse.urlparse(self.path).query + logging.info('POST ' + query) + if query == 'uploads': + post_data = r''' +TEST'''.encode() + self.send_header('Content-length', str(len(post_data))) + self.send_header('Content-type', 'text/plain') + self.end_headers() + self.wfile.write(post_data) + else: + post_data = self.rfile.read(int(self.headers.get('Content-Length'))) + self.send_header('Content-type', 'text/plain') + self.end_headers() + data['received_data_completed'] = True + data['finalize_data'] = post_data + data['finalize_data_query'] = query + with open(comm_path, 'w') as f: + f.write(json.dumps(data)) + self.finish() + + def do_PUT(self): + self.send_response(200) + self.send_header('Content-type', 'text/plain') + self.send_header('ETag', 'hello-etag') + self.end_headers() + query = urlparse.urlparse(self.path).query + path = urlparse.urlparse(self.path).path + logging.info('Content-Length = ' + self.headers.get('Content-Length')) + logging.info('PUT ' + query) + assert self.headers.get('Content-Length') + assert self.headers['Expect'] == '100-continue' + put_data = self.rfile.read() + data.setdefault('received_data', []).append(put_data) + with open(comm_path, 'w') as f: + f.write(json.dumps(data)) + logging.info('PUT to {}'.format(path)) + self.server.storage[path] = put_data + self.finish() + + def do_GET(self): + path = urlparse.urlparse(self.path).path + if path in self.server.storage: + self.send_response(200) + self.send_header('Content-type', 'text/plain') + 
self.send_header('Content-length', str(len(self.server.storage[path]))) + self.end_headers() + self.wfile.write(self.server.storage[path]) + else: + self.send_response(404) + self.end_headers() + self.finish() + + +class RedirectingPreservingDataHandler(BaseHTTPRequestHandler): + protocol_version = 'HTTP/1.1' + + def parse_request(self): + result = BaseHTTPRequestHandler.parse_request(self) + # Adaptation to Python 3. + if sys.version_info.major == 2 and result == True: + expect = self.headers.get('Expect', "") + if (expect.lower() == "100-continue" and self.protocol_version >= "HTTP/1.1" and self.request_version >= "HTTP/1.1"): + if not self.handle_expect_100(): + return False + return result + + def send_response_only(self, code, message=None): + if message is None: + if code in self.responses: + message = self.responses[code][0] + else: + message = '' + if self.request_version != 'HTTP/0.9': + self.wfile.write("%s %d %s\r\n" % (self.protocol_version, code, message)) + + def handle_expect_100(self): + logging.info('Received Expect-100') + return True + + def do_POST(self): + query = urlparse.urlparse(self.path).query + if query: + query = '?{}'.format(query) + self.send_response(307) + self.send_header('Content-type', 'text/xml') + self.send_header('Location', 'http://{host}:{port}/{bucket}/test.csv{query}'.format(host=localhost, port=preserving_data_port, bucket=bucket, query=query)) + self.end_headers() + self.wfile.write(r''' + + TemporaryRedirect + Please re-send this request to the specified temporary endpoint. + Continue to use the original request endpoint for future requests. + {host}:{port} +'''.format(host=localhost, port=preserving_data_port).encode()) + self.finish() + + def do_PUT(self): + query = urlparse.urlparse(self.path).query + if query: + query = '?{}'.format(query) + self.send_response(307) + self.send_header('Content-type', 'text/xml') + self.send_header('Location', 'http://{host}:{port}/{bucket}/test.csv{query}'.format(host=localhost, port=preserving_data_port, bucket=bucket, query=query)) + self.end_headers() + self.wfile.write(r''' + + TemporaryRedirect + Please re-send this request to the specified temporary endpoint. + Continue to use the original request endpoint for future requests. + {host}:{port} +'''.format(host=localhost, port=preserving_data_port).encode()) + self.finish() + + +servers = [] +servers.append(HTTPServer((redirecting_host, redirecting_to_https_port), RedirectingToHTTPSHandler)) +servers.append(HTTPServer((redirecting_host, redirecting_to_http_port), RedirectingToHTTPHandler)) +servers.append(HTTPServer((redirecting_host, preserving_data_port), PreservingDataHandler)) +servers[-1].storage = {} +servers.append(HTTPServer((redirecting_host, redirecting_preserving_data_port), RedirectingPreservingDataHandler)) +jobs = [ threading.Thread(target=server.serve_forever) for server in servers ] +[ job.start() for job in jobs ] + +time.sleep(60) # Timeout + +logging.info('Shutting down') +[ server.shutdown() for server in servers ] +logging.info('Joining threads') +[ job.join() for job in jobs ] +logging.info('Done') From 4406ad10611071c12c3d869204b0a41052983660 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Fri, 13 Sep 2019 13:53:17 +0000 Subject: [PATCH 090/309] Tests fix. 
--- dbms/tests/integration/test_storage_s3/test.py | 7 +++++-- dbms/tests/integration/test_storage_s3/test_server.py | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/dbms/tests/integration/test_storage_s3/test.py b/dbms/tests/integration/test_storage_s3/test.py index 9cedc5b8b4f..c383a2a7bea 100644 --- a/dbms/tests/integration/test_storage_s3/test.py +++ b/dbms/tests/integration/test_storage_s3/test.py @@ -23,7 +23,8 @@ def test_sophisticated_default(started_cluster): instance = started_cluster.instances['dummy'] instance.copy_file_to_container(os.path.join(os.path.dirname(__file__), 'test_server.py'), 'test_server.py') communication_path = '/test_sophisticated_default' - instance.exec_in_container(['python', 'test_server.py', communication_path], detach=True) + bucket = 'abc' + instance.exec_in_container(['python', 'test_server.py', communication_path, bucket], detach=True) format = 'column1 UInt32, column2 UInt32, column3 UInt32' values = '(1, 2, 3), (3, 2, 1), (78, 43, 45)' @@ -45,7 +46,6 @@ def test_sophisticated_default(started_cluster): assert False, 'Could not initialize mock server' + str(raw) redirecting_host = localhost - bucket = 'abc' def run_query(query): print('Running query "{}"...'.format(query)) @@ -117,6 +117,9 @@ def test_sophisticated_default(started_cluster): ['1', '1', '1', '1'], ['11', '11', '11', '1331'], ] + # FIXME check result + + # FIXME tests for multipart finally: print('Done') diff --git a/dbms/tests/integration/test_storage_s3/test_server.py b/dbms/tests/integration/test_storage_s3/test_server.py index 9b2ac3bdb60..aed5996212b 100644 --- a/dbms/tests/integration/test_storage_s3/test_server.py +++ b/dbms/tests/integration/test_storage_s3/test_server.py @@ -29,6 +29,7 @@ logging.getLogger().addHandler(file_handler) logging.getLogger().addHandler(logging.StreamHandler()) comm_path = sys.argv[1] +bucket = sys.argv[2] def GetFreeTCPPortsAndIP(n): result = [] From 393bf8a804efa9cff972c277eeb20941af819576 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Fri, 30 Aug 2019 16:28:02 +0300 Subject: [PATCH 091/309] =?UTF-8?q?cmake/find=5F*=20=E2=86=92=20cmake/find?= =?UTF-8?q?/*?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 78 +++++++++---------- ...contrib_lib.cmake => contrib_finder.cmake} | 0 cmake/default_libs.cmake | 4 +- .../{find_base64.cmake => find/base64.cmake} | 0 cmake/{find_boost.cmake => find/boost.cmake} | 0 .../{find_brotli.cmake => find/brotli.cmake} | 0 cmake/{find_capnp.cmake => find/capnp.cmake} | 0 .../{find_ccache.cmake => find/ccache.cmake} | 0 .../consistent-hashing.cmake} | 0 cmake/{find_cpuid.cmake => find/cpuid.cmake} | 0 .../cpuinfo.cmake} | 0 cmake/{find_cxx.cmake => find/cxx.cmake} | 0 .../execinfo.cmake} | 0 .../fastops.cmake} | 0 cmake/{find_gperf.cmake => find/gperf.cmake} | 0 cmake/{find_gtest.cmake => find/gtest.cmake} | 0 cmake/{find_h3.cmake => find/h3.cmake} | 0 cmake/{find_hdfs3.cmake => find/hdfs3.cmake} | 0 .../hyperscan.cmake} | 0 cmake/{find_icu.cmake => find/icu.cmake} | 0 .../libgsasl.cmake} | 0 .../libxml2.cmake} | 0 cmake/{find_llvm.cmake => find/llvm.cmake} | 0 cmake/{find_ltdl.cmake => find/ltdl.cmake} | 0 cmake/{find_lz4.cmake => find/lz4.cmake} | 0 cmake/{find_odbc.cmake => find/odbc.cmake} | 0 cmake/{find_orc.cmake => find/orc.cmake} | 0 .../parquet.cmake} | 2 +- .../pdqsort.cmake} | 0 cmake/{find_poco.cmake => find/poco.cmake} | 0 .../protobuf.cmake} | 0 .../rapidjson.cmake} | 0 .../rdkafka.cmake} | 0 cmake/{find_re2.cmake => 
find/re2.cmake} | 0 .../readline_edit.cmake} | 0 cmake/{find_rt.cmake => find/rt.cmake} | 0 .../simdjson.cmake} | 0 .../{find_snappy.cmake => find/snappy.cmake} | 0 .../sparsehash.cmake} | 0 cmake/{find_ssl.cmake => find/ssl.cmake} | 0 .../termcap.cmake} | 0 .../{find_unwind.cmake => find/unwind.cmake} | 0 .../{find_xxhash.cmake => find/xxhash.cmake} | 0 cmake/{find_zlib.cmake => find/zlib.cmake} | 0 cmake/{find_zstd.cmake => find/zstd.cmake} | 0 45 files changed, 42 insertions(+), 42 deletions(-) rename cmake/{find_contrib_lib.cmake => contrib_finder.cmake} (100%) rename cmake/{find_base64.cmake => find/base64.cmake} (100%) rename cmake/{find_boost.cmake => find/boost.cmake} (100%) rename cmake/{find_brotli.cmake => find/brotli.cmake} (100%) rename cmake/{find_capnp.cmake => find/capnp.cmake} (100%) rename cmake/{find_ccache.cmake => find/ccache.cmake} (100%) rename cmake/{find_consistent-hashing.cmake => find/consistent-hashing.cmake} (100%) rename cmake/{find_cpuid.cmake => find/cpuid.cmake} (100%) rename cmake/{find_cpuinfo.cmake => find/cpuinfo.cmake} (100%) rename cmake/{find_cxx.cmake => find/cxx.cmake} (100%) rename cmake/{find_execinfo.cmake => find/execinfo.cmake} (100%) rename cmake/{find_fastops.cmake => find/fastops.cmake} (100%) rename cmake/{find_gperf.cmake => find/gperf.cmake} (100%) rename cmake/{find_gtest.cmake => find/gtest.cmake} (100%) rename cmake/{find_h3.cmake => find/h3.cmake} (100%) rename cmake/{find_hdfs3.cmake => find/hdfs3.cmake} (100%) rename cmake/{find_hyperscan.cmake => find/hyperscan.cmake} (100%) rename cmake/{find_icu.cmake => find/icu.cmake} (100%) rename cmake/{find_libgsasl.cmake => find/libgsasl.cmake} (100%) rename cmake/{find_libxml2.cmake => find/libxml2.cmake} (100%) rename cmake/{find_llvm.cmake => find/llvm.cmake} (100%) rename cmake/{find_ltdl.cmake => find/ltdl.cmake} (100%) rename cmake/{find_lz4.cmake => find/lz4.cmake} (100%) rename cmake/{find_odbc.cmake => find/odbc.cmake} (100%) rename cmake/{find_orc.cmake => find/orc.cmake} (100%) rename cmake/{find_parquet.cmake => find/parquet.cmake} (98%) rename cmake/{find_pdqsort.cmake => find/pdqsort.cmake} (100%) rename cmake/{find_poco.cmake => find/poco.cmake} (100%) rename cmake/{find_protobuf.cmake => find/protobuf.cmake} (100%) rename cmake/{find_rapidjson.cmake => find/rapidjson.cmake} (100%) rename cmake/{find_rdkafka.cmake => find/rdkafka.cmake} (100%) rename cmake/{find_re2.cmake => find/re2.cmake} (100%) rename cmake/{find_readline_edit.cmake => find/readline_edit.cmake} (100%) rename cmake/{find_rt.cmake => find/rt.cmake} (100%) rename cmake/{find_simdjson.cmake => find/simdjson.cmake} (100%) rename cmake/{find_snappy.cmake => find/snappy.cmake} (100%) rename cmake/{find_sparsehash.cmake => find/sparsehash.cmake} (100%) rename cmake/{find_ssl.cmake => find/ssl.cmake} (100%) rename cmake/{find_termcap.cmake => find/termcap.cmake} (100%) rename cmake/{find_unwind.cmake => find/unwind.cmake} (100%) rename cmake/{find_xxhash.cmake => find/xxhash.cmake} (100%) rename cmake/{find_zlib.cmake => find/zlib.cmake} (100%) rename cmake/{find_zstd.cmake => find/zstd.cmake} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1086b00ddde..5dce3f0390e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,7 +62,7 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git" AND NOT EXISTS "${ClickHouse_SOURC message (FATAL_ERROR "Submodules are not initialized. 
Run\n\tgit submodule update --init --recursive") endif () -include (cmake/find_ccache.cmake) +include (cmake/find/ccache.cmake) if (NOT CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE STREQUAL "None") message (STATUS "CMAKE_BUILD_TYPE is not set, set to default = RELWITHDEBINFO") @@ -303,49 +303,49 @@ endif () message (STATUS "Building for: ${CMAKE_SYSTEM} ${CMAKE_SYSTEM_PROCESSOR} ${CMAKE_LIBRARY_ARCHITECTURE} ; USE_STATIC_LIBRARIES=${USE_STATIC_LIBRARIES} MAKE_STATIC_LIBRARIES=${MAKE_STATIC_LIBRARIES} SPLIT_SHARED=${SPLIT_SHARED_LIBRARIES} UNBUNDLED=${UNBUNDLED} CCACHE=${CCACHE_FOUND} ${CCACHE_VERSION}") include(GNUInstallDirs) -include (cmake/find_contrib_lib.cmake) +include (cmake/contrib_finder.cmake) include (cmake/lib_name.cmake) find_contrib_lib(double-conversion) # Must be before parquet -include (cmake/find_ssl.cmake) -include (cmake/find_icu.cmake) -include (cmake/find_boost.cmake) -include (cmake/find_zlib.cmake) -include (cmake/find_zstd.cmake) -include (cmake/find_ltdl.cmake) # for odbc -include (cmake/find_termcap.cmake) -include (cmake/find_odbc.cmake) +include (cmake/find/ssl.cmake) +include (cmake/find/icu.cmake) +include (cmake/find/boost.cmake) +include (cmake/find/zlib.cmake) +include (cmake/find/zstd.cmake) +include (cmake/find/ltdl.cmake) # for odbc +include (cmake/find/termcap.cmake) +include (cmake/find/odbc.cmake) # openssl, zlib, odbc before poco -include (cmake/find_poco.cmake) -include (cmake/find_lz4.cmake) -include (cmake/find_xxhash.cmake) -include (cmake/find_sparsehash.cmake) -include (cmake/find_rt.cmake) -include (cmake/find_execinfo.cmake) -include (cmake/find_readline_edit.cmake) -include (cmake/find_re2.cmake) -include (cmake/find_libgsasl.cmake) -include (cmake/find_rdkafka.cmake) -include (cmake/find_capnp.cmake) -include (cmake/find_llvm.cmake) -include (cmake/find_h3.cmake) -include (cmake/find_cpuid.cmake) # Freebsd, bundled +include (cmake/find/poco.cmake) +include (cmake/find/lz4.cmake) +include (cmake/find/xxhash.cmake) +include (cmake/find/sparsehash.cmake) +include (cmake/find/rt.cmake) +include (cmake/find/execinfo.cmake) +include (cmake/find/readline_edit.cmake) +include (cmake/find/re2.cmake) +include (cmake/find/libgsasl.cmake) +include (cmake/find/rdkafka.cmake) +include (cmake/find/capnp.cmake) +include (cmake/find/llvm.cmake) +include (cmake/find/h3.cmake) +include (cmake/find/cpuid.cmake) # Freebsd, bundled if (NOT USE_CPUID) - include (cmake/find_cpuinfo.cmake) # Debian + include (cmake/find/cpuinfo.cmake) # Debian endif() -include (cmake/find_libxml2.cmake) -include (cmake/find_brotli.cmake) -include (cmake/find_protobuf.cmake) -include (cmake/find_pdqsort.cmake) -include (cmake/find_hdfs3.cmake) # uses protobuf -include (cmake/find_consistent-hashing.cmake) -include (cmake/find_base64.cmake) -include (cmake/find_parquet.cmake) -include (cmake/find_hyperscan.cmake) -include (cmake/find_simdjson.cmake) -include (cmake/find_rapidjson.cmake) -include (cmake/find_fastops.cmake) -#include (cmake/find_orc.cmake) +include (cmake/find/libxml2.cmake) +include (cmake/find/brotli.cmake) +include (cmake/find/protobuf.cmake) +include (cmake/find/pdqsort.cmake) +include (cmake/find/hdfs3.cmake) # uses protobuf +include (cmake/find/consistent-hashing.cmake) +include (cmake/find/base64.cmake) +include (cmake/find/parquet.cmake) +include (cmake/find/hyperscan.cmake) +include (cmake/find/simdjson.cmake) +include (cmake/find/rapidjson.cmake) +include (cmake/find/fastops.cmake) +include (cmake/find/orc.cmake) find_contrib_lib(cityhash) find_contrib_lib(farmhash) 
@@ -353,7 +353,7 @@ find_contrib_lib(metrohash) find_contrib_lib(btrie) if (ENABLE_TESTS) - include (cmake/find_gtest.cmake) + include (cmake/find/gtest.cmake) endif () # Need to process before "contrib" dir: diff --git a/cmake/find_contrib_lib.cmake b/cmake/contrib_finder.cmake similarity index 100% rename from cmake/find_contrib_lib.cmake rename to cmake/contrib_finder.cmake diff --git a/cmake/default_libs.cmake b/cmake/default_libs.cmake index 54a01042558..0c2bedafda8 100644 --- a/cmake/default_libs.cmake +++ b/cmake/default_libs.cmake @@ -30,8 +30,8 @@ set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) add_subdirectory(libs/libglibc-compatibility) -include (cmake/find_unwind.cmake) -include (cmake/find_cxx.cmake) +include (cmake/find/unwind.cmake) +include (cmake/find/cxx.cmake) add_library(global-group INTERFACE) target_link_libraries(global-group INTERFACE diff --git a/cmake/find_base64.cmake b/cmake/find/base64.cmake similarity index 100% rename from cmake/find_base64.cmake rename to cmake/find/base64.cmake diff --git a/cmake/find_boost.cmake b/cmake/find/boost.cmake similarity index 100% rename from cmake/find_boost.cmake rename to cmake/find/boost.cmake diff --git a/cmake/find_brotli.cmake b/cmake/find/brotli.cmake similarity index 100% rename from cmake/find_brotli.cmake rename to cmake/find/brotli.cmake diff --git a/cmake/find_capnp.cmake b/cmake/find/capnp.cmake similarity index 100% rename from cmake/find_capnp.cmake rename to cmake/find/capnp.cmake diff --git a/cmake/find_ccache.cmake b/cmake/find/ccache.cmake similarity index 100% rename from cmake/find_ccache.cmake rename to cmake/find/ccache.cmake diff --git a/cmake/find_consistent-hashing.cmake b/cmake/find/consistent-hashing.cmake similarity index 100% rename from cmake/find_consistent-hashing.cmake rename to cmake/find/consistent-hashing.cmake diff --git a/cmake/find_cpuid.cmake b/cmake/find/cpuid.cmake similarity index 100% rename from cmake/find_cpuid.cmake rename to cmake/find/cpuid.cmake diff --git a/cmake/find_cpuinfo.cmake b/cmake/find/cpuinfo.cmake similarity index 100% rename from cmake/find_cpuinfo.cmake rename to cmake/find/cpuinfo.cmake diff --git a/cmake/find_cxx.cmake b/cmake/find/cxx.cmake similarity index 100% rename from cmake/find_cxx.cmake rename to cmake/find/cxx.cmake diff --git a/cmake/find_execinfo.cmake b/cmake/find/execinfo.cmake similarity index 100% rename from cmake/find_execinfo.cmake rename to cmake/find/execinfo.cmake diff --git a/cmake/find_fastops.cmake b/cmake/find/fastops.cmake similarity index 100% rename from cmake/find_fastops.cmake rename to cmake/find/fastops.cmake diff --git a/cmake/find_gperf.cmake b/cmake/find/gperf.cmake similarity index 100% rename from cmake/find_gperf.cmake rename to cmake/find/gperf.cmake diff --git a/cmake/find_gtest.cmake b/cmake/find/gtest.cmake similarity index 100% rename from cmake/find_gtest.cmake rename to cmake/find/gtest.cmake diff --git a/cmake/find_h3.cmake b/cmake/find/h3.cmake similarity index 100% rename from cmake/find_h3.cmake rename to cmake/find/h3.cmake diff --git a/cmake/find_hdfs3.cmake b/cmake/find/hdfs3.cmake similarity index 100% rename from cmake/find_hdfs3.cmake rename to cmake/find/hdfs3.cmake diff --git a/cmake/find_hyperscan.cmake b/cmake/find/hyperscan.cmake similarity index 100% rename from cmake/find_hyperscan.cmake rename to cmake/find/hyperscan.cmake diff --git a/cmake/find_icu.cmake b/cmake/find/icu.cmake similarity index 100% rename from cmake/find_icu.cmake rename to cmake/find/icu.cmake diff --git 
a/cmake/find_libgsasl.cmake b/cmake/find/libgsasl.cmake similarity index 100% rename from cmake/find_libgsasl.cmake rename to cmake/find/libgsasl.cmake diff --git a/cmake/find_libxml2.cmake b/cmake/find/libxml2.cmake similarity index 100% rename from cmake/find_libxml2.cmake rename to cmake/find/libxml2.cmake diff --git a/cmake/find_llvm.cmake b/cmake/find/llvm.cmake similarity index 100% rename from cmake/find_llvm.cmake rename to cmake/find/llvm.cmake diff --git a/cmake/find_ltdl.cmake b/cmake/find/ltdl.cmake similarity index 100% rename from cmake/find_ltdl.cmake rename to cmake/find/ltdl.cmake diff --git a/cmake/find_lz4.cmake b/cmake/find/lz4.cmake similarity index 100% rename from cmake/find_lz4.cmake rename to cmake/find/lz4.cmake diff --git a/cmake/find_odbc.cmake b/cmake/find/odbc.cmake similarity index 100% rename from cmake/find_odbc.cmake rename to cmake/find/odbc.cmake diff --git a/cmake/find_orc.cmake b/cmake/find/orc.cmake similarity index 100% rename from cmake/find_orc.cmake rename to cmake/find/orc.cmake diff --git a/cmake/find_parquet.cmake b/cmake/find/parquet.cmake similarity index 98% rename from cmake/find_parquet.cmake rename to cmake/find/parquet.cmake index 5c5bc664113..77ce38b255e 100644 --- a/cmake/find_parquet.cmake +++ b/cmake/find/parquet.cmake @@ -21,7 +21,7 @@ endif() if(ARROW_INCLUDE_DIR AND PARQUET_INCLUDE_DIR) elseif(NOT MISSING_INTERNAL_PARQUET_LIBRARY AND NOT OS_FREEBSD) - include(cmake/find_snappy.cmake) + include(cmake/find/snappy.cmake) set(CAN_USE_INTERNAL_PARQUET_LIBRARY 1) include(CheckCXXSourceCompiles) if(NOT USE_INTERNAL_DOUBLE_CONVERSION_LIBRARY) diff --git a/cmake/find_pdqsort.cmake b/cmake/find/pdqsort.cmake similarity index 100% rename from cmake/find_pdqsort.cmake rename to cmake/find/pdqsort.cmake diff --git a/cmake/find_poco.cmake b/cmake/find/poco.cmake similarity index 100% rename from cmake/find_poco.cmake rename to cmake/find/poco.cmake diff --git a/cmake/find_protobuf.cmake b/cmake/find/protobuf.cmake similarity index 100% rename from cmake/find_protobuf.cmake rename to cmake/find/protobuf.cmake diff --git a/cmake/find_rapidjson.cmake b/cmake/find/rapidjson.cmake similarity index 100% rename from cmake/find_rapidjson.cmake rename to cmake/find/rapidjson.cmake diff --git a/cmake/find_rdkafka.cmake b/cmake/find/rdkafka.cmake similarity index 100% rename from cmake/find_rdkafka.cmake rename to cmake/find/rdkafka.cmake diff --git a/cmake/find_re2.cmake b/cmake/find/re2.cmake similarity index 100% rename from cmake/find_re2.cmake rename to cmake/find/re2.cmake diff --git a/cmake/find_readline_edit.cmake b/cmake/find/readline_edit.cmake similarity index 100% rename from cmake/find_readline_edit.cmake rename to cmake/find/readline_edit.cmake diff --git a/cmake/find_rt.cmake b/cmake/find/rt.cmake similarity index 100% rename from cmake/find_rt.cmake rename to cmake/find/rt.cmake diff --git a/cmake/find_simdjson.cmake b/cmake/find/simdjson.cmake similarity index 100% rename from cmake/find_simdjson.cmake rename to cmake/find/simdjson.cmake diff --git a/cmake/find_snappy.cmake b/cmake/find/snappy.cmake similarity index 100% rename from cmake/find_snappy.cmake rename to cmake/find/snappy.cmake diff --git a/cmake/find_sparsehash.cmake b/cmake/find/sparsehash.cmake similarity index 100% rename from cmake/find_sparsehash.cmake rename to cmake/find/sparsehash.cmake diff --git a/cmake/find_ssl.cmake b/cmake/find/ssl.cmake similarity index 100% rename from cmake/find_ssl.cmake rename to cmake/find/ssl.cmake diff --git a/cmake/find_termcap.cmake 
b/cmake/find/termcap.cmake similarity index 100% rename from cmake/find_termcap.cmake rename to cmake/find/termcap.cmake diff --git a/cmake/find_unwind.cmake b/cmake/find/unwind.cmake similarity index 100% rename from cmake/find_unwind.cmake rename to cmake/find/unwind.cmake diff --git a/cmake/find_xxhash.cmake b/cmake/find/xxhash.cmake similarity index 100% rename from cmake/find_xxhash.cmake rename to cmake/find/xxhash.cmake diff --git a/cmake/find_zlib.cmake b/cmake/find/zlib.cmake similarity index 100% rename from cmake/find_zlib.cmake rename to cmake/find/zlib.cmake diff --git a/cmake/find_zstd.cmake b/cmake/find/zstd.cmake similarity index 100% rename from cmake/find_zstd.cmake rename to cmake/find/zstd.cmake From ac47427b76b52d2e5041ec32ba136abbbcb78bf1 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Fri, 30 Aug 2019 17:31:28 +0300 Subject: [PATCH 092/309] =?UTF-8?q?cmake/test=5Fcpu=20=E2=86=92=20cmake/cp?= =?UTF-8?q?u=5Ffeatures?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 2 +- cmake/{test_cpu.cmake => cpu_features.cmake} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cmake/{test_cpu.cmake => cpu_features.cmake} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5dce3f0390e..3befdc8cd64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -166,7 +166,7 @@ else() endif() if(NOT DISABLE_CPU_OPTIMIZE) - include(cmake/test_cpu.cmake) + include(cmake/cpu_features.cmake) endif() if(NOT COMPILER_CLANG) # clang: error: the clang compiler does not support '-march=native' diff --git a/cmake/test_cpu.cmake b/cmake/cpu_features.cmake similarity index 100% rename from cmake/test_cpu.cmake rename to cmake/cpu_features.cmake From f01706ad1187affdd656e3acad17700a7b21f01b Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Fri, 30 Aug 2019 19:50:43 +0300 Subject: [PATCH 093/309] Refactor some cmake files --- CMakeLists.txt | 38 +++---------------- cmake/arch.cmake | 13 ------- cmake/{ => linux}/default_libs.cmake | 23 ++++++------ cmake/target.cmake | 56 ++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+), 58 deletions(-) rename cmake/{ => linux}/default_libs.cmake (60%) create mode 100644 cmake/target.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 3befdc8cd64..0491dea8786 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,8 @@ foreach(policy endif() endforeach() +include (cmake/target.cmake) + # Ignore export() since we don't use it, # but it gets broken with a global targets via link_libraries() macro (export) @@ -41,22 +43,6 @@ else() message(STATUS "IPO/LTO not enabled.") endif() -if (COMPILER_GCC) - # Require minimum version of gcc - set (GCC_MINIMUM_VERSION 8) - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${GCC_MINIMUM_VERSION} AND NOT CMAKE_VERSION VERSION_LESS 2.8.9) - message (FATAL_ERROR "GCC version must be at least ${GCC_MINIMUM_VERSION}. For example, if GCC ${GCC_MINIMUM_VERSION} is available under gcc-${GCC_MINIMUM_VERSION}, g++-${GCC_MINIMUM_VERSION} names, do the following: export CC=gcc-${GCC_MINIMUM_VERSION} CXX=g++-${GCC_MINIMUM_VERSION}; rm -rf CMakeCache.txt CMakeFiles; and re run cmake or ./release.") - endif () -elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - # Require minimum version of clang - set (CLANG_MINIMUM_VERSION 7) - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${CLANG_MINIMUM_VERSION}) - message (FATAL_ERROR "Clang version must be at least ${CLANG_MINIMUM_VERSION}.") - endif () -else () - message (WARNING "You are using an unsupported compiler. 
Compilation has only been tested with Clang 6+ and GCC 7+.") -endif () - # Check that submodules are present only if source was downloaded with git if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git" AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/boost/boost") message (FATAL_ERROR "Submodules are not initialized. Run\n\tgit submodule update --init --recursive") @@ -136,22 +122,6 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "amd64|x86_64") endif () endif () -string(REGEX MATCH "-?[0-9]+(.[0-9]+)?$" COMPILER_POSTFIX ${CMAKE_CXX_COMPILER}) - -find_program (LLD_PATH NAMES "lld${COMPILER_POSTFIX}" "lld") -find_program (GOLD_PATH NAMES "gold") - -if (COMPILER_CLANG AND LLD_PATH AND NOT LINKER_NAME) - set (LINKER_NAME "lld") -elseif (GOLD_PATH) - set (LINKER_NAME "gold") -endif () - -if (LINKER_NAME) - message(STATUS "Using linker: ${LINKER_NAME} (selected from: LLD_PATH=${LLD_PATH}; GOLD_PATH=${GOLD_PATH}; COMPILER_POSTFIX=${COMPILER_POSTFIX})") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}") -endif () - # Make sure the final executable has symbols exported set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic") @@ -233,7 +203,9 @@ if (NOT SANITIZE) endif () include(cmake/dbms_glob_sources.cmake) -include(cmake/default_libs.cmake) +if (OS_LINUX) + include(cmake/linux/default_libs.cmake) +endif () ###################################### ### Add targets below this comment ### diff --git a/cmake/arch.cmake b/cmake/arch.cmake index f8e18629b09..deaa7a36eb4 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -12,19 +12,6 @@ if ((ARCH_ARM AND NOT ARCH_AARCH64) OR ARCH_I386) message (FATAL_ERROR "32bit platforms are not supported") endif () -if (CMAKE_SYSTEM MATCHES "Linux") - set (OS_LINUX 1) -endif () -if (CMAKE_SYSTEM MATCHES "FreeBSD") - set (OS_FREEBSD 1) -endif () - -if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - set (COMPILER_GCC 1) -elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set (COMPILER_CLANG 1) -endif () - if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le.*|PPC64LE.*)") set (ARCH_PPC64LE 1) if (COMPILER_CLANG OR (COMPILER_GCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)) diff --git a/cmake/default_libs.cmake b/cmake/linux/default_libs.cmake similarity index 60% rename from cmake/default_libs.cmake rename to cmake/linux/default_libs.cmake index 0c2bedafda8..d1f4730c97e 100644 --- a/cmake/default_libs.cmake +++ b/cmake/linux/default_libs.cmake @@ -3,20 +3,18 @@ set (DEFAULT_LIBS "-nodefaultlibs") -if (OS_LINUX) - # We need builtins from Clang's RT even without libcxx - for ubsan+int128. - # See https://bugs.llvm.org/show_bug.cgi?id=16404 - if (COMPILER_CLANG) - execute_process (COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libclang_rt.builtins-${CMAKE_SYSTEM_PROCESSOR}.a OUTPUT_VARIABLE BUILTINS_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE) - else () - set (BUILTINS_LIBRARY "-lgcc") - endif () - - set (DEFAULT_LIBS "${DEFAULT_LIBS} ${BUILTINS_LIBRARY} ${COVERAGE_OPTION} -lc -lm -lrt -lpthread -ldl") - - message(STATUS "Default libraries: ${DEFAULT_LIBS}") +# We need builtins from Clang's RT even without libcxx - for ubsan+int128. 
+# See https://bugs.llvm.org/show_bug.cgi?id=16404 +if (COMPILER_CLANG) + execute_process (COMMAND ${CMAKE_CXX_COMPILER} --print-file-name=libclang_rt.builtins-${CMAKE_SYSTEM_PROCESSOR}.a OUTPUT_VARIABLE BUILTINS_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE) +else () + set (BUILTINS_LIBRARY "-lgcc") endif () +set (DEFAULT_LIBS "${DEFAULT_LIBS} ${BUILTINS_LIBRARY} ${COVERAGE_OPTION} -lc -lm -lrt -lpthread -ldl") + +message(STATUS "Default libraries: ${DEFAULT_LIBS}") + set(CMAKE_CXX_STANDARD_LIBRARIES ${DEFAULT_LIBS}) set(CMAKE_C_STANDARD_LIBRARIES ${DEFAULT_LIBS}) @@ -42,6 +40,7 @@ target_link_libraries(global-group INTERFACE link_libraries(global-group) +# FIXME: remove when all contribs will get custom cmake lists install( TARGETS global-group global-libs EXPORT global diff --git a/cmake/target.cmake b/cmake/target.cmake new file mode 100644 index 00000000000..8bbdce87109 --- /dev/null +++ b/cmake/target.cmake @@ -0,0 +1,56 @@ +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + set (OS_LINUX 1) +elseif (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + set (OS_FREEBSD 1) +elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin") + set (OS_DARWIN 1) +endif () + +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set (COMPILER_GCC 1) +elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set (COMPILER_CLANG 1) +endif () + +if (COMPILER_GCC) + # Require minimum version of gcc + set (GCC_MINIMUM_VERSION 8) + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${GCC_MINIMUM_VERSION} AND NOT CMAKE_VERSION VERSION_LESS 2.8.9) + message (FATAL_ERROR "GCC version must be at least ${GCC_MINIMUM_VERSION}. For example, if GCC ${GCC_MINIMUM_VERSION} is available under gcc-${GCC_MINIMUM_VERSION}, g++-${GCC_MINIMUM_VERSION} names, do the following: export CC=gcc-${GCC_MINIMUM_VERSION} CXX=g++-${GCC_MINIMUM_VERSION}; rm -rf CMakeCache.txt CMakeFiles; and re run cmake or ./release.") + endif () +elseif (COMPILER_CLANG) + # Require minimum version of clang + set (CLANG_MINIMUM_VERSION 7) + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS ${CLANG_MINIMUM_VERSION}) + message (FATAL_ERROR "Clang version must be at least ${CLANG_MINIMUM_VERSION}.") + endif () +else () + message (WARNING "You are using an unsupported compiler. Compilation has only been tested with Clang 6+ and GCC 7+.") +endif () + +string(REGEX MATCH "-?[0-9]+(.[0-9]+)?$" COMPILER_POSTFIX ${CMAKE_CXX_COMPILER}) + +find_program (LLD_PATH NAMES "lld${COMPILER_POSTFIX}" "lld") +find_program (GOLD_PATH NAMES "gold") + +if (COMPILER_CLANG AND LLD_PATH AND NOT LINKER_NAME) + set (LINKER_NAME "lld") +elseif (GOLD_PATH) + set (LINKER_NAME "gold") +endif () + +if (LINKER_NAME) + message(STATUS "Using linker: ${LINKER_NAME} (selected from: LLD_PATH=${LLD_PATH}; GOLD_PATH=${GOLD_PATH}; COMPILER_POSTFIX=${COMPILER_POSTFIX})") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}") +endif () + +if (CMAKE_CROSSCOMPILING) + if (NOT COMPILER_CLANG OR NOT LINKER_NAME MATCHES "lld") + message (FATAL "Cross-compilation supported only for Clang compiler and LLD linker") + endif () + + if (CMAKE_SYSTEM_NAME MATCHES "Darwin") + set (CMAKE_C_COMPILER_TARGET x86_64-apple-darwin) + set (CMAKE_CXX_COMPILER_TARGET x86_64-apple-darwin) + endif () +endif () From 1b5192706e39746e9485bac569c051bbb654c823 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Wed, 4 Sep 2019 18:17:18 +0300 Subject: [PATCH 094/309] Some fixes for OS X build. 
--- CMakeLists.txt | 2 +- dbms/src/Common/StackTrace.h | 3 ++- libs/libcommon/CMakeLists.txt | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0491dea8786..91ccbcb2406 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -198,7 +198,7 @@ if (UNBUNDLED OR NOT (OS_LINUX OR APPLE) OR ARCH_32) endif () # Make this extra-checks for correct library dependencies. -if (NOT SANITIZE) +if (OS_LINUX AND NOT SANITIZE) set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--no-undefined") endif () diff --git a/dbms/src/Common/StackTrace.h b/dbms/src/Common/StackTrace.h index b0aa49bce5e..13147587a19 100644 --- a/dbms/src/Common/StackTrace.h +++ b/dbms/src/Common/StackTrace.h @@ -9,7 +9,8 @@ #ifdef __APPLE__ // ucontext is not available without _XOPEN_SOURCE -#define _XOPEN_SOURCE 700 +# pragma clang diagnostic ignored "-Wreserved-id-macro" +# define _XOPEN_SOURCE 700 #endif #include diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index 79e3d1fda80..cf1144124e8 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -84,7 +84,7 @@ elseif (USE_TCMALLOC) endif () elseif (SANITIZE) message (STATUS "Will use ${SANITIZE} sanitizer.") -else () +elseif (OS_LINUX) message (WARNING "Non default allocator is disabled. This is not recommended for production Linux builds.") endif () From e692a88196c921a5d578afd7487aa8012809cf21 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Mon, 9 Sep 2019 21:19:43 +0300 Subject: [PATCH 095/309] CMake now works Example: -DCMAKE_SYSTEM_NAME=Darwin -DSDK_PATH=${HOME}/stuff/MacOSX10.12.sdk -DLINKER_NAME=${HOME}/.local/bin/ld64 --- CMakeLists.txt | 3 +++ cmake/darwin/default_libs.cmake | 39 +++++++++++++++++++++++++++++++++ cmake/darwin/sdk.cmake | 11 ++++++++++ cmake/find/cxx.cmake | 16 +------------- cmake/target.cmake | 28 ++++++++++++++--------- dbms/src/Common/StackTrace.cpp | 1 + 6 files changed, 73 insertions(+), 25 deletions(-) create mode 100644 cmake/darwin/default_libs.cmake create mode 100644 cmake/darwin/sdk.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 91ccbcb2406..ccfe81567ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -205,6 +205,9 @@ endif () include(cmake/dbms_glob_sources.cmake) if (OS_LINUX) include(cmake/linux/default_libs.cmake) +elseif (OS_DARWIN) + include(cmake/darwin/sdk.cmake) + include(cmake/darwin/default_libs.cmake) endif () ###################################### diff --git a/cmake/darwin/default_libs.cmake b/cmake/darwin/default_libs.cmake new file mode 100644 index 00000000000..b0bb8e5e84d --- /dev/null +++ b/cmake/darwin/default_libs.cmake @@ -0,0 +1,39 @@ +set (DEFAULT_LIBS "-nodefaultlibs") + +if (NOT COMPILER_CLANG) + message (FATAL_ERROR "Darwin build is supported only for Clang") +endif () + +set (DEFAULT_LIBS "${DEFAULT_LIBS} ${COVERAGE_OPTION} -lc -lm -lrt -lpthread -ldl") + +message(STATUS "Default libraries: ${DEFAULT_LIBS}") + +set(CMAKE_CXX_STANDARD_LIBRARIES ${DEFAULT_LIBS}) +set(CMAKE_C_STANDARD_LIBRARIES ${DEFAULT_LIBS}) + +# Global libraries + +add_library(global-libs INTERFACE) + +# Unfortunately '-pthread' doesn't work with '-nodefaultlibs'. +# Just make sure we have pthreads at all. 
+set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) + +include (cmake/find/cxx.cmake) + +add_library(global-group INTERFACE) + +target_link_libraries(global-group INTERFACE + -Wl,--start-group + $ + -Wl,--end-group +) + +link_libraries(global-group) + +# FIXME: remove when all contribs will get custom cmake lists +install( + TARGETS global-group global-libs + EXPORT global +) diff --git a/cmake/darwin/sdk.cmake b/cmake/darwin/sdk.cmake new file mode 100644 index 00000000000..382b3a31ba2 --- /dev/null +++ b/cmake/darwin/sdk.cmake @@ -0,0 +1,11 @@ +option (SDK_PATH "Path to the SDK to build with" "") + +if (NOT EXISTS "${SDK_PATH}/SDKSettings.plist") + message (FATAL_ERROR "Wrong SDK path provided: ${SDK_PATH}") +endif () + +set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -isysroot ${SDK_PATH}") +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -isysroot ${SDK_PATH}") + +set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -isysroot ${SDK_PATH}") +set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -isysroot ${SDK_PATH}") diff --git a/cmake/find/cxx.cmake b/cmake/find/cxx.cmake index f84a76183ec..4f2430228d4 100644 --- a/cmake/find/cxx.cmake +++ b/cmake/find/cxx.cmake @@ -1,25 +1,11 @@ -if (OS_LINUX AND COMPILER_CLANG) +if (COMPILER_CLANG) option (USE_LIBCXX "Use libc++ and libc++abi instead of libstdc++" ON) option (USE_INTERNAL_LIBCXX_LIBRARY "Set to FALSE to use system libcxx and libcxxabi libraries instead of bundled" ${NOT_UNBUNDLED}) endif() if (USE_LIBCXX) set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_LIBCPP_DEBUG=0") # More checks in debug build. -endif () -# FIXME: make better check for submodule presence -if (USE_INTERNAL_LIBCXX_LIBRARY AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libcxx/include/vector") - message (WARNING "submodule contrib/libcxx is missing. to fix try run: \n git submodule update --init --recursive") - set (USE_INTERNAL_LIBCXX_LIBRARY 0) -endif () - -# FIXME: make better check for submodule presence -if (USE_INTERNAL_LIBCXX_LIBRARY AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/libcxxabi/src") - message (WARNING "submodule contrib/libcxxabi is missing. 
to fix try run: \n git submodule update --init --recursive") - set (USE_INTERNAL_LIBCXX_LIBRARY 0) -endif () - -if (USE_LIBCXX) if (NOT USE_INTERNAL_LIBCXX_LIBRARY) find_library (LIBCXX_LIBRARY c++) find_library (LIBCXXFS_LIBRARY c++fs) diff --git a/cmake/target.cmake b/cmake/target.cmake index 8bbdce87109..be235a26ce3 100644 --- a/cmake/target.cmake +++ b/cmake/target.cmake @@ -30,13 +30,18 @@ endif () string(REGEX MATCH "-?[0-9]+(.[0-9]+)?$" COMPILER_POSTFIX ${CMAKE_CXX_COMPILER}) -find_program (LLD_PATH NAMES "lld${COMPILER_POSTFIX}" "lld") -find_program (GOLD_PATH NAMES "gold") +if (OS_LINUX) + find_program (LLD_PATH NAMES "lld${COMPILER_POSTFIX}" "lld") + find_program (GOLD_PATH NAMES "gold") +endif() -if (COMPILER_CLANG AND LLD_PATH AND NOT LINKER_NAME) - set (LINKER_NAME "lld") -elseif (GOLD_PATH) - set (LINKER_NAME "gold") +option (LINKER_NAME "Linker name or full path") +if (NOT LINKER_NAME) + if (COMPILER_CLANG AND LLD_PATH) + set (LINKER_NAME "lld") + elseif (GOLD_PATH) + set (LINKER_NAME "gold") + endif () endif () if (LINKER_NAME) @@ -45,12 +50,15 @@ if (LINKER_NAME) endif () if (CMAKE_CROSSCOMPILING) - if (NOT COMPILER_CLANG OR NOT LINKER_NAME MATCHES "lld") - message (FATAL "Cross-compilation supported only for Clang compiler and LLD linker") - endif () - if (CMAKE_SYSTEM_NAME MATCHES "Darwin") + set (CMAKE_SYSTEM_PROCESSOR x86_64) set (CMAKE_C_COMPILER_TARGET x86_64-apple-darwin) set (CMAKE_CXX_COMPILER_TARGET x86_64-apple-darwin) + + set (HAS_PRE_1970_EXITCODE "0" CACHE STRING "Result from TRY_RUN" FORCE) + set (HAS_PRE_1970_EXITCODE__TRYRUN_OUTPUT "" CACHE STRING "Output from TRY_RUN" FORCE) + + set( HAS_POST_2038_EXITCODE "0" CACHE STRING "Result from TRY_RUN" FORCE) + set( HAS_POST_2038_EXITCODE__TRYRUN_OUTPUT "" CACHE STRING "Output from TRY_RUN" FORCE) endif () endif () diff --git a/dbms/src/Common/StackTrace.cpp b/dbms/src/Common/StackTrace.cpp index 9981d0941aa..9694e33a2dd 100644 --- a/dbms/src/Common/StackTrace.cpp +++ b/dbms/src/Common/StackTrace.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include From 5b60053ef7c5edaeb27fb9be7b61dc7262b024e4 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Wed, 11 Sep 2019 13:01:38 +0300 Subject: [PATCH 096/309] [WIP] --- CMakeLists.txt | 14 ++++-------- cmake/darwin/default_libs.cmake | 4 +--- cmake/darwin/sdk.cmake | 8 +++---- cmake/find/cxx.cmake | 5 +++++ cmake/find/snappy.cmake | 34 +++++++++--------------------- cmake/target.cmake | 3 +++ dbms/src/Common/Allocator.h | 5 ++++- dbms/src/Common/TraceCollector.cpp | 2 +- dbms/src/Common/new_delete.cpp | 4 ++++ 9 files changed, 36 insertions(+), 43 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ccfe81567ea..b2949ea1b42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -147,16 +147,9 @@ if (ARCH_NATIVE) set (COMPILER_FLAGS "${COMPILER_FLAGS} -march=native") endif () -if (CMAKE_VERSION VERSION_LESS "3.8.0") - if (NOT MSVC) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") - endif () -else () - set (CMAKE_CXX_STANDARD 17) - set (CMAKE_CXX_EXTENSIONS 0) # https://cmake.org/cmake/help/latest/prop_tgt/CXX_EXTENSIONS.html#prop_tgt:CXX_EXTENSIONS - set (CMAKE_CXX_STANDARD_REQUIRED ON) - set (CXX_FLAGS_INTERNAL_COMPILER "-std=c++17") -endif () +set (CMAKE_CXX_STANDARD 17) +set (CMAKE_CXX_EXTENSIONS 0) # https://cmake.org/cmake/help/latest/prop_tgt/CXX_EXTENSIONS.html#prop_tgt:CXX_EXTENSIONS +set (CMAKE_CXX_STANDARD_REQUIRED ON) if (COMPILER_GCC OR COMPILER_CLANG) # Enable C++14 sized global deallocation functions. 
It should be enabled by setting -std=c++14 but I'm not sure. @@ -203,6 +196,7 @@ if (OS_LINUX AND NOT SANITIZE) endif () include(cmake/dbms_glob_sources.cmake) + if (OS_LINUX) include(cmake/linux/default_libs.cmake) elseif (OS_DARWIN) diff --git a/cmake/darwin/default_libs.cmake b/cmake/darwin/default_libs.cmake index b0bb8e5e84d..e684a10a08f 100644 --- a/cmake/darwin/default_libs.cmake +++ b/cmake/darwin/default_libs.cmake @@ -4,7 +4,7 @@ if (NOT COMPILER_CLANG) message (FATAL_ERROR "Darwin build is supported only for Clang") endif () -set (DEFAULT_LIBS "${DEFAULT_LIBS} ${COVERAGE_OPTION} -lc -lm -lrt -lpthread -ldl") +set (DEFAULT_LIBS "${DEFAULT_LIBS} ${COVERAGE_OPTION} -lc -lm -lpthread -ldl") message(STATUS "Default libraries: ${DEFAULT_LIBS}") @@ -25,9 +25,7 @@ include (cmake/find/cxx.cmake) add_library(global-group INTERFACE) target_link_libraries(global-group INTERFACE - -Wl,--start-group $ - -Wl,--end-group ) link_libraries(global-group) diff --git a/cmake/darwin/sdk.cmake b/cmake/darwin/sdk.cmake index 382b3a31ba2..92a9f8f66e5 100644 --- a/cmake/darwin/sdk.cmake +++ b/cmake/darwin/sdk.cmake @@ -4,8 +4,8 @@ if (NOT EXISTS "${SDK_PATH}/SDKSettings.plist") message (FATAL_ERROR "Wrong SDK path provided: ${SDK_PATH}") endif () -set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -isysroot ${SDK_PATH}") -set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -isysroot ${SDK_PATH}") +set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -isysroot ${SDK_PATH} -mmacosx-version-min=10.14") +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -isysroot ${SDK_PATH} -mmacosx-version-min=10.14") -set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -isysroot ${SDK_PATH}") -set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -isysroot ${SDK_PATH}") +set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -isysroot ${SDK_PATH} -mmacosx-version-min=10.14") +set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -isysroot ${SDK_PATH} -mmacosx-version-min=10.14") diff --git a/cmake/find/cxx.cmake b/cmake/find/cxx.cmake index 4f2430228d4..9a00269cfc6 100644 --- a/cmake/find/cxx.cmake +++ b/cmake/find/cxx.cmake @@ -6,6 +6,11 @@ endif() if (USE_LIBCXX) set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_LIBCPP_DEBUG=0") # More checks in debug build. + if (OS_DARWIN) + # Use libcxx from SDK + set (USE_INTERNAL_LIBCXX_LIBRARY OFF) + endif () + if (NOT USE_INTERNAL_LIBCXX_LIBRARY) find_library (LIBCXX_LIBRARY c++) find_library (LIBCXXFS_LIBRARY c++fs) diff --git a/cmake/find/snappy.cmake b/cmake/find/snappy.cmake index 1098bbc3e53..a39139ee363 100644 --- a/cmake/find/snappy.cmake +++ b/cmake/find/snappy.cmake @@ -1,27 +1,13 @@ -option(USE_INTERNAL_SNAPPY_LIBRARY "Set to FALSE to use system snappy library instead of bundled" ${NOT_UNBUNDLED}) +option(USE_SNAPPY "Enable support of snappy library" ON) -if(NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/snappy/snappy.h") - if(USE_INTERNAL_SNAPPY_LIBRARY) - message(WARNING "submodule contrib/snappy is missing. 
to fix try run: \n git submodule update --init --recursive") - set(USE_INTERNAL_SNAPPY_LIBRARY 0) +if (USE_SNAPPY) + option (USE_INTERNAL_SNAPPY_LIBRARY "Set to FALSE to use system snappy library instead of bundled" ${NOT_UNBUNDLED}) + + if(NOT USE_INTERNAL_SNAPPY_LIBRARY) + find_library(SNAPPY_LIBRARY snappy) + else () + set(SNAPPY_LIBRARY snappy) endif() - set(MISSING_INTERNAL_SNAPPY_LIBRARY 1) -endif() -if(NOT USE_INTERNAL_SNAPPY_LIBRARY) - find_library(SNAPPY_LIBRARY snappy) - find_path(SNAPPY_INCLUDE_DIR NAMES snappy.h PATHS ${SNAPPY_INCLUDE_PATHS}) -endif() - -if(SNAPPY_LIBRARY AND SNAPPY_INCLUDE_DIR) -elseif(NOT MISSING_INTERNAL_SNAPPY_LIBRARY) - set(SNAPPY_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/snappy) - set(USE_INTERNAL_SNAPPY_LIBRARY 1) - set(SNAPPY_LIBRARY snappy) -endif() - -if(SNAPPY_LIBRARY AND SNAPPY_INCLUDE_DIR) - set(USE_SNAPPY 1) -endif() - -message(STATUS "Using snappy=${USE_SNAPPY}: ${SNAPPY_INCLUDE_DIR} : ${SNAPPY_LIBRARY}") + message (STATUS "Using snappy: ${SNAPPY_LIBRARY}") +endif () diff --git a/cmake/target.cmake b/cmake/target.cmake index be235a26ce3..85cd1cd8357 100644 --- a/cmake/target.cmake +++ b/cmake/target.cmake @@ -61,4 +61,7 @@ if (CMAKE_CROSSCOMPILING) set( HAS_POST_2038_EXITCODE "0" CACHE STRING "Result from TRY_RUN" FORCE) set( HAS_POST_2038_EXITCODE__TRYRUN_OUTPUT "" CACHE STRING "Output from TRY_RUN" FORCE) endif () + + # Don't know why but CXX_STANDARD doesn't work for cross-compilation + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") endif () diff --git a/dbms/src/Common/Allocator.h b/dbms/src/Common/Allocator.h index ad5b0318c91..e6e2f602894 100644 --- a/dbms/src/Common/Allocator.h +++ b/dbms/src/Common/Allocator.h @@ -179,7 +179,10 @@ protected: // MAP_POPULATE to mmap(). This takes some time, but should be faster // overall than having a hot loop interrupted by page faults. static constexpr int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS - | (mmap_populate ? MAP_POPULATE : 0); +#if defined(OS_LINUX) + | (mmap_populate ? MAP_POPULATE : 0) +#endif + ; private: void * allocNoTrack(size_t size, size_t alignment) diff --git a/dbms/src/Common/TraceCollector.cpp b/dbms/src/Common/TraceCollector.cpp index 9451c3f88e5..bd06a200460 100644 --- a/dbms/src/Common/TraceCollector.cpp +++ b/dbms/src/Common/TraceCollector.cpp @@ -46,7 +46,7 @@ TraceCollector::TraceCollector(std::shared_ptr & trace_log_) if (-1 == fcntl(trace_pipe.fds_rw[1], F_SETFL, flags | O_NONBLOCK)) throwFromErrno("Cannot set non-blocking mode of pipe", ErrorCodes::CANNOT_FCNTL); -#if !defined(__FreeBSD__) +#if defined(OS_LINUX) /** Increase pipe size to avoid slowdown during fine-grained trace collection. 
*/ int pipe_size = fcntl(trace_pipe.fds_rw[1], F_GETPIPE_SZ); diff --git a/dbms/src/Common/new_delete.cpp b/dbms/src/Common/new_delete.cpp index f2a85163035..cbf9b93290e 100644 --- a/dbms/src/Common/new_delete.cpp +++ b/dbms/src/Common/new_delete.cpp @@ -1,4 +1,8 @@ +#if defined(OS_LINUX) #include +#elif defined(OS_DARWIN) +#include +#endif #include #include From ad986f285ea9ec3a7f23de9007f2582fc33e42d0 Mon Sep 17 00:00:00 2001 From: CurtizJ Date: Fri, 13 Sep 2019 20:38:56 +0300 Subject: [PATCH 097/309] fix redis with mixed keys --- .../Dictionaries/RedisBlockInputStream.cpp | 4 --- .../Dictionaries/RedisDictionarySource.cpp | 35 +++++++++++++------ 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index 31ae9162141..cc8f1d005de 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -163,8 +163,6 @@ namespace DB insertValue(*columns[idx], description.types[idx].first, value, name); }; - std::cerr << "keys: " << keys.toString() << "\n"; - if (keys.begin()->get()->isArray()) { size_t num_rows = 0; @@ -201,8 +199,6 @@ namespace DB } ++cursor; - std::cerr << "Redis command: " << commandForValues.toString() << "\n"; - Poco::Redis::Array values = client->execute(commandForValues); if (keys_array.size() != values.size() + 1) // 'HMGET' primary_key secondary_keys throw Exception{"Inconsistent sizes of keys and values in Redis request", diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index fc1593b339e..92d7644db1f 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -132,37 +132,52 @@ namespace DB RedisDictionarySource::~RedisDictionarySource() = default; + static std::string storageTypeToKeyType(RedisStorageType::Id type) + { + switch (type) + { + case RedisStorageType::Id::SIMPLE: + return "string"; + case RedisStorageType::Id::HASH_MAP: + return "hash"; + default: + return "none"; + } + + __builtin_unreachable(); + } BlockInputStreamPtr RedisDictionarySource::loadAll() { Poco::Redis::Command command_for_keys("KEYS"); command_for_keys << "*"; - Poco::Redis::Array keys = client->execute(command_for_keys); + /// Get only keys for specified storage type. 
+ auto all_keys = client->execute(command_for_keys); + Poco::Redis::Array keys; + auto key_type = storageTypeToKeyType(storage_type); + for (auto & key : all_keys) + if (key_type == client->execute(Poco::Redis::Command("TYPE").addRedisType(key))) + keys.addRedisType(std::move(key)); if (storage_type == RedisStorageType::HASH_MAP && !keys.isNull()) { Poco::Redis::Array hkeys; for (const auto & key : keys) { - Poco::Redis::Command command_for_type("TYPE"); - auto type_reply = client->execute(command_for_type.addRedisType(key)); - if (type_reply != "hash") - continue; - Poco::Redis::Command command_for_secondary_keys("HKEYS"); command_for_secondary_keys.addRedisType(key); - Poco::Redis::Array reply_for_primary_key = client->execute(command_for_secondary_keys); + auto secondary_keys = client->execute(command_for_secondary_keys); Poco::Redis::Array primary_with_secondary; primary_with_secondary.addRedisType(key); - for (const auto & secondary_key : reply_for_primary_key) + for (const auto & secondary_key : secondary_keys) primary_with_secondary.addRedisType(secondary_key); - hkeys.add(primary_with_secondary); + hkeys.add(std::move(primary_with_secondary)); } - keys = hkeys; + keys = std::move(hkeys); } return std::make_shared(client, std::move(keys), sample_block, max_block_size); From d53872c30011e92502efa5d7f50f3d463cdce408 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Sat, 14 Sep 2019 07:44:46 +0000 Subject: [PATCH 098/309] Tests improvement. --- .../tests/integration/test_storage_s3/test.py | 90 ++++++++++--------- .../test_storage_s3/test_server.py | 60 ++++++------- 2 files changed, 79 insertions(+), 71 deletions(-) diff --git a/dbms/tests/integration/test_storage_s3/test.py b/dbms/tests/integration/test_storage_s3/test.py index c383a2a7bea..b975c4c92d5 100644 --- a/dbms/tests/integration/test_storage_s3/test.py +++ b/dbms/tests/integration/test_storage_s3/test.py @@ -14,112 +14,122 @@ def started_cluster(): cluster.shutdown() +import httplib import json +import logging import os import time +import traceback -def test_sophisticated_default(started_cluster): +logging.getLogger().setLevel(logging.INFO) +logging.getLogger().addHandler(logging.StreamHandler()) + +def test_simple(started_cluster): instance = started_cluster.instances['dummy'] instance.copy_file_to_container(os.path.join(os.path.dirname(__file__), 'test_server.py'), 'test_server.py') - communication_path = '/test_sophisticated_default' + communication_port = 10000 bucket = 'abc' - instance.exec_in_container(['python', 'test_server.py', communication_path, bucket], detach=True) + instance.exec_in_container(['python', 'test_server.py', str(communication_port), bucket], detach=True) + + def get_data(): + conn = httplib.HTTPConnection(started_cluster.instances['dummy'].ip_address, communication_port) + conn.request("GET", "/") + r = conn.getresponse() + raw_data = r.read() + conn.close() + return json.loads(raw_data) format = 'column1 UInt32, column2 UInt32, column3 UInt32' values = '(1, 2, 3), (3, 2, 1), (78, 43, 45)' other_values = '(1, 1, 1), (1, 1, 1), (11, 11, 11)' for i in range(10): try: - raw = instance.exec_in_container(['cat', communication_path]) - data = json.loads(instance.exec_in_container(['cat', communication_path])) + data = get_data() redirecting_to_http_port = data['redirecting_to_http_port'] - redirecting_to_https_port = data['redirecting_to_https_port'] preserving_data_port = data['preserving_data_port'] redirecting_preserving_data_port = data['redirecting_preserving_data_port'] - localhost = 
data['localhost'] except: + logging.error(traceback.format_exc()) time.sleep(0.5) else: break else: - assert False, 'Could not initialize mock server' + str(raw) + assert False, 'Could not initialize mock server' - redirecting_host = localhost + mock_host = started_cluster.instances['dummy'].ip_address def run_query(query): - print('Running query "{}"...'.format(query)) + logging.info('Running query "{}"...'.format(query)) result = instance.query(query) - print('Query finished') + logging.info('Query finished') return result prepare_put_queries = [ - "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(localhost, preserving_data_port, bucket, format, values), + "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(mock_host, preserving_data_port, bucket, format, values), ] queries = [ - "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(redirecting_host, redirecting_to_http_port, format), + "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(mock_host, redirecting_to_http_port, format), ] - put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(redirecting_host, preserving_data_port, bucket, format, values) + put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(mock_host, preserving_data_port, bucket, format, values) - redirect_put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(redirecting_host, redirecting_preserving_data_port, bucket, format, other_values) + redirect_put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(mock_host, redirecting_preserving_data_port, bucket, format, other_values) check_queries = [ - "select *, column1*column2*column3 from s3('http://{}:{}/{}/test.csv', 'CSV', '{}')".format(localhost, preserving_data_port, bucket, format), + "select *, column1*column2*column3 from s3('http://{}:{}/{}/test.csv', 'CSV', '{}')".format(mock_host, preserving_data_port, bucket, format), ] try: - print('Phase 1') + logging.info('Phase 1') for query in prepare_put_queries: run_query(query) - print('Phase 2') + logging.info('Phase 2') for query in queries: stdout = run_query(query) assert list(map(str.split, stdout.splitlines())) == [ - ['1', '2', '3', '6'], - ['3', '2', '1', '6'], - ['78', '43', '45', '150930'], + ['42', '87', '44', '160776'], + ['55', '33', '81', '147015'], + ['1', '0', '9', '0'], ] - print('Phase 3') + logging.info('Phase 3') query = put_query run_query(query) - for i in range(10): - try: - data = json.loads(instance.exec_in_container(['cat', communication_path])) - received_data_completed = data['received_data_completed'] - received_data = data['received_data'] - finalize_data = data['finalize_data'] - finalize_data_query = data['finalize_data_query'] - except: - time.sleep(0.5) - else: - break - else: - assert False, 'Could not read data from mock server'+str(data) + data = get_data() + received_data_completed = data['received_data_completed'] + received_data = data['received_data'] + finalize_data = data['finalize_data'] + finalize_data_query = data['finalize_data_query'] assert received_data[-1].decode() == '1,2,3\n3,2,1\n78,43,45\n' assert received_data_completed assert finalize_data == '1hello-etag' assert finalize_data_query == 'uploadId=TEST' - print('Phase 4') + logging.info('Phase 4') query = redirect_put_query run_query(query) for 
query in check_queries: - print(query) + logging.info(query) stdout = run_query(query) assert list(map(str.split, stdout.splitlines())) == [ ['1', '1', '1', '1'], ['1', '1', '1', '1'], ['11', '11', '11', '1331'], ] - # FIXME check result + data = get_data() + received_data = data['received_data'] + assert received_data[-1].decode() == '1,1,1\n1,1,1\n11,11,11\n' # FIXME tests for multipart - finally: - print('Done') + except: + logging.error(traceback.format_exc()) + raise + + else: + logging.info('Done') diff --git a/dbms/tests/integration/test_storage_s3/test_server.py b/dbms/tests/integration/test_storage_s3/test_server.py index aed5996212b..bc22b0df085 100644 --- a/dbms/tests/integration/test_storage_s3/test_server.py +++ b/dbms/tests/integration/test_storage_s3/test_server.py @@ -28,7 +28,7 @@ file_handler.setFormatter(logging.Formatter('%(asctime)s %(message)s')) logging.getLogger().addHandler(file_handler) logging.getLogger().addHandler(logging.StreamHandler()) -comm_path = sys.argv[1] +communication_port = int(sys.argv[1]) bucket = sys.argv[2] def GetFreeTCPPortsAndIP(n): @@ -43,41 +43,34 @@ def GetFreeTCPPortsAndIP(n): [ s.close() for s in sockets ] return result, addr -(redirecting_to_http_port, redirecting_to_https_port, preserving_data_port, redirecting_preserving_data_port), localhost = GetFreeTCPPortsAndIP(4) +(redirecting_to_http_port, simple_server_port, preserving_data_port, redirecting_preserving_data_port), localhost = GetFreeTCPPortsAndIP(4) data = { 'redirecting_to_http_port': redirecting_to_http_port, - 'redirecting_to_https_port': redirecting_to_https_port, 'preserving_data_port': preserving_data_port, 'redirecting_preserving_data_port': redirecting_preserving_data_port, - 'localhost': localhost } redirecting_host = localhost -with open(comm_path, 'w') as f: - f.write(json.dumps(data)) + +class SimpleHTTPServerHandler(BaseHTTPRequestHandler): + def do_GET(self): + logging.info('GET {}'.format(self.path)) + if self.path == '/milovidov/test.csv': + self.send_response(200) + self.send_header('Content-type', 'text/plain') + self.end_headers() + self.wfile.write('42,87,44\n55,33,81\n1,0,9\n') + else: + self.send_response(404) + self.end_headers() + self.finish() class RedirectingToHTTPHandler(BaseHTTPRequestHandler): def do_GET(self): self.send_response(307) self.send_header('Content-type', 'text/xml') - self.send_header('Location', 'http://storage.yandexcloud.net/milovidov/test.csv') - self.end_headers() - self.wfile.write(r''' - - TemporaryRedirect - Please re-send this request to the specified temporary endpoint. - Continue to use the original request endpoint for future requests. 
- storage.yandexcloud.net -'''.encode()) - self.finish() - - -class RedirectingToHTTPSHandler(BaseHTTPRequestHandler): - def do_GET(self): - self.send_response(307) - self.send_header('Content-type', 'text/xml') - self.send_header('Location', 'https://storage.yandexcloud.net/milovidov/test.csv') + self.send_header('Location', 'http://{}:{}/milovidov/test.csv'.format(localhost, simple_server_port)) self.end_headers() self.wfile.write(r''' @@ -135,8 +128,6 @@ class PreservingDataHandler(BaseHTTPRequestHandler): data['received_data_completed'] = True data['finalize_data'] = post_data data['finalize_data_query'] = query - with open(comm_path, 'w') as f: - f.write(json.dumps(data)) self.finish() def do_PUT(self): @@ -152,8 +143,6 @@ class PreservingDataHandler(BaseHTTPRequestHandler): assert self.headers['Expect'] == '100-continue' put_data = self.rfile.read() data.setdefault('received_data', []).append(put_data) - with open(comm_path, 'w') as f: - f.write(json.dumps(data)) logging.info('PUT to {}'.format(path)) self.server.storage[path] = put_data self.finish() @@ -233,12 +222,21 @@ class RedirectingPreservingDataHandler(BaseHTTPRequestHandler): self.finish() +class CommunicationServerHandler(BaseHTTPRequestHandler): + def do_GET(self): + self.send_response(200) + self.end_headers() + self.wfile.write(json.dumps(data)) + self.finish() + + servers = [] -servers.append(HTTPServer((redirecting_host, redirecting_to_https_port), RedirectingToHTTPSHandler)) -servers.append(HTTPServer((redirecting_host, redirecting_to_http_port), RedirectingToHTTPHandler)) -servers.append(HTTPServer((redirecting_host, preserving_data_port), PreservingDataHandler)) +servers.append(HTTPServer((localhost, communication_port), CommunicationServerHandler)) +servers.append(HTTPServer((localhost, redirecting_to_http_port), RedirectingToHTTPHandler)) +servers.append(HTTPServer((localhost, preserving_data_port), PreservingDataHandler)) servers[-1].storage = {} -servers.append(HTTPServer((redirecting_host, redirecting_preserving_data_port), RedirectingPreservingDataHandler)) +servers.append(HTTPServer((localhost, simple_server_port), SimpleHTTPServerHandler)) +servers.append(HTTPServer((localhost, redirecting_preserving_data_port), RedirectingPreservingDataHandler)) jobs = [ threading.Thread(target=server.serve_forever) for server in servers ] [ job.start() for job in jobs ] From 01fdb802d4ded4fd2f29232b162fc649e93c8592 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Sun, 15 Sep 2019 13:35:12 +0300 Subject: [PATCH 099/309] Useful changes --- cmake/target.cmake | 3 +++ dbms/CMakeLists.txt | 5 +++++ dbms/programs/server/Server.cpp | 4 ++-- dbms/src/Common/PoolWithFailoverBase.h | 2 +- dbms/src/Common/QueryProfiler.cpp | 6 ++++++ dbms/src/Common/TaskStatsInfoGetter.cpp | 4 ++-- dbms/src/Common/checkStackSize.cpp | 5 ++++- dbms/src/Common/tests/CMakeLists.txt | 2 +- dbms/src/IO/tests/CMakeLists.txt | 4 ++-- dbms/src/Interpreters/MetricLog.cpp | 2 +- dbms/src/Interpreters/tests/CMakeLists.txt | 2 +- libs/libcommon/CMakeLists.txt | 2 -- 12 files changed, 28 insertions(+), 13 deletions(-) diff --git a/cmake/target.cmake b/cmake/target.cmake index 85cd1cd8357..5ffc2b23114 100644 --- a/cmake/target.cmake +++ b/cmake/target.cmake @@ -1,9 +1,12 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux") set (OS_LINUX 1) + add_compile_definitions(OS_LINUX) elseif (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") set (OS_FREEBSD 1) + add_compile_definitions(OS_FREEBSD) elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin") set (OS_DARWIN 1) + 
add_compile_definitions(OS_DARWIN) endif () if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index b99599dce01..7be9f78b419 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -132,6 +132,11 @@ list (APPEND dbms_headers src/TableFunctions/ITableFunction.h src/TableFunctio list (APPEND dbms_sources src/Dictionaries/DictionaryFactory.cpp src/Dictionaries/DictionarySourceFactory.cpp src/Dictionaries/DictionaryStructure.cpp) list (APPEND dbms_headers src/Dictionaries/DictionaryFactory.h src/Dictionaries/DictionarySourceFactory.h src/Dictionaries/DictionaryStructure.h) +if (NOT ENABLE_SSL) + list (REMOVE_ITEM clickhouse_common_io_sources src/Common/OpenSSLHelpers.cpp) + list (REMOVE_ITEM clickhouse_common_io_headers src/Common/OpenSSLHelpers.h) +endif () + add_library(clickhouse_common_io ${clickhouse_common_io_headers} ${clickhouse_common_io_sources}) if (OS_FREEBSD) diff --git a/dbms/programs/server/Server.cpp b/dbms/programs/server/Server.cpp index bba96f18c35..957d823a567 100644 --- a/dbms/programs/server/Server.cpp +++ b/dbms/programs/server/Server.cpp @@ -54,16 +54,16 @@ #include #include "TCPHandlerFactory.h" #include "Common/config_version.h" -#include "MySQLHandlerFactory.h" #include -#if defined(__linux__) +#if defined(OS_LINUX) #include #include #endif #if USE_POCO_NETSSL +#include "MySQLHandlerFactory.h" #include #include #endif diff --git a/dbms/src/Common/PoolWithFailoverBase.h b/dbms/src/Common/PoolWithFailoverBase.h index 989831ce2b0..35f9b1b136b 100644 --- a/dbms/src/Common/PoolWithFailoverBase.h +++ b/dbms/src/Common/PoolWithFailoverBase.h @@ -199,7 +199,7 @@ PoolWithFailoverBase::getMany( for (const ShuffledPool & pool: shuffled_pools) { auto & pool_state = shared_pool_states[pool.index]; - pool_state.error_count = std::min(max_error_cap, pool_state.error_count + pool.error_count); + pool_state.error_count = std::min(max_error_cap, pool_state.error_count + pool.error_count); } }); diff --git a/dbms/src/Common/QueryProfiler.cpp b/dbms/src/Common/QueryProfiler.cpp index a0b75c567a9..c255579e70b 100644 --- a/dbms/src/Common/QueryProfiler.cpp +++ b/dbms/src/Common/QueryProfiler.cpp @@ -30,10 +30,13 @@ namespace /// Thus upper bound on query_id length should be introduced to avoid buffer overflow in signal handler. constexpr size_t QUERY_ID_MAX_LEN = 1024; +#if defined(OS_LINUX) thread_local size_t write_trace_iteration = 0; +#endif void writeTraceInfo(TimerType timer_type, int /* sig */, siginfo_t * info, void * context) { +#if defined(OS_LINUX) /// Quickly drop if signal handler is called too frequently. /// Otherwise we may end up infinitelly processing signals instead of doing any useful work. 
++write_trace_iteration; @@ -50,6 +53,9 @@ namespace return; } } +#else + UNUSED(info); +#endif constexpr size_t buf_size = sizeof(char) + // TraceCollector stop flag 8 * sizeof(char) + // maximum VarUInt length for string size diff --git a/dbms/src/Common/TaskStatsInfoGetter.cpp b/dbms/src/Common/TaskStatsInfoGetter.cpp index b361161483a..6b551582d49 100644 --- a/dbms/src/Common/TaskStatsInfoGetter.cpp +++ b/dbms/src/Common/TaskStatsInfoGetter.cpp @@ -4,7 +4,7 @@ #include -#if defined(__linux__) +#if defined(OS_LINUX) #include "hasLinuxCapability.h" #include @@ -321,7 +321,7 @@ bool TaskStatsInfoGetter::checkPermissions() TaskStatsInfoGetter::TaskStatsInfoGetter() { - throw Exception("TaskStats are not implemented for this OS.", ErrorCodes::NOT_IMPLEMENTED); + // TODO: throw Exception("TaskStats are not implemented for this OS.", ErrorCodes::NOT_IMPLEMENTED); } void TaskStatsInfoGetter::getStat(::taskstats &, pid_t) diff --git a/dbms/src/Common/checkStackSize.cpp b/dbms/src/Common/checkStackSize.cpp index e7f91bc3330..7581a818004 100644 --- a/dbms/src/Common/checkStackSize.cpp +++ b/dbms/src/Common/checkStackSize.cpp @@ -17,12 +17,14 @@ namespace DB } } - +#if defined(OS_LINUX) static thread_local void * stack_address = nullptr; static thread_local size_t max_stack_size = 0; +#endif void checkStackSize() { +#if defined(OS_LINUX) using namespace DB; if (!stack_address) @@ -59,4 +61,5 @@ void checkStackSize() << ", maximum stack size: " << max_stack_size; throw Exception(message.str(), ErrorCodes::TOO_DEEP_RECURSION); } +#endif } diff --git a/dbms/src/Common/tests/CMakeLists.txt b/dbms/src/Common/tests/CMakeLists.txt index 67c0e376f74..f9315dd2eb2 100644 --- a/dbms/src/Common/tests/CMakeLists.txt +++ b/dbms/src/Common/tests/CMakeLists.txt @@ -36,7 +36,7 @@ target_include_directories (simple_cache PRIVATE ${DBMS_INCLUDE_DIR}) target_link_libraries (simple_cache PRIVATE common) add_executable (compact_array compact_array.cpp) -target_link_libraries (compact_array PRIVATE clickhouse_common_io stdc++fs) +target_link_libraries (compact_array PRIVATE clickhouse_common_io) add_executable (radix_sort radix_sort.cpp) target_link_libraries (radix_sort PRIVATE clickhouse_common_io) diff --git a/dbms/src/IO/tests/CMakeLists.txt b/dbms/src/IO/tests/CMakeLists.txt index 2c3dc307b18..38802718dd1 100644 --- a/dbms/src/IO/tests/CMakeLists.txt +++ b/dbms/src/IO/tests/CMakeLists.txt @@ -59,10 +59,10 @@ target_link_libraries (write_int PRIVATE clickhouse_common_io) if (OS_LINUX OR OS_FREEBSD) add_executable(write_buffer_aio write_buffer_aio.cpp) - target_link_libraries (write_buffer_aio PRIVATE clickhouse_common_io stdc++fs) + target_link_libraries (write_buffer_aio PRIVATE clickhouse_common_io) add_executable(read_buffer_aio read_buffer_aio.cpp) - target_link_libraries (read_buffer_aio PRIVATE clickhouse_common_io stdc++fs) + target_link_libraries (read_buffer_aio PRIVATE clickhouse_common_io) endif () add_executable (zlib_buffers zlib_buffers.cpp) diff --git a/dbms/src/Interpreters/MetricLog.cpp b/dbms/src/Interpreters/MetricLog.cpp index 59a500010dc..5622e0c65b0 100644 --- a/dbms/src/Interpreters/MetricLog.cpp +++ b/dbms/src/Interpreters/MetricLog.cpp @@ -103,7 +103,7 @@ void MetricLog::metricThreadFunction() for (size_t i = 0, end = ProfileEvents::end(); i < end; ++i) { const ProfileEvents::Count new_value = ProfileEvents::global_counters[i].load(std::memory_order_relaxed); - UInt64 & old_value = prev_profile_events[i]; + auto & old_value = prev_profile_events[i]; elem.profile_events[i] = new_value - 
old_value; old_value = new_value; } diff --git a/dbms/src/Interpreters/tests/CMakeLists.txt b/dbms/src/Interpreters/tests/CMakeLists.txt index 03c06eb7257..5c509f4e9b8 100644 --- a/dbms/src/Interpreters/tests/CMakeLists.txt +++ b/dbms/src/Interpreters/tests/CMakeLists.txt @@ -53,7 +53,7 @@ target_link_libraries (expression_analyzer PRIVATE dbms clickhouse_storages_syst add_check(expression_analyzer) add_executable (users users.cpp) -target_link_libraries (users PRIVATE dbms clickhouse_common_config stdc++fs) +target_link_libraries (users PRIVATE dbms clickhouse_common_config) if (OS_LINUX) add_executable (internal_iotop internal_iotop.cpp) diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index cf1144124e8..62c64a9bdb0 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -117,8 +117,6 @@ target_link_libraries (common ${Poco_Util_LIBRARY} ${Poco_Foundation_LIBRARY} ${CITYHASH_LIBRARIES} - PRIVATE - stdc++fs PUBLIC ${Boost_SYSTEM_LIBRARY} PRIVATE From 787c2b8d83953a92ec2c4e7c92444c2a7afeeddd Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Sun, 15 Sep 2019 13:35:19 +0300 Subject: [PATCH 100/309] WIP --- dbms/CMakeLists.txt | 4 +- dbms/src/Core/CMakeLists.txt | 47 ++++++++++++ .../Processors/Formats/Impl/CMakeLists.txt | 73 +++++++++++++++++++ libs/libcommon/src/sleep.cpp | 6 ++ 4 files changed, 128 insertions(+), 2 deletions(-) diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index 7be9f78b419..2462477e9aa 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -159,7 +159,7 @@ macro(add_object_library name common_path) endif () endmacro() -add_object_library(clickhouse_core src/Core) +list (APPEND all_modules clickhouse_core) add_object_library(clickhouse_compression src/Compression/) add_object_library(clickhouse_datastreams src/DataStreams) add_object_library(clickhouse_datatypes src/DataTypes) @@ -176,7 +176,7 @@ add_object_library(clickhouse_formats src/Formats) add_object_library(clickhouse_processors src/Processors) add_object_library(clickhouse_processors_executors src/Processors/Executors) add_object_library(clickhouse_processors_formats src/Processors/Formats) -add_object_library(clickhouse_processors_formats_impl src/Processors/Formats/Impl) +list (APPEND all_modules clickhouse_processors_formats_impl) add_object_library(clickhouse_processors_transforms src/Processors/Transforms) add_object_library(clickhouse_processors_sources src/Processors/Sources) diff --git a/dbms/src/Core/CMakeLists.txt b/dbms/src/Core/CMakeLists.txt index 65172356645..150563ba993 100644 --- a/dbms/src/Core/CMakeLists.txt +++ b/dbms/src/Core/CMakeLists.txt @@ -1,3 +1,50 @@ if (ENABLE_TESTS) add_subdirectory (tests) endif () + +set (SRCS + AccurateComparison.h + BackgroundSchedulePool.cpp + BackgroundSchedulePool.h + Block.cpp + Block.h + BlockInfo.cpp + BlockInfo.h + callOnTypeIndex.h + CMakeLists.txt + ColumnNumbers.h + ColumnsWithTypeAndName.h + ColumnWithTypeAndName.cpp + ColumnWithTypeAndName.h + config_core.h.in + DecimalComparison.h + Defines.h + ExternalResultDescription.cpp + ExternalResultDescription.h + ExternalTable.cpp + ExternalTable.h + Field.cpp + Field.h + iostream_debug_helpers.cpp + iostream_debug_helpers.h + MySQLProtocol.cpp + MySQLProtocol.h + NamesAndTypes.cpp + NamesAndTypes.h + Names.h + Protocol.h + QualifiedTableName.h + QueryProcessingStage.h + Row.h + SettingsCommon.cpp + SettingsCommon.h + Settings.cpp + Settings.h + SortCursor.h + SortDescription.h + TypeListNumber.h + Types.h + UUID.h +) + 
+add_library(clickhouse_core ${SRCS}) diff --git a/dbms/src/Processors/Formats/Impl/CMakeLists.txt b/dbms/src/Processors/Formats/Impl/CMakeLists.txt index 65172356645..64c1732a456 100644 --- a/dbms/src/Processors/Formats/Impl/CMakeLists.txt +++ b/dbms/src/Processors/Formats/Impl/CMakeLists.txt @@ -1,3 +1,76 @@ if (ENABLE_TESTS) add_subdirectory (tests) endif () + +set (SRCS + ArrowColumnToCHColumn.cpp + ArrowColumnToCHColumn.h + BinaryRowInputFormat.cpp + BinaryRowInputFormat.h + BinaryRowOutputFormat.cpp + BinaryRowOutputFormat.h + CapnProtoRowInputFormat.cpp + CapnProtoRowInputFormat.h + CMakeLists.txt + CSVRowInputFormat.cpp + CSVRowInputFormat.h + CSVRowOutputFormat.cpp + CSVRowOutputFormat.h + JSONCompactRowOutputFormat.cpp + JSONCompactRowOutputFormat.h + JSONEachRowRowInputFormat.cpp + JSONEachRowRowInputFormat.h + JSONEachRowRowOutputFormat.cpp + JSONEachRowRowOutputFormat.h + JSONEachRowWithProgressRowOutputFormat.cpp + JSONEachRowWithProgressRowOutputFormat.h + JSONRowOutputFormat.cpp + JSONRowOutputFormat.h + MySQLOutputFormat.cpp + MySQLOutputFormat.h + NativeFormat.cpp + NullFormat.cpp + ODBCDriver2BlockOutputFormat.cpp + ODBCDriver2BlockOutputFormat.h + ODBCDriverBlockOutputFormat.cpp + ODBCDriverBlockOutputFormat.h + ORCBlockInputFormat.cpp + ORCBlockInputFormat.h + ParquetBlockInputFormat.cpp + ParquetBlockInputFormat.h + ParquetBlockOutputFormat.cpp + ParquetBlockOutputFormat.h + PrettyBlockOutputFormat.cpp + PrettyBlockOutputFormat.h + PrettyCompactBlockOutputFormat.cpp + PrettyCompactBlockOutputFormat.h + PrettySpaceBlockOutputFormat.cpp + PrettySpaceBlockOutputFormat.h + ProtobufRowInputFormat.cpp + ProtobufRowInputFormat.h + ProtobufRowOutputFormat.cpp + ProtobufRowOutputFormat.h + TabSeparatedRawRowOutputFormat.h + TabSeparatedRowInputFormat.cpp + TabSeparatedRowInputFormat.h + TabSeparatedRowOutputFormat.cpp + TabSeparatedRowOutputFormat.h + TemplateBlockOutputFormat.cpp + TemplateBlockOutputFormat.h + TemplateRowInputFormat.cpp + TemplateRowInputFormat.h + TSKVRowInputFormat.cpp + TSKVRowInputFormat.h + TSKVRowOutputFormat.cpp + TSKVRowOutputFormat.h + ValuesRowInputFormat.cpp + ValuesRowInputFormat.h + ValuesRowOutputFormat.cpp + ValuesRowOutputFormat.h + VerticalRowOutputFormat.cpp + VerticalRowOutputFormat.h + XMLRowOutputFormat.cpp + XMLRowOutputFormat.h +) + +add_library(clickhouse_processors_formats_impl ${SRCS}) diff --git a/libs/libcommon/src/sleep.cpp b/libs/libcommon/src/sleep.cpp index 710b387d62e..663e745b501 100644 --- a/libs/libcommon/src/sleep.cpp +++ b/libs/libcommon/src/sleep.cpp @@ -14,6 +14,7 @@ */ void sleepForNanoseconds(uint64_t nanoseconds) { +#if defined(OS_LINUX) constexpr auto clock_type = CLOCK_MONOTONIC; struct timespec current_time; @@ -29,6 +30,11 @@ void sleepForNanoseconds(uint64_t nanoseconds) finish_time.tv_sec += (nanoseconds / resolution) + extra_second; while (clock_nanosleep(clock_type, TIMER_ABSTIME, &finish_time, nullptr) == EINTR); +#elif defined(OS_DARWIN) + // TODO: implement me! +#else +# error "sleepForNanoseconds not supported for this platform!" 
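One possible follow-up to the "TODO: implement me!" branch above: macOS provides plain POSIX nanosleep, so the Darwin case could fall back to it instead of the Linux clock_nanosleep path. A minimal sketch, not part of this patch and untested on macOS; the function name sleepForNanosecondsFallback is illustrative:

    #include <cstdint>
    #include <errno.h>
    #include <time.h>

    /// Sleep for the requested number of nanoseconds, restarting the sleep
    /// with the remaining time whenever a signal interrupts it (EINTR).
    void sleepForNanosecondsFallback(uint64_t nanoseconds)
    {
        constexpr uint64_t resolution = 1000000000ULL;

        struct timespec requested;
        requested.tv_sec = static_cast<time_t>(nanoseconds / resolution);
        requested.tv_nsec = static_cast<long>(nanoseconds % resolution);

        struct timespec remaining{};
        while (nanosleep(&requested, &remaining) == -1 && errno == EINTR)
            requested = remaining;
    }
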
+#endif } void sleepForMicroseconds(uint64_t microseconds) From 8c356a383087f78e43135dc5f3eea23236a65efd Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Sun, 15 Sep 2019 18:20:31 +0300 Subject: [PATCH 101/309] WIP --- cmake/find/cxx.cmake | 5 -- contrib/libcxx-cmake/CMakeLists.txt | 5 ++ dbms/CMakeLists.txt | 6 +- .../client/readpassphrase/readpassphrase.c | 6 +- dbms/src/Core/CMakeLists.txt | 47 ------------ .../Processors/Formats/Impl/CMakeLists.txt | 73 ------------------- 6 files changed, 11 insertions(+), 131 deletions(-) diff --git a/cmake/find/cxx.cmake b/cmake/find/cxx.cmake index 9a00269cfc6..4f2430228d4 100644 --- a/cmake/find/cxx.cmake +++ b/cmake/find/cxx.cmake @@ -6,11 +6,6 @@ endif() if (USE_LIBCXX) set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_LIBCPP_DEBUG=0") # More checks in debug build. - if (OS_DARWIN) - # Use libcxx from SDK - set (USE_INTERNAL_LIBCXX_LIBRARY OFF) - endif () - if (NOT USE_INTERNAL_LIBCXX_LIBRARY) find_library (LIBCXX_LIBRARY c++) find_library (LIBCXXFS_LIBRARY c++fs) diff --git a/contrib/libcxx-cmake/CMakeLists.txt b/contrib/libcxx-cmake/CMakeLists.txt index 9609c7ca9e7..54bbb5882e9 100644 --- a/contrib/libcxx-cmake/CMakeLists.txt +++ b/contrib/libcxx-cmake/CMakeLists.txt @@ -42,7 +42,12 @@ add_library(cxx ${SRCS}) target_include_directories(cxx SYSTEM BEFORE PUBLIC $) target_compile_definitions(cxx PRIVATE -D_LIBCPP_BUILDING_LIBRARY -DLIBCXX_BUILDING_LIBCXXABI) + target_compile_options(cxx PUBLIC -nostdinc++ -Wno-reserved-id-macro) +if (OS_DARWIN) + target_compile_options(cxx PUBLIC -Wno-ctad-maybe-unsupported) +endif () + target_link_libraries(cxx PUBLIC cxxabi) install( diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index 2462477e9aa..1967cb4b067 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -159,8 +159,8 @@ macro(add_object_library name common_path) endif () endmacro() -list (APPEND all_modules clickhouse_core) -add_object_library(clickhouse_compression src/Compression/) +add_object_library(clickhouse_core src/Core) +add_object_library(clickhouse_compression src/Compression) add_object_library(clickhouse_datastreams src/DataStreams) add_object_library(clickhouse_datatypes src/DataTypes) add_object_library(clickhouse_databases src/Databases) @@ -176,7 +176,7 @@ add_object_library(clickhouse_formats src/Formats) add_object_library(clickhouse_processors src/Processors) add_object_library(clickhouse_processors_executors src/Processors/Executors) add_object_library(clickhouse_processors_formats src/Processors/Formats) -list (APPEND all_modules clickhouse_processors_formats_impl) +add_object_library(clickhouse_processors_formats_impl src/Processors/Formats/Impl) add_object_library(clickhouse_processors_transforms src/Processors/Transforms) add_object_library(clickhouse_processors_sources src/Processors/Sources) diff --git a/dbms/programs/client/readpassphrase/readpassphrase.c b/dbms/programs/client/readpassphrase/readpassphrase.c index be12c923db2..8c56877196c 100644 --- a/dbms/programs/client/readpassphrase/readpassphrase.c +++ b/dbms/programs/client/readpassphrase/readpassphrase.c @@ -46,7 +46,7 @@ # define _POSIX_VDISABLE VDISABLE #endif -static volatile sig_atomic_t signo[_NSIG]; +static volatile sig_atomic_t signo[NSIG]; static void handler(int); @@ -67,7 +67,7 @@ readpassphrase(const char *prompt, char *buf, size_t bufsiz, int flags) } restart: - for (i = 0; i < _NSIG; i++) + for (i = 0; i < NSIG; i++) signo[i] = 0; nr = -1; save_errno = 0; @@ -173,7 +173,7 @@ restart: * If we were interrupted by a signal, 
resend it to ourselves * now that we have restored the signal handlers. */ - for (i = 0; i < _NSIG; i++) { + for (i = 0; i < NSIG; i++) { if (signo[i]) { kill(getpid(), i); switch (i) { diff --git a/dbms/src/Core/CMakeLists.txt b/dbms/src/Core/CMakeLists.txt index 150563ba993..65172356645 100644 --- a/dbms/src/Core/CMakeLists.txt +++ b/dbms/src/Core/CMakeLists.txt @@ -1,50 +1,3 @@ if (ENABLE_TESTS) add_subdirectory (tests) endif () - -set (SRCS - AccurateComparison.h - BackgroundSchedulePool.cpp - BackgroundSchedulePool.h - Block.cpp - Block.h - BlockInfo.cpp - BlockInfo.h - callOnTypeIndex.h - CMakeLists.txt - ColumnNumbers.h - ColumnsWithTypeAndName.h - ColumnWithTypeAndName.cpp - ColumnWithTypeAndName.h - config_core.h.in - DecimalComparison.h - Defines.h - ExternalResultDescription.cpp - ExternalResultDescription.h - ExternalTable.cpp - ExternalTable.h - Field.cpp - Field.h - iostream_debug_helpers.cpp - iostream_debug_helpers.h - MySQLProtocol.cpp - MySQLProtocol.h - NamesAndTypes.cpp - NamesAndTypes.h - Names.h - Protocol.h - QualifiedTableName.h - QueryProcessingStage.h - Row.h - SettingsCommon.cpp - SettingsCommon.h - Settings.cpp - Settings.h - SortCursor.h - SortDescription.h - TypeListNumber.h - Types.h - UUID.h -) - -add_library(clickhouse_core ${SRCS}) diff --git a/dbms/src/Processors/Formats/Impl/CMakeLists.txt b/dbms/src/Processors/Formats/Impl/CMakeLists.txt index 64c1732a456..65172356645 100644 --- a/dbms/src/Processors/Formats/Impl/CMakeLists.txt +++ b/dbms/src/Processors/Formats/Impl/CMakeLists.txt @@ -1,76 +1,3 @@ if (ENABLE_TESTS) add_subdirectory (tests) endif () - -set (SRCS - ArrowColumnToCHColumn.cpp - ArrowColumnToCHColumn.h - BinaryRowInputFormat.cpp - BinaryRowInputFormat.h - BinaryRowOutputFormat.cpp - BinaryRowOutputFormat.h - CapnProtoRowInputFormat.cpp - CapnProtoRowInputFormat.h - CMakeLists.txt - CSVRowInputFormat.cpp - CSVRowInputFormat.h - CSVRowOutputFormat.cpp - CSVRowOutputFormat.h - JSONCompactRowOutputFormat.cpp - JSONCompactRowOutputFormat.h - JSONEachRowRowInputFormat.cpp - JSONEachRowRowInputFormat.h - JSONEachRowRowOutputFormat.cpp - JSONEachRowRowOutputFormat.h - JSONEachRowWithProgressRowOutputFormat.cpp - JSONEachRowWithProgressRowOutputFormat.h - JSONRowOutputFormat.cpp - JSONRowOutputFormat.h - MySQLOutputFormat.cpp - MySQLOutputFormat.h - NativeFormat.cpp - NullFormat.cpp - ODBCDriver2BlockOutputFormat.cpp - ODBCDriver2BlockOutputFormat.h - ODBCDriverBlockOutputFormat.cpp - ODBCDriverBlockOutputFormat.h - ORCBlockInputFormat.cpp - ORCBlockInputFormat.h - ParquetBlockInputFormat.cpp - ParquetBlockInputFormat.h - ParquetBlockOutputFormat.cpp - ParquetBlockOutputFormat.h - PrettyBlockOutputFormat.cpp - PrettyBlockOutputFormat.h - PrettyCompactBlockOutputFormat.cpp - PrettyCompactBlockOutputFormat.h - PrettySpaceBlockOutputFormat.cpp - PrettySpaceBlockOutputFormat.h - ProtobufRowInputFormat.cpp - ProtobufRowInputFormat.h - ProtobufRowOutputFormat.cpp - ProtobufRowOutputFormat.h - TabSeparatedRawRowOutputFormat.h - TabSeparatedRowInputFormat.cpp - TabSeparatedRowInputFormat.h - TabSeparatedRowOutputFormat.cpp - TabSeparatedRowOutputFormat.h - TemplateBlockOutputFormat.cpp - TemplateBlockOutputFormat.h - TemplateRowInputFormat.cpp - TemplateRowInputFormat.h - TSKVRowInputFormat.cpp - TSKVRowInputFormat.h - TSKVRowOutputFormat.cpp - TSKVRowOutputFormat.h - ValuesRowInputFormat.cpp - ValuesRowInputFormat.h - ValuesRowOutputFormat.cpp - ValuesRowOutputFormat.h - VerticalRowOutputFormat.cpp - VerticalRowOutputFormat.h - XMLRowOutputFormat.cpp - 
XMLRowOutputFormat.h -) - -add_library(clickhouse_processors_formats_impl ${SRCS}) From 84cfbb8e69f7b398f0ee7545e62af3f07c7c553d Mon Sep 17 00:00:00 2001 From: Yuriy Date: Mon, 9 Sep 2019 16:42:32 +0300 Subject: [PATCH 102/309] simplified mariadb-connector-c compilation and fixed caching_sha2_password plugin --- contrib/CMakeLists.txt | 12 +- contrib/mariadb-connector-c | 2 +- .../mariadb-connector-c-cmake/CMakeLists.txt | 74 --- .../linux_x86_64/include/config.h | 269 ---------- .../linux_x86_64/include/ma_config.h | 269 ---------- .../linux_x86_64/include/mariadb_version.h | 36 -- .../libmariadb/ma_client_plugin.c | 502 ------------------ libs/libmysqlxx/CMakeLists.txt | 4 +- libs/libmysqlxx/cmake/find_mysqlclient.cmake | 4 +- 9 files changed, 13 insertions(+), 1159 deletions(-) delete mode 100644 contrib/mariadb-connector-c-cmake/CMakeLists.txt delete mode 100644 contrib/mariadb-connector-c-cmake/linux_x86_64/include/config.h delete mode 100644 contrib/mariadb-connector-c-cmake/linux_x86_64/include/ma_config.h delete mode 100644 contrib/mariadb-connector-c-cmake/linux_x86_64/include/mariadb_version.h delete mode 100644 contrib/mariadb-connector-c-cmake/linux_x86_64/libmariadb/ma_client_plugin.c diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 96462de0190..5525aef61db 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -138,11 +138,13 @@ if (USE_INTERNAL_SSL_LIBRARY) endif () if (ENABLE_MYSQL AND USE_INTERNAL_MYSQL_LIBRARY) - add_subdirectory (mariadb-connector-c-cmake) - target_include_directories(mysqlclient BEFORE PRIVATE ${ZLIB_INCLUDE_DIR}) - if(OPENSSL_INCLUDE_DIR) - target_include_directories(mysqlclient BEFORE PRIVATE ${OPENSSL_INCLUDE_DIR}) - endif() + set(CLIENT_PLUGIN_CACHING_SHA2_PASSWORD STATIC) + set(CLIENT_PLUGIN_SHA256_PASSWORD STATIC) + set(CLIENT_PLUGIN_REMOTE_IO OFF) + set(CLIENT_PLUGIN_DIALOG OFF) + set(CLIENT_PLUGIN_CLIENT_ED25519 OFF) + set(CLIENT_PLUGIN_MYSQL_CLEAR_PASSWORD OFF) + add_subdirectory (mariadb-connector-c) endif () if (USE_INTERNAL_RDKAFKA_LIBRARY) diff --git a/contrib/mariadb-connector-c b/contrib/mariadb-connector-c index c6503d3acc8..9bbf08c2a0f 160000 --- a/contrib/mariadb-connector-c +++ b/contrib/mariadb-connector-c @@ -1 +1 @@ -Subproject commit c6503d3acc85ca1a7f5e7e38b605d7c9410aac1e +Subproject commit 9bbf08c2a0fb7b34671291fce13e6af62c5343a2 diff --git a/contrib/mariadb-connector-c-cmake/CMakeLists.txt b/contrib/mariadb-connector-c-cmake/CMakeLists.txt deleted file mode 100644 index 2e80b0c325f..00000000000 --- a/contrib/mariadb-connector-c-cmake/CMakeLists.txt +++ /dev/null @@ -1,74 +0,0 @@ -set(MARIADB_CLIENT_SOURCE_DIR ${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c) -set(MARIADB_CLIENT_BINARY_DIR ${ClickHouse_BINARY_DIR}/contrib/mariadb-connector-c) - -set(SRCS -#${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/bmove_upp.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/get_password.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_alloc.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_array.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_charset.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_compress.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_context.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_default.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_dtoa.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_errmsg.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_hash.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_init.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_io.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_list.c 
-${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_ll2str.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_loaddata.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_net.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_password.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_pvio.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/mariadb_async.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/mariadb_charset.c -#${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/mariadb_dyncol.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/mariadb_lib.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/mariadb_stmt.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_sha1.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_stmt_codec.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_string.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_time.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_tls.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/secure/openssl_crypt.c -#${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/secure/gnutls.c -#${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/secure/ma_schannel.c -#${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/secure/schannel.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/auth_gssapi_client.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/dialog.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/gssapi_client.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/gssapi_errmsg.c -${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/mariadb_cleartext.c -${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/my_auth.c -${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/old_password.c -${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/sha256_pw.c -${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/caching_sha2_pw.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/sspi_client.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/sspi_errmsg.c -${MARIADB_CLIENT_SOURCE_DIR}/plugins/connection/aurora.c -${MARIADB_CLIENT_SOURCE_DIR}/plugins/connection/replication.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/io/remote_io.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/pvio/pvio_npipe.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/pvio/pvio_shmem.c -${MARIADB_CLIENT_SOURCE_DIR}/plugins/pvio/pvio_socket.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/trace/trace_example.c -${CMAKE_CURRENT_SOURCE_DIR}/linux_x86_64/libmariadb/ma_client_plugin.c -) - -if(OPENSSL_LIBRARIES) - list(APPEND SRCS ${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/secure/openssl.c) -endif() - -add_library(mysqlclient ${SRCS}) - -if(OPENSSL_LIBRARIES) - target_link_libraries(mysqlclient PRIVATE ${OPENSSL_LIBRARIES}) - target_compile_definitions(mysqlclient PRIVATE -D HAVE_OPENSSL -D HAVE_TLS) -endif() - -target_include_directories(mysqlclient PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/linux_x86_64/include) -target_include_directories(mysqlclient PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/common/include) -target_include_directories(mysqlclient PUBLIC ${MARIADB_CLIENT_SOURCE_DIR}/include) - -target_compile_definitions(mysqlclient PRIVATE -D THREAD) diff --git a/contrib/mariadb-connector-c-cmake/linux_x86_64/include/config.h b/contrib/mariadb-connector-c-cmake/linux_x86_64/include/config.h deleted file mode 100644 index 90c42c97df6..00000000000 --- a/contrib/mariadb-connector-c-cmake/linux_x86_64/include/config.h +++ /dev/null @@ -1,269 +0,0 @@ - -/* - * Include file constants (processed in LibmysqlIncludeFiles.txt 1 - */ -#define HAVE_ALLOCA_H 1 -/* #undef HAVE_BIGENDIAN */ -#define HAVE_SETLOCALE 1 -#define HAVE_NL_LANGINFO 1 -#define HAVE_ARPA_INET_H 1 -#define HAVE_CRYPT_H 1 -#define HAVE_DIRENT_H 1 -#define HAVE_DLFCN_H 1 -#define HAVE_EXECINFO_H 1 -#define HAVE_FCNTL_H 1 -#define HAVE_FENV_H 1 -#define HAVE_FLOAT_H 1 -/* #undef 
HAVE_FPU_CONTROL_H */ -#define HAVE_GRP_H 1 -/* #undef HAVE_IEEEFP_H */ -#define HAVE_LIMITS_H 1 -#define HAVE_MALLOC_H 1 -#define HAVE_MEMORY_H 1 -#define HAVE_NETINET_IN_H 1 -#define HAVE_PATHS_H 1 -#define HAVE_PWD_H 1 -#define HAVE_SCHED_H 1 -/* #undef HAVE_SELECT_H */ -#define HAVE_STDDEF_H 1 -#define HAVE_STDINT_H 1 -#define HAVE_STDLIB_H 1 -#define HAVE_STRING_H 1 -#define HAVE_STRINGS_H 1 -/* #undef HAVE_SYNCH_H */ -/* #undef HAVE_SYS_FPU_H */ -#define HAVE_SYS_IOCTL_H 1 -#define HAVE_SYS_IPC_H 1 -#define HAVE_SYS_MMAN_H 1 -#define HAVE_SYS_PRCTL_H 1 -#define HAVE_SYS_SELECT_H 1 -#define HAVE_SYS_SHM_H 1 -#define HAVE_SYS_SOCKET_H 1 -#define HAVE_SYS_STAT_H 1 -/* #undef HAVE_SYS_STREAM_H */ -#define HAVE_SYS_TIMEB_H 1 -#define HAVE_SYS_TYPES_H 1 -#define HAVE_SYS_UN_H 1 -/* #undef HAVE_SYSENT_H */ -#define HAVE_TERMIO_H 1 -#define HAVE_TERMIOS_H 1 -#define HAVE_UNISTD_H 1 -#define HAVE_UTIME_H 1 -#define HAVE_UCONTEXT_H 1 - -/* - * function definitions - processed in LibmysqlFunctions.txt - */ -#define HAVE_ACCESS 1 -/* #undef HAVE_AIOWAIT */ -#define HAVE_ALARM 1 -/* #undef HAVE_ALLOCA */ -#define HAVE_BCMP 1 -/* #undef HAVE_BFILL */ -/* #undef HAVE_BMOVE */ -#define HAVE_BZERO 1 -#define HAVE_CLOCK_GETTIME 1 -/* #undef HAVE_COMPRESS */ -/* #undef HAVE_CRYPT */ -#define HAVE_DLERROR 1 -#define HAVE_DLOPEN 1 -#define HAVE_FCHMOD 1 -#define HAVE_FCNTL 1 -/* #undef HAVE_FCONVERT */ -#define HAVE_FDATASYNC 1 -#define HAVE_FESETROUND 1 -#define HAVE_FINITE 1 -#define HAVE_FSEEKO 1 -#define HAVE_FSYNC 1 -#define HAVE_GETADDRINFO 1 -#define HAVE_GETCWD 1 -#define HAVE_GETHOSTBYADDR_R 1 -#define HAVE_GETHOSTBYNAME_R 1 -/* #undef HAVE_GETHRTIME */ -#define HAVE_GETNAMEINFO 1 -#define HAVE_GETPAGESIZE 1 -#define HAVE_GETPASS 1 -/* #undef HAVE_GETPASSPHRASE */ -#define HAVE_GETPWNAM 1 -#define HAVE_GETPWUID 1 -#define HAVE_GETRLIMIT 1 -#define HAVE_GETRUSAGE 1 -#define HAVE_GETWD 1 -#define HAVE_GMTIME_R 1 -#define HAVE_INITGROUPS 1 -#define HAVE_LDIV 1 -#define HAVE_LOCALTIME_R 1 -#define HAVE_LOG2 1 -#define HAVE_LONGJMP 1 -#define HAVE_LSTAT 1 -#define HAVE_MADVISE 1 -#define HAVE_MALLINFO 1 -#define HAVE_MEMALIGN 1 -#define HAVE_MEMCPY 1 -#define HAVE_MEMMOVE 1 -#define HAVE_MKSTEMP 1 -#define HAVE_MLOCK 1 -#define HAVE_MLOCKALL 1 -#define HAVE_MMAP 1 -#define HAVE_MMAP64 1 -#define HAVE_PERROR 1 -#define HAVE_POLL 1 -#define HAVE_PREAD 1 -/* #undef HAVE_PTHREAD_ATTR_CREATE */ -#define HAVE_PTHREAD_ATTR_GETSTACKSIZE 1 -/* #undef HAVE_PTHREAD_ATTR_SETPRIO */ -#define HAVE_PTHREAD_ATTR_SETSCHEDPARAM 1 -#define HAVE_PTHREAD_ATTR_SETSCOPE 1 -#define HAVE_PTHREAD_ATTR_SETSTACKSIZE 1 -/* #undef HAVE_PTHREAD_CONDATTR_CREATE */ -/* #undef HAVE_PTHREAD_INIT */ -#define HAVE_PTHREAD_KEY_DELETE 1 -#define HAVE_PTHREAD_KILL 1 -#define HAVE_PTHREAD_RWLOCK_RDLOCK 1 -/* #undef HAVE_PTHREAD_SETPRIO_NP */ -#define HAVE_PTHREAD_SETSCHEDPARAM 1 -#define HAVE_PTHREAD_SIGMASK 1 -/* #undef HAVE_PTHREAD_THREADMASK */ -/* #undef HAVE_PTHREAD_YIELD_NP */ -#define HAVE_READDIR_R 1 -#define HAVE_READLINK 1 -#define HAVE_REALPATH 1 -#define HAVE_RENAME 1 -#define HAVE_SCHED_YIELD 1 -#define HAVE_SELECT 1 -/* #undef HAVE_SETFD */ -/* #undef HAVE_SETFILEPOINTER */ -#define HAVE_SIGNAL 1 -#define HAVE_SIGACTION 1 -/* #undef HAVE_SIGTHREADMASK */ -#define HAVE_SIGWAIT 1 -#define HAVE_SLEEP 1 -#define HAVE_SNPRINTF 1 -/* #undef HAVE_SQLITE */ -#define HAVE_STPCPY 1 -#define HAVE_STRERROR 1 -/* #undef HAVE_STRLCPY */ -#define HAVE_STRNLEN 1 -#define HAVE_STRPBRK 1 -#define HAVE_STRSEP 1 -#define HAVE_STRSTR 1 -#define 
HAVE_STRTOK_R 1 -#define HAVE_STRTOL 1 -#define HAVE_STRTOLL 1 -#define HAVE_STRTOUL 1 -#define HAVE_STRTOULL 1 -/* #undef HAVE_TELL */ -/* #undef HAVE_THR_SETCONCURRENCY */ -/* #undef HAVE_THR_YIELD */ -#define HAVE_VASPRINTF 1 -#define HAVE_VSNPRINTF 1 - -/* - * types and sizes - */ -/* Types we may use */ -#define SIZEOF_CHAR 1 -#if defined(SIZEOF_CHAR) -# define HAVE_CHAR 1 -#endif - -#define SIZEOF_CHARP 8 -#if defined(SIZEOF_CHARP) -# define HAVE_CHARP 1 -#endif - -#define SIZEOF_SHORT 2 -#if defined(SIZEOF_SHORT) -# define HAVE_SHORT 1 -#endif - -#define SIZEOF_INT 4 -#if defined(SIZEOF_INT) -# define HAVE_INT 1 -#endif - -#define SIZEOF_LONG 8 -#if defined(SIZEOF_LONG) -# define HAVE_LONG 1 -#endif - -#define SIZEOF_LONG_LONG 8 -#if defined(SIZEOF_LONG_LONG) -# define HAVE_LONG_LONG 1 -#endif - - -#define SIZEOF_SIGSET_T 128 -#if defined(SIZEOF_SIGSET_T) -# define HAVE_SIGSET_T 1 -#endif - -#define SIZEOF_SIZE_T 8 -#if defined(SIZEOF_SIZE_T) -# define HAVE_SIZE_T 1 -#endif - -/* #undef SIZEOF_UCHAR */ -#if defined(SIZEOF_UCHAR) -# define HAVE_UCHAR 1 -#endif - -#define SIZEOF_UINT 4 -#if defined(SIZEOF_UINT) -# define HAVE_UINT 1 -#endif - -#define SIZEOF_ULONG 8 -#if defined(SIZEOF_ULONG) -# define HAVE_ULONG 1 -#endif - -/* #undef SIZEOF_INT8 */ -#if defined(SIZEOF_INT8) -# define HAVE_INT8 1 -#endif -/* #undef SIZEOF_UINT8 */ -#if defined(SIZEOF_UINT8) -# define HAVE_UINT8 1 -#endif - -/* #undef SIZEOF_INT16 */ -#if defined(SIZEOF_INT16) -# define HAVE_INT16 1 -#endif -/* #undef SIZEOF_UINT16 */ -#if defined(SIZEOF_UINT16) -# define HAVE_UINT16 1 -#endif - -/* #undef SIZEOF_INT32 */ -#if defined(SIZEOF_INT32) -# define HAVE_INT32 1 -#endif -/* #undef SIZEOF_UINT32 */ -#if defined(SIZEOF_UINT32) -# define HAVE_UINT32 1 -#endif -/* #undef SIZEOF_U_INT32_T */ -#if defined(SIZEOF_U_INT32_T) -# define HAVE_U_INT32_T 1 -#endif - -/* #undef SIZEOF_INT64 */ -#if defined(SIZEOF_INT64) -# define HAVE_INT64 1 -#endif -/* #undef SIZEOF_UINT64 */ -#if defined(SIZEOF_UINT64) -# define HAVE_UINT64 1 -#endif - -/* #undef SIZEOF_SOCKLEN_T */ -#if defined(SIZEOF_SOCKLEN_T) -# define HAVE_SOCKLEN_T 1 -#endif - -#define SOCKET_SIZE_TYPE socklen_t - -#define MARIADB_DEFAULT_CHARSET "latin1" - diff --git a/contrib/mariadb-connector-c-cmake/linux_x86_64/include/ma_config.h b/contrib/mariadb-connector-c-cmake/linux_x86_64/include/ma_config.h deleted file mode 100644 index 90c42c97df6..00000000000 --- a/contrib/mariadb-connector-c-cmake/linux_x86_64/include/ma_config.h +++ /dev/null @@ -1,269 +0,0 @@ - -/* - * Include file constants (processed in LibmysqlIncludeFiles.txt 1 - */ -#define HAVE_ALLOCA_H 1 -/* #undef HAVE_BIGENDIAN */ -#define HAVE_SETLOCALE 1 -#define HAVE_NL_LANGINFO 1 -#define HAVE_ARPA_INET_H 1 -#define HAVE_CRYPT_H 1 -#define HAVE_DIRENT_H 1 -#define HAVE_DLFCN_H 1 -#define HAVE_EXECINFO_H 1 -#define HAVE_FCNTL_H 1 -#define HAVE_FENV_H 1 -#define HAVE_FLOAT_H 1 -/* #undef HAVE_FPU_CONTROL_H */ -#define HAVE_GRP_H 1 -/* #undef HAVE_IEEEFP_H */ -#define HAVE_LIMITS_H 1 -#define HAVE_MALLOC_H 1 -#define HAVE_MEMORY_H 1 -#define HAVE_NETINET_IN_H 1 -#define HAVE_PATHS_H 1 -#define HAVE_PWD_H 1 -#define HAVE_SCHED_H 1 -/* #undef HAVE_SELECT_H */ -#define HAVE_STDDEF_H 1 -#define HAVE_STDINT_H 1 -#define HAVE_STDLIB_H 1 -#define HAVE_STRING_H 1 -#define HAVE_STRINGS_H 1 -/* #undef HAVE_SYNCH_H */ -/* #undef HAVE_SYS_FPU_H */ -#define HAVE_SYS_IOCTL_H 1 -#define HAVE_SYS_IPC_H 1 -#define HAVE_SYS_MMAN_H 1 -#define HAVE_SYS_PRCTL_H 1 -#define HAVE_SYS_SELECT_H 1 -#define HAVE_SYS_SHM_H 1 
-#define HAVE_SYS_SOCKET_H 1 -#define HAVE_SYS_STAT_H 1 -/* #undef HAVE_SYS_STREAM_H */ -#define HAVE_SYS_TIMEB_H 1 -#define HAVE_SYS_TYPES_H 1 -#define HAVE_SYS_UN_H 1 -/* #undef HAVE_SYSENT_H */ -#define HAVE_TERMIO_H 1 -#define HAVE_TERMIOS_H 1 -#define HAVE_UNISTD_H 1 -#define HAVE_UTIME_H 1 -#define HAVE_UCONTEXT_H 1 - -/* - * function definitions - processed in LibmysqlFunctions.txt - */ -#define HAVE_ACCESS 1 -/* #undef HAVE_AIOWAIT */ -#define HAVE_ALARM 1 -/* #undef HAVE_ALLOCA */ -#define HAVE_BCMP 1 -/* #undef HAVE_BFILL */ -/* #undef HAVE_BMOVE */ -#define HAVE_BZERO 1 -#define HAVE_CLOCK_GETTIME 1 -/* #undef HAVE_COMPRESS */ -/* #undef HAVE_CRYPT */ -#define HAVE_DLERROR 1 -#define HAVE_DLOPEN 1 -#define HAVE_FCHMOD 1 -#define HAVE_FCNTL 1 -/* #undef HAVE_FCONVERT */ -#define HAVE_FDATASYNC 1 -#define HAVE_FESETROUND 1 -#define HAVE_FINITE 1 -#define HAVE_FSEEKO 1 -#define HAVE_FSYNC 1 -#define HAVE_GETADDRINFO 1 -#define HAVE_GETCWD 1 -#define HAVE_GETHOSTBYADDR_R 1 -#define HAVE_GETHOSTBYNAME_R 1 -/* #undef HAVE_GETHRTIME */ -#define HAVE_GETNAMEINFO 1 -#define HAVE_GETPAGESIZE 1 -#define HAVE_GETPASS 1 -/* #undef HAVE_GETPASSPHRASE */ -#define HAVE_GETPWNAM 1 -#define HAVE_GETPWUID 1 -#define HAVE_GETRLIMIT 1 -#define HAVE_GETRUSAGE 1 -#define HAVE_GETWD 1 -#define HAVE_GMTIME_R 1 -#define HAVE_INITGROUPS 1 -#define HAVE_LDIV 1 -#define HAVE_LOCALTIME_R 1 -#define HAVE_LOG2 1 -#define HAVE_LONGJMP 1 -#define HAVE_LSTAT 1 -#define HAVE_MADVISE 1 -#define HAVE_MALLINFO 1 -#define HAVE_MEMALIGN 1 -#define HAVE_MEMCPY 1 -#define HAVE_MEMMOVE 1 -#define HAVE_MKSTEMP 1 -#define HAVE_MLOCK 1 -#define HAVE_MLOCKALL 1 -#define HAVE_MMAP 1 -#define HAVE_MMAP64 1 -#define HAVE_PERROR 1 -#define HAVE_POLL 1 -#define HAVE_PREAD 1 -/* #undef HAVE_PTHREAD_ATTR_CREATE */ -#define HAVE_PTHREAD_ATTR_GETSTACKSIZE 1 -/* #undef HAVE_PTHREAD_ATTR_SETPRIO */ -#define HAVE_PTHREAD_ATTR_SETSCHEDPARAM 1 -#define HAVE_PTHREAD_ATTR_SETSCOPE 1 -#define HAVE_PTHREAD_ATTR_SETSTACKSIZE 1 -/* #undef HAVE_PTHREAD_CONDATTR_CREATE */ -/* #undef HAVE_PTHREAD_INIT */ -#define HAVE_PTHREAD_KEY_DELETE 1 -#define HAVE_PTHREAD_KILL 1 -#define HAVE_PTHREAD_RWLOCK_RDLOCK 1 -/* #undef HAVE_PTHREAD_SETPRIO_NP */ -#define HAVE_PTHREAD_SETSCHEDPARAM 1 -#define HAVE_PTHREAD_SIGMASK 1 -/* #undef HAVE_PTHREAD_THREADMASK */ -/* #undef HAVE_PTHREAD_YIELD_NP */ -#define HAVE_READDIR_R 1 -#define HAVE_READLINK 1 -#define HAVE_REALPATH 1 -#define HAVE_RENAME 1 -#define HAVE_SCHED_YIELD 1 -#define HAVE_SELECT 1 -/* #undef HAVE_SETFD */ -/* #undef HAVE_SETFILEPOINTER */ -#define HAVE_SIGNAL 1 -#define HAVE_SIGACTION 1 -/* #undef HAVE_SIGTHREADMASK */ -#define HAVE_SIGWAIT 1 -#define HAVE_SLEEP 1 -#define HAVE_SNPRINTF 1 -/* #undef HAVE_SQLITE */ -#define HAVE_STPCPY 1 -#define HAVE_STRERROR 1 -/* #undef HAVE_STRLCPY */ -#define HAVE_STRNLEN 1 -#define HAVE_STRPBRK 1 -#define HAVE_STRSEP 1 -#define HAVE_STRSTR 1 -#define HAVE_STRTOK_R 1 -#define HAVE_STRTOL 1 -#define HAVE_STRTOLL 1 -#define HAVE_STRTOUL 1 -#define HAVE_STRTOULL 1 -/* #undef HAVE_TELL */ -/* #undef HAVE_THR_SETCONCURRENCY */ -/* #undef HAVE_THR_YIELD */ -#define HAVE_VASPRINTF 1 -#define HAVE_VSNPRINTF 1 - -/* - * types and sizes - */ -/* Types we may use */ -#define SIZEOF_CHAR 1 -#if defined(SIZEOF_CHAR) -# define HAVE_CHAR 1 -#endif - -#define SIZEOF_CHARP 8 -#if defined(SIZEOF_CHARP) -# define HAVE_CHARP 1 -#endif - -#define SIZEOF_SHORT 2 -#if defined(SIZEOF_SHORT) -# define HAVE_SHORT 1 -#endif - -#define SIZEOF_INT 4 -#if defined(SIZEOF_INT) -# define 
HAVE_INT 1 -#endif - -#define SIZEOF_LONG 8 -#if defined(SIZEOF_LONG) -# define HAVE_LONG 1 -#endif - -#define SIZEOF_LONG_LONG 8 -#if defined(SIZEOF_LONG_LONG) -# define HAVE_LONG_LONG 1 -#endif - - -#define SIZEOF_SIGSET_T 128 -#if defined(SIZEOF_SIGSET_T) -# define HAVE_SIGSET_T 1 -#endif - -#define SIZEOF_SIZE_T 8 -#if defined(SIZEOF_SIZE_T) -# define HAVE_SIZE_T 1 -#endif - -/* #undef SIZEOF_UCHAR */ -#if defined(SIZEOF_UCHAR) -# define HAVE_UCHAR 1 -#endif - -#define SIZEOF_UINT 4 -#if defined(SIZEOF_UINT) -# define HAVE_UINT 1 -#endif - -#define SIZEOF_ULONG 8 -#if defined(SIZEOF_ULONG) -# define HAVE_ULONG 1 -#endif - -/* #undef SIZEOF_INT8 */ -#if defined(SIZEOF_INT8) -# define HAVE_INT8 1 -#endif -/* #undef SIZEOF_UINT8 */ -#if defined(SIZEOF_UINT8) -# define HAVE_UINT8 1 -#endif - -/* #undef SIZEOF_INT16 */ -#if defined(SIZEOF_INT16) -# define HAVE_INT16 1 -#endif -/* #undef SIZEOF_UINT16 */ -#if defined(SIZEOF_UINT16) -# define HAVE_UINT16 1 -#endif - -/* #undef SIZEOF_INT32 */ -#if defined(SIZEOF_INT32) -# define HAVE_INT32 1 -#endif -/* #undef SIZEOF_UINT32 */ -#if defined(SIZEOF_UINT32) -# define HAVE_UINT32 1 -#endif -/* #undef SIZEOF_U_INT32_T */ -#if defined(SIZEOF_U_INT32_T) -# define HAVE_U_INT32_T 1 -#endif - -/* #undef SIZEOF_INT64 */ -#if defined(SIZEOF_INT64) -# define HAVE_INT64 1 -#endif -/* #undef SIZEOF_UINT64 */ -#if defined(SIZEOF_UINT64) -# define HAVE_UINT64 1 -#endif - -/* #undef SIZEOF_SOCKLEN_T */ -#if defined(SIZEOF_SOCKLEN_T) -# define HAVE_SOCKLEN_T 1 -#endif - -#define SOCKET_SIZE_TYPE socklen_t - -#define MARIADB_DEFAULT_CHARSET "latin1" - diff --git a/contrib/mariadb-connector-c-cmake/linux_x86_64/include/mariadb_version.h b/contrib/mariadb-connector-c-cmake/linux_x86_64/include/mariadb_version.h deleted file mode 100644 index 821a7f8add2..00000000000 --- a/contrib/mariadb-connector-c-cmake/linux_x86_64/include/mariadb_version.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright Abandoned 1996, 1999, 2001 MySQL AB - This file is public domain and comes with NO WARRANTY of any kind */ - -/* Version numbers for protocol & mysqld */ - -#ifndef _mariadb_version_h_ -#define _mariadb_version_h_ - -#ifdef _CUSTOMCONFIG_ -#include -#else -#define PROTOCOL_VERSION 10 -#define MARIADB_CLIENT_VERSION_STR "10.3.6" -#define MARIADB_BASE_VERSION "mariadb-10.3" -#define MARIADB_VERSION_ID 100306 -#define MYSQL_VERSION_ID 100306 -#define MARIADB_PORT 3306 -#define MARIADB_UNIX_ADDR "/var/run/mysqld/mysqld.sock" -#define MYSQL_CONFIG_NAME "my" - -#define MARIADB_PACKAGE_VERSION "3.0.6" -#define MARIADB_PACKAGE_VERSION_ID 30006 -#define MARIADB_SYSTEM_TYPE "Linux" -#define MARIADB_MACHINE_TYPE "x86_64" -#define MARIADB_PLUGINDIR "lib/mariadb/plugin" - -/* mysqld compile time options */ -#ifndef MYSQL_CHARSET -#define MYSQL_CHARSET "" -#endif -#endif - -/* Source information */ -#define CC_SOURCE_REVISION "a0fd36cc5a5313414a5a2ebe9322577a29b4782a" - -#endif /* _mariadb_version_h_ */ diff --git a/contrib/mariadb-connector-c-cmake/linux_x86_64/libmariadb/ma_client_plugin.c b/contrib/mariadb-connector-c-cmake/linux_x86_64/libmariadb/ma_client_plugin.c deleted file mode 100644 index 434a4b3f4c3..00000000000 --- a/contrib/mariadb-connector-c-cmake/linux_x86_64/libmariadb/ma_client_plugin.c +++ /dev/null @@ -1,502 +0,0 @@ -/* Copyright (C) 2010 - 2012 Sergei Golubchik and Monty Program Ab - 2015-2016 MariaDB Corporation AB - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the 
Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public - License along with this library; if not see - or write to the Free Software Foundation, Inc., - 51 Franklin St., Fifth Floor, Boston, MA 02110, USA */ - -/** - @file - - Support code for the client side (libmariadb) plugins - - Client plugins are somewhat different from server plugins, they are simpler. - - They do not need to be installed or in any way explicitly loaded on the - client, they are loaded automatically on demand. - One client plugin per shared object, soname *must* match the plugin name. - - There is no reference counting and no unloading either. -*/ - -#if _MSC_VER -/* Silence warnings about variable 'unused' being used. */ -#define FORCE_INIT_OF_VARS 1 -#endif - -#include -#include -#include -#include -#include - -#include "errmsg.h" -#include - -struct st_client_plugin_int { - struct st_client_plugin_int *next; - void *dlhandle; - struct st_mysql_client_plugin *plugin; -}; - -static my_bool initialized= 0; -static MA_MEM_ROOT mem_root; - -static uint valid_plugins[][2]= { - {MYSQL_CLIENT_AUTHENTICATION_PLUGIN, MYSQL_CLIENT_AUTHENTICATION_PLUGIN_INTERFACE_VERSION}, - {MARIADB_CLIENT_PVIO_PLUGIN, MARIADB_CLIENT_PVIO_PLUGIN_INTERFACE_VERSION}, - {MARIADB_CLIENT_TRACE_PLUGIN, MARIADB_CLIENT_TRACE_PLUGIN_INTERFACE_VERSION}, - {MARIADB_CLIENT_CONNECTION_PLUGIN, MARIADB_CLIENT_CONNECTION_PLUGIN_INTERFACE_VERSION}, - {0, 0} -}; - -/* - Loaded plugins are stored in a linked list. - The list is append-only, the elements are added to the head (like in a stack). - The elements are added under a mutex, but the list can be read and traversed - without any mutex because once an element is added to the list, it stays - there. The main purpose of a mutex is to prevent two threads from - loading the same plugin twice in parallel. 
-*/ - - -struct st_client_plugin_int *plugin_list[MYSQL_CLIENT_MAX_PLUGINS + MARIADB_CLIENT_MAX_PLUGINS]; -#ifdef THREAD -static pthread_mutex_t LOCK_load_client_plugin; -#endif - -extern struct st_mysql_client_plugin mysql_native_password_client_plugin; -extern struct st_mysql_client_plugin mysql_old_password_client_plugin; -extern struct st_mysql_client_plugin pvio_socket_client_plugin; -extern struct st_mysql_client_plugin sha256_password_client_plugin; -extern struct st_mysql_client_plugin caching_sha2_password_client_plugin; - - -struct st_mysql_client_plugin *mysql_client_builtins[]= -{ - (struct st_mysql_client_plugin *)&mysql_native_password_client_plugin, - (struct st_mysql_client_plugin *)&mysql_old_password_client_plugin, - (struct st_mysql_client_plugin *)&pvio_socket_client_plugin, - (struct st_mysql_client_plugin *)&sha256_password_client_plugin, - (struct st_mysql_client_plugin *)&caching_sha2_password_client_plugin, - 0 -}; - - -static int is_not_initialized(MYSQL *mysql, const char *name) -{ - if (initialized) - return 0; - - my_set_error(mysql, CR_AUTH_PLUGIN_CANNOT_LOAD, - SQLSTATE_UNKNOWN, ER(CR_AUTH_PLUGIN_CANNOT_LOAD), - name, "not initialized"); - return 1; -} - -static int get_plugin_nr(uint type) -{ - uint i= 0; - for(; valid_plugins[i][1]; i++) - if (valid_plugins[i][0] == type) - return i; - return -1; -} - -static const char *check_plugin_version(struct st_mysql_client_plugin *plugin, unsigned int version) -{ - if (plugin->interface_version < version || - (plugin->interface_version >> 8) > (version >> 8)) - return "Incompatible client plugin interface"; - return 0; -} - -/** - finds a plugin in the list - - @param name plugin name to search for - @param type plugin type - - @note this does NOT necessarily need a mutex, take care! 
- - @retval a pointer to a found plugin or 0 -*/ -static struct st_mysql_client_plugin *find_plugin(const char *name, int type) -{ - struct st_client_plugin_int *p; - int plugin_nr= get_plugin_nr(type); - - DBUG_ASSERT(initialized); - if (plugin_nr == -1) - return 0; - - if (!name) - return plugin_list[plugin_nr]->plugin; - - for (p= plugin_list[plugin_nr]; p; p= p->next) - { - if (strcmp(p->plugin->name, name) == 0) - return p->plugin; - } - return NULL; -} - - -/** - verifies the plugin and adds it to the list - - @param mysql MYSQL structure (for error reporting) - @param plugin plugin to install - @param dlhandle a handle to the shared object (returned by dlopen) - or 0 if the plugin was not dynamically loaded - @param argc number of arguments in the 'va_list args' - @param args arguments passed to the plugin initialization function - - @retval a pointer to an installed plugin or 0 -*/ - -static struct st_mysql_client_plugin * -add_plugin(MYSQL *mysql, struct st_mysql_client_plugin *plugin, void *dlhandle, - int argc, va_list args) -{ - const char *errmsg; - struct st_client_plugin_int plugin_int, *p; - char errbuf[1024]; - int plugin_nr; - - DBUG_ASSERT(initialized); - - plugin_int.plugin= plugin; - plugin_int.dlhandle= dlhandle; - - if ((plugin_nr= get_plugin_nr(plugin->type)) == -1) - { - errmsg= "Unknown client plugin type"; - goto err1; - } - if ((errmsg= check_plugin_version(plugin, valid_plugins[plugin_nr][1]))) - goto err1; - - /* Call the plugin initialization function, if any */ - if (plugin->init && plugin->init(errbuf, sizeof(errbuf), argc, args)) - { - errmsg= errbuf; - goto err1; - } - - p= (struct st_client_plugin_int *) - ma_memdup_root(&mem_root, (char *)&plugin_int, sizeof(plugin_int)); - - if (!p) - { - errmsg= "Out of memory"; - goto err2; - } - -#ifdef THREAD - safe_mutex_assert_owner(&LOCK_load_client_plugin); -#endif - - p->next= plugin_list[plugin_nr]; - plugin_list[plugin_nr]= p; - - return plugin; - -err2: - if (plugin->deinit) - plugin->deinit(); -err1: - my_set_error(mysql, CR_AUTH_PLUGIN_CANNOT_LOAD, SQLSTATE_UNKNOWN, - ER(CR_AUTH_PLUGIN_CANNOT_LOAD), plugin->name, errmsg); - if (dlhandle) - (void)dlclose(dlhandle); - return NULL; -} - - -/** - Loads plugins which are specified in the environment variable - LIBMYSQL_PLUGINS. - - Multiple plugins must be separated by semicolon. This function doesn't - return or log an error. - - The function is be called by mysql_client_plugin_init - - @todo - Support extended syntax, passing parameters to plugins, for example - LIBMYSQL_PLUGINS="plugin1(param1,param2);plugin2;..." - or - LIBMYSQL_PLUGINS="plugin1=int:param1,str:param2;plugin2;..." -*/ - -static void load_env_plugins(MYSQL *mysql) -{ - char *plugs, *free_env, *s= getenv("LIBMYSQL_PLUGINS"); - - if (ma_check_env_str(s)) - return; - - free_env= strdup(s); - plugs= s= free_env; - - do { - if ((s= strchr(plugs, ';'))) - *s= '\0'; - mysql_load_plugin(mysql, plugs, -1, 0); - plugs= s + 1; - } while (s); - - free(free_env); -} - -/********** extern functions to be used by libmariadb *********************/ - -/** - Initializes the client plugin layer. - - This function must be called before any other client plugin function. 
- - @retval 0 successful - @retval != 0 error occurred -*/ - -int mysql_client_plugin_init() -{ - MYSQL mysql; - struct st_mysql_client_plugin **builtin; - va_list unused; - LINT_INIT_STRUCT(unused); - - if (initialized) - return 0; - - memset(&mysql, 0, sizeof(mysql)); /* dummy mysql for set_mysql_extended_error */ - - pthread_mutex_init(&LOCK_load_client_plugin, MY_MUTEX_INIT_SLOW); - ma_init_alloc_root(&mem_root, 128, 128); - - memset(&plugin_list, 0, sizeof(plugin_list)); - - initialized= 1; - - pthread_mutex_lock(&LOCK_load_client_plugin); - for (builtin= mysql_client_builtins; *builtin; builtin++) - add_plugin(&mysql, *builtin, 0, 0, unused); - - pthread_mutex_unlock(&LOCK_load_client_plugin); - - load_env_plugins(&mysql); - - return 0; -} - - -/** - Deinitializes the client plugin layer. - - Unloades all client plugins and frees any associated resources. -*/ - -void mysql_client_plugin_deinit() -{ - int i; - struct st_client_plugin_int *p; - - if (!initialized) - return; - - for (i=0; i < MYSQL_CLIENT_MAX_PLUGINS; i++) - for (p= plugin_list[i]; p; p= p->next) - { - if (p->plugin->deinit) - p->plugin->deinit(); - if (p->dlhandle) - (void)dlclose(p->dlhandle); - } - - memset(&plugin_list, 0, sizeof(plugin_list)); - initialized= 0; - ma_free_root(&mem_root, MYF(0)); - pthread_mutex_destroy(&LOCK_load_client_plugin); -} - -/************* public facing functions, for client consumption *********/ - -/* see for a full description */ -struct st_mysql_client_plugin * STDCALL -mysql_client_register_plugin(MYSQL *mysql, - struct st_mysql_client_plugin *plugin) -{ - va_list unused; - LINT_INIT_STRUCT(unused); - - if (is_not_initialized(mysql, plugin->name)) - return NULL; - - pthread_mutex_lock(&LOCK_load_client_plugin); - - /* make sure the plugin wasn't loaded meanwhile */ - if (find_plugin(plugin->name, plugin->type)) - { - my_set_error(mysql, CR_AUTH_PLUGIN_CANNOT_LOAD, - SQLSTATE_UNKNOWN, ER(CR_AUTH_PLUGIN_CANNOT_LOAD), - plugin->name, "it is already loaded"); - plugin= NULL; - } - else - plugin= add_plugin(mysql, plugin, 0, 0, unused); - - pthread_mutex_unlock(&LOCK_load_client_plugin); - return plugin; -} - - -/* see for a full description */ -struct st_mysql_client_plugin * STDCALL -mysql_load_plugin_v(MYSQL *mysql, const char *name, int type, - int argc, va_list args) -{ - const char *errmsg; -#ifdef _WIN32 - char errbuf[1024]; -#endif - char dlpath[FN_REFLEN+1]; - void *sym, *dlhandle = NULL; - struct st_mysql_client_plugin *plugin; - char *env_plugin_dir= getenv("MARIADB_PLUGIN_DIR"); - - CLEAR_CLIENT_ERROR(mysql); - if (is_not_initialized(mysql, name)) - return NULL; - - pthread_mutex_lock(&LOCK_load_client_plugin); - - /* make sure the plugin wasn't loaded meanwhile */ - if (type >= 0 && find_plugin(name, type)) - { - errmsg= "it is already loaded"; - goto err; - } - - /* Compile dll path */ - snprintf(dlpath, sizeof(dlpath) - 1, "%s/%s%s", - mysql->options.extension && mysql->options.extension->plugin_dir ? - mysql->options.extension->plugin_dir : (env_plugin_dir) ? 
env_plugin_dir : - MARIADB_PLUGINDIR, name, SO_EXT); - - /* Open new dll handle */ - if (!(dlhandle= dlopen((const char *)dlpath, RTLD_NOW))) - { -#ifdef _WIN32 - char winmsg[255]; - size_t len; - winmsg[0] = 0; - FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, - NULL, - GetLastError(), - MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), - winmsg, 255, NULL); - len= strlen(winmsg); - while (len > 0 && (winmsg[len - 1] == '\n' || winmsg[len - 1] == '\r')) - len--; - if (len) - winmsg[len] = 0; - snprintf(errbuf, sizeof(errbuf), "%s Library path is '%s'", winmsg, dlpath); - errmsg= errbuf; -#else - errmsg= dlerror(); -#endif - goto err; - } - - - if (!(sym= dlsym(dlhandle, plugin_declarations_sym))) - { - errmsg= "not a plugin"; - (void)dlclose(dlhandle); - goto err; - } - - plugin= (struct st_mysql_client_plugin*)sym; - - if (type >=0 && type != plugin->type) - { - errmsg= "type mismatch"; - goto err; - } - - if (strcmp(name, plugin->name)) - { - errmsg= "name mismatch"; - goto err; - } - - if (type < 0 && find_plugin(name, plugin->type)) - { - errmsg= "it is already loaded"; - goto err; - } - - plugin= add_plugin(mysql, plugin, dlhandle, argc, args); - - pthread_mutex_unlock(&LOCK_load_client_plugin); - - return plugin; - -err: - if (dlhandle) - dlclose(dlhandle); - pthread_mutex_unlock(&LOCK_load_client_plugin); - my_set_error(mysql, CR_AUTH_PLUGIN_CANNOT_LOAD, SQLSTATE_UNKNOWN, - ER(CR_AUTH_PLUGIN_CANNOT_LOAD), name, errmsg); - return NULL; -} - - -/* see for a full description */ -struct st_mysql_client_plugin * STDCALL -mysql_load_plugin(MYSQL *mysql, const char *name, int type, int argc, ...) -{ - struct st_mysql_client_plugin *p; - va_list args; - va_start(args, argc); - p= mysql_load_plugin_v(mysql, name, type, argc, args); - va_end(args); - return p; -} - -/* see for a full description */ -struct st_mysql_client_plugin * STDCALL -mysql_client_find_plugin(MYSQL *mysql, const char *name, int type) -{ - struct st_mysql_client_plugin *p; - int plugin_nr= get_plugin_nr(type); - - if (is_not_initialized(mysql, name)) - return NULL; - - if (plugin_nr == -1) - { - my_set_error(mysql, CR_AUTH_PLUGIN_CANNOT_LOAD, SQLSTATE_UNKNOWN, - ER(CR_AUTH_PLUGIN_CANNOT_LOAD), name, "invalid type"); - } - - if ((p= find_plugin(name, type))) - return p; - - /* not found, load it */ - return mysql_load_plugin(mysql, name, type, 0); -} - diff --git a/libs/libmysqlxx/CMakeLists.txt b/libs/libmysqlxx/CMakeLists.txt index 263a031d7b0..25d81380fc8 100644 --- a/libs/libmysqlxx/CMakeLists.txt +++ b/libs/libmysqlxx/CMakeLists.txt @@ -29,7 +29,9 @@ add_library (mysqlxx target_include_directories (mysqlxx PUBLIC include) if (USE_INTERNAL_MYSQL_LIBRARY) - + target_include_directories (mysqlxx PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c/include) + target_include_directories (mysqlxx PUBLIC ${ClickHouse_BINARY_DIR}/contrib/mariadb-connector-c/include) + target_include_directories (mysqlxx PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c-cmake/common/include) else () set(PLATFORM_LIBRARIES ${CMAKE_DL_LIBS}) diff --git a/libs/libmysqlxx/cmake/find_mysqlclient.cmake b/libs/libmysqlxx/cmake/find_mysqlclient.cmake index 98b42a0a9b4..e07ebe2304d 100644 --- a/libs/libmysqlxx/cmake/find_mysqlclient.cmake +++ b/libs/libmysqlxx/cmake/find_mysqlclient.cmake @@ -6,14 +6,14 @@ if(ENABLE_MYSQL) option(USE_INTERNAL_MYSQL_LIBRARY "Set to FALSE to use system mysqlclient library instead of bundled" OFF) endif() - if(USE_INTERNAL_MYSQL_LIBRARY AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c/README.md") 
+ if(USE_INTERNAL_MYSQL_LIBRARY AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c/README") message(WARNING "submodule contrib/mariadb-connector-c is missing. to fix try run: \n git submodule update --init --recursive") set(USE_INTERNAL_MYSQL_LIBRARY 0) endif() if (USE_INTERNAL_MYSQL_LIBRARY) - set (MYSQLCLIENT_LIBRARIES mysqlclient) + set (MYSQLCLIENT_LIBRARIES mariadbclient) set (USE_MYSQL 1) set (MYSQLXX_LIBRARY mysqlxx) else () From fd0c76131f2f3195fdce43109af58b9a6de7a4a5 Mon Sep 17 00:00:00 2001 From: Yuriy Date: Sun, 15 Sep 2019 23:20:47 +0300 Subject: [PATCH 103/309] removed contrib/mariadb-connector-c-cmake --- .../mariadb-connector-c-cmake/common/include/mysql/mysql.h | 1 - .../common/include/mysql/mysqld_error.h | 1 - libs/libmysqlxx/CMakeLists.txt | 1 - libs/libmysqlxx/src/Connection.cpp | 4 ++-- libs/libmysqlxx/src/Exception.cpp | 4 ++-- libs/libmysqlxx/src/Pool.cpp | 6 +++--- libs/libmysqlxx/src/Query.cpp | 4 ++-- libs/libmysqlxx/src/ResultBase.cpp | 4 ++-- libs/libmysqlxx/src/Row.cpp | 4 ++-- libs/libmysqlxx/src/StoreQueryResult.cpp | 4 ++-- libs/libmysqlxx/src/UseQueryResult.cpp | 4 ++-- 11 files changed, 17 insertions(+), 20 deletions(-) delete mode 100644 contrib/mariadb-connector-c-cmake/common/include/mysql/mysql.h delete mode 100644 contrib/mariadb-connector-c-cmake/common/include/mysql/mysqld_error.h diff --git a/contrib/mariadb-connector-c-cmake/common/include/mysql/mysql.h b/contrib/mariadb-connector-c-cmake/common/include/mysql/mysql.h deleted file mode 100644 index 741c7ba03c9..00000000000 --- a/contrib/mariadb-connector-c-cmake/common/include/mysql/mysql.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/contrib/mariadb-connector-c-cmake/common/include/mysql/mysqld_error.h b/contrib/mariadb-connector-c-cmake/common/include/mysql/mysqld_error.h deleted file mode 100644 index 95d26eef163..00000000000 --- a/contrib/mariadb-connector-c-cmake/common/include/mysql/mysqld_error.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/libs/libmysqlxx/CMakeLists.txt b/libs/libmysqlxx/CMakeLists.txt index 25d81380fc8..2d2ad75628d 100644 --- a/libs/libmysqlxx/CMakeLists.txt +++ b/libs/libmysqlxx/CMakeLists.txt @@ -31,7 +31,6 @@ target_include_directories (mysqlxx PUBLIC include) if (USE_INTERNAL_MYSQL_LIBRARY) target_include_directories (mysqlxx PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c/include) target_include_directories (mysqlxx PUBLIC ${ClickHouse_BINARY_DIR}/contrib/mariadb-connector-c/include) - target_include_directories (mysqlxx PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c-cmake/common/include) else () set(PLATFORM_LIBRARIES ${CMAKE_DL_LIBS}) diff --git a/libs/libmysqlxx/src/Connection.cpp b/libs/libmysqlxx/src/Connection.cpp index 80971549444..b44249b4ca6 100644 --- a/libs/libmysqlxx/src/Connection.cpp +++ b/libs/libmysqlxx/src/Connection.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/Exception.cpp b/libs/libmysqlxx/src/Exception.cpp index dadd37e29e7..b065d17ed51 100644 --- a/libs/libmysqlxx/src/Exception.cpp +++ b/libs/libmysqlxx/src/Exception.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/Pool.cpp b/libs/libmysqlxx/src/Pool.cpp index a17246e5d6d..410ac062039 100644 --- a/libs/libmysqlxx/src/Pool.cpp +++ b/libs/libmysqlxx/src/Pool.cpp @@ -1,6 +1,6 @@ -#if __has_include() -#include -#include +#if __has_include() +#include +#include #else 
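// A minimal sketch of the include-selection pattern the libmysqlxx hunks in this patch
// apply; the exact header names (<mariadb/mysql.h>, <mysql/mysql.h>) are assumptions,
// not taken verbatim from the patch:
//
//     #if __has_include(<mariadb/mysql.h>)   /* bundled mariadb-connector-c from contrib/ */
//     #    include <mariadb/mysql.h>
//     #else
//     #    include <mysql/mysql.h>           /* system-wide client library */
//     #endif
//
// __has_include is standardized since C++17 (and long available as a compiler extension),
// which is what lets the same source build against either the bundled or the system client.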
#include #include diff --git a/libs/libmysqlxx/src/Query.cpp b/libs/libmysqlxx/src/Query.cpp index 6f275c918a5..dc5c3274641 100644 --- a/libs/libmysqlxx/src/Query.cpp +++ b/libs/libmysqlxx/src/Query.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/ResultBase.cpp b/libs/libmysqlxx/src/ResultBase.cpp index b03f92e38f2..eac1e22ca3d 100644 --- a/libs/libmysqlxx/src/ResultBase.cpp +++ b/libs/libmysqlxx/src/ResultBase.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/Row.cpp b/libs/libmysqlxx/src/Row.cpp index e4baa681d69..aecec46e519 100644 --- a/libs/libmysqlxx/src/Row.cpp +++ b/libs/libmysqlxx/src/Row.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/StoreQueryResult.cpp b/libs/libmysqlxx/src/StoreQueryResult.cpp index 05ad4299e17..a09986a3014 100644 --- a/libs/libmysqlxx/src/StoreQueryResult.cpp +++ b/libs/libmysqlxx/src/StoreQueryResult.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/UseQueryResult.cpp b/libs/libmysqlxx/src/UseQueryResult.cpp index c5c52ffcb9c..19daca90b15 100644 --- a/libs/libmysqlxx/src/UseQueryResult.cpp +++ b/libs/libmysqlxx/src/UseQueryResult.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif From 040e63eba472e3b18edf93601073ea14704a18bf Mon Sep 17 00:00:00 2001 From: Yuriy Date: Mon, 16 Sep 2019 01:30:55 +0300 Subject: [PATCH 104/309] disabled tests of mariadb-connector-c --- contrib/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 5525aef61db..5241ae75ec4 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -144,6 +144,7 @@ if (ENABLE_MYSQL AND USE_INTERNAL_MYSQL_LIBRARY) set(CLIENT_PLUGIN_DIALOG OFF) set(CLIENT_PLUGIN_CLIENT_ED25519 OFF) set(CLIENT_PLUGIN_MYSQL_CLEAR_PASSWORD OFF) + set(SKIP_TESTS 1) add_subdirectory (mariadb-connector-c) endif () From 89524cff46af38103975f343d3376a0f27a75d7b Mon Sep 17 00:00:00 2001 From: Yuriy Date: Mon, 16 Sep 2019 05:32:38 +0300 Subject: [PATCH 105/309] link with glibc-compatibility before libm --- contrib/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 8713fd7d711..0574dc33a1d 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -132,6 +132,7 @@ if (ENABLE_MYSQL AND USE_INTERNAL_MYSQL_LIBRARY) set(CLIENT_PLUGIN_CLIENT_ED25519 OFF) set(CLIENT_PLUGIN_MYSQL_CLEAR_PASSWORD OFF) set(SKIP_TESTS 1) + set(LIBM glibc-compatibility) add_subdirectory (mariadb-connector-c) endif () From e9336c9166166832138a819b9c2ddfd9a997a62b Mon Sep 17 00:00:00 2001 From: CurtizJ Date: Mon, 16 Sep 2019 19:17:56 +0300 Subject: [PATCH 106/309] improvements of redis external dictionary --- .../Dictionaries/RedisBlockInputStream.cpp | 116 +++++++----------- dbms/src/Dictionaries/RedisBlockInputStream.h | 9 +- .../Dictionaries/RedisDictionarySource.cpp | 65 ++++++---- dbms/src/Dictionaries/RedisDictionarySource.h | 27 ++-- 4 files changed, 101 insertions(+), 116 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index cc8f1d005de..016a13cf9e0 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ 
b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -33,10 +33,11 @@ namespace DB RedisBlockInputStream::RedisBlockInputStream( const std::shared_ptr & client_, - const Poco::Redis::Array & keys_, + const RedisArray & keys_, + const RedisStorageType & storage_type_, const DB::Block & sample_block, const size_t max_block_size_) - : client(client_), keys(keys_), max_block_size{max_block_size_} + : client(client_), keys(keys_), storage_type(storage_type_), max_block_size{max_block_size_} { description.init(sample_block); } @@ -47,9 +48,8 @@ namespace DB namespace { using ValueType = ExternalResultDescription::ValueType; - using RedisArray = Poco::Redis::Array; - bool isNull(const Poco::Redis::RedisType::Ptr & value) + bool isNullString(const Poco::Redis::RedisType::Ptr & value) { return value->isBulkString() && static_cast *>(value.get())->value().isNull(); @@ -131,14 +131,12 @@ namespace DB break; } } - - void insertDefaultValue(IColumn & column, const IColumn & sample_column) { column.insertFrom(sample_column, 0); } } Block RedisBlockInputStream::readImpl() { - if (keys.isNull() || description.sample_block.rows() == 0 || keys.size() == 0) + if (keys.isNull() || description.sample_block.rows() == 0 || cursor >= keys.size()) all_read = true; if (all_read) @@ -163,43 +161,31 @@ namespace DB insertValue(*columns[idx], description.types[idx].first, value, name); }; - if (keys.begin()->get()->isArray()) + if (storage_type == RedisStorageType::HASH_MAP) { size_t num_rows = 0; while (num_rows < max_block_size && !all_read) { if (cursor >= keys.size()) - { - all_read = true; break; - } - const auto & primary_with_secondary = *(keys.begin() + cursor); - const auto & keys_array = - static_cast *>(primary_with_secondary.get())->value(); + const auto & keys_array = keys.get(cursor); if (keys_array.size() < 2) { throw Exception{"Too low keys in request to source: " + DB::toString(keys_array.size()) - + ", expected 2 or more", - ErrorCodes::LOGICAL_ERROR}; + + ", expected 2 or more", ErrorCodes::LOGICAL_ERROR}; } + if (num_rows + keys_array.size() - 1 > max_block_size) - { - if (num_rows == 0) - throw Exception{"Too many (" + DB::toString(keys_array.size()) + ") key attributes", - ErrorCodes::LIMIT_EXCEEDED}; break; - } - Poco::Redis::Command commandForValues("HMGET"); - for (size_t i = 0; i < keys_array.size(); ++i) - { - const auto & secondary_key = *(keys_array.begin() + i); - commandForValues.addRedisType(secondary_key); - } + Poco::Redis::Command command_for_values("HMGET"); + for (auto it = keys_array.begin(); it != keys_array.end(); ++it) + command_for_values.addRedisType(*it); + ++cursor; + auto values = client->execute(command_for_values); - Poco::Redis::Array values = client->execute(commandForValues); if (keys_array.size() != values.size() + 1) // 'HMGET' primary_key secondary_keys throw Exception{"Inconsistent sizes of keys and values in Redis request", ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH}; @@ -209,14 +195,12 @@ namespace DB { const auto & secondary_key = *(keys_array.begin() + i + 1); const auto & value = *(values.begin() + i); + if (value.isNull()) - { - insertValueByIdx(0, primary_key); - insertValueByIdx(1, secondary_key); - insertDefaultValue(*columns[2], *description.sample_block.getByPosition(2).column); - ++num_rows; - } - else if (!isNull(value)) // null string means 'no value for requested key' + throw Exception("Got NULL value in response from Redis", ErrorCodes::LOGICAL_ERROR); + + /// null string means 'no value for requested key' + if (!isNullString(value)) { 
insertValueByIdx(0, primary_key); insertValueByIdx(1, secondary_key); @@ -228,46 +212,34 @@ namespace DB } else { - size_t num_rows = 0; - while (num_rows < max_block_size && !all_read) + Poco::Redis::Command command_for_values("MGET"); + + // keys.size() > 0 + for (size_t i = 0; i < max_block_size && cursor < keys.size(); ++i) { - Poco::Redis::Command commandForValues("MGET"); + const auto & key = *(keys.begin() + cursor); + command_for_values.addRedisType(key); + ++cursor; + } - // keys.size() > 0 - for (size_t i = 0; i < max_block_size && cursor < keys.size(); ++i) + auto values = client->execute(command_for_values); + if (command_for_values.size() != values.size() + 1) // 'MGET' keys + throw Exception{"Inconsistent sizes of keys and values in Redis request", + ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH}; + + for (size_t i = 0; i < values.size(); ++i) + { + const auto & key = *(keys.begin() + cursor - i - 1); + const auto & value = *(values.begin() + values.size() - i - 1); + + if (value.isNull()) + throw Exception("Got NULL value in response from Redis", ErrorCodes::LOGICAL_ERROR); + + /// null string means 'no value for requested key' + if (!isNullString(value)) { - const auto & key = *(keys.begin() + cursor); - commandForValues.addRedisType(key); - ++cursor; - } - - if (commandForValues.size() == 1) // only 'MGET' - { - all_read = true; - break; - } - - Poco::Redis::Array values = client->execute(commandForValues); - if (commandForValues.size() != values.size() + 1) // 'MGET' keys - throw Exception{"Inconsistent sizes of keys and values in Redis request", - ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH}; - - for (size_t i = 0; i < values.size(); ++i) - { - const auto & key = *(keys.begin() + cursor - i - 1); - const auto & value = *(values.begin() + values.size() - i - 1); - if (value.isNull()) - { - insertValueByIdx(0, key); - insertDefaultValue(*columns[1], *description.sample_block.getByPosition(1).column); - ++num_rows; - } - else if (!isNull(value)) // null string means 'no value for requested key' - { - insertValueByIdx(0, key); - insertValueByIdx(1, value); - ++num_rows; - } + insertValueByIdx(0, key); + insertValueByIdx(1, value); } } } diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.h b/dbms/src/Dictionaries/RedisBlockInputStream.h index dc64ee0fdd4..5034e16080b 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.h +++ b/dbms/src/Dictionaries/RedisBlockInputStream.h @@ -3,7 +3,7 @@ #include #include #include - +#include "RedisDictionarySource.h" namespace Poco { @@ -11,6 +11,7 @@ namespace Poco { class Array; class Client; + class RedisType; } } @@ -20,9 +21,14 @@ namespace DB class RedisBlockInputStream final : public IBlockInputStream { public: + using RedisArray = Poco::Redis::Array; + using RedisTypePtr = Poco::Redis::RedisType::Ptr; + using RedisBulkString = Poco::Redis::BulkString; + RedisBlockInputStream( const std::shared_ptr & client_, const Poco::Redis::Array & keys_, + const RedisStorageType & storage_type_, const Block & sample_block, const size_t max_block_size); @@ -37,6 +43,7 @@ namespace DB std::shared_ptr client; Poco::Redis::Array keys; + RedisStorageType storage_type; const size_t max_block_size; ExternalResultDescription description; size_t cursor = 0; diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index 92d7644db1f..5d67dd6ae92 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -57,7 +57,7 @@ namespace DB } - 
static const size_t max_block_size = 8192; + static const size_t max_block_size = 4; RedisDictionarySource::RedisDictionarySource( @@ -65,7 +65,7 @@ namespace DB const std::string & host_, UInt16 port_, UInt8 db_index_, - RedisStorageType::Id storage_type_, + RedisStorageType storage_type_, const Block & sample_block_) : dict_struct{dict_struct_} , host{host_} @@ -80,11 +80,12 @@ namespace DB DB::toString(dict_struct.attributes.size()) + ", expected 1", ErrorCodes::INVALID_CONFIG_PARAMETER}; - { if (storage_type == RedisStorageType::HASH_MAP) + { if (!dict_struct.key.has_value()) throw Exception{"Redis source with storage type \'hash_map\' must have key", ErrorCodes::INVALID_CONFIG_PARAMETER}; + if (dict_struct.key.value().size() > 2) throw Exception{"Redis source with complex keys having more than 2 attributes are unsupported", ErrorCodes::INVALID_CONFIG_PARAMETER}; @@ -93,7 +94,7 @@ namespace DB if (db_index != 0) { - Poco::Redis::Command command("SELECT"); + RedisCommand command("SELECT"); command << static_cast(db_index); std::string reply = client->execute(command); if (reply != "+OK\r\n") @@ -132,55 +133,65 @@ namespace DB RedisDictionarySource::~RedisDictionarySource() = default; - static std::string storageTypeToKeyType(RedisStorageType::Id type) + static std::string storageTypeToKeyType(RedisStorageType type) { switch (type) { - case RedisStorageType::Id::SIMPLE: + case RedisStorageType::SIMPLE: return "string"; - case RedisStorageType::Id::HASH_MAP: + case RedisStorageType::HASH_MAP: return "hash"; default: return "none"; } - __builtin_unreachable(); + __builtin_unreachable(); } BlockInputStreamPtr RedisDictionarySource::loadAll() { - Poco::Redis::Command command_for_keys("KEYS"); + RedisCommand command_for_keys("KEYS"); command_for_keys << "*"; /// Get only keys for specified storage type. - auto all_keys = client->execute(command_for_keys); - Poco::Redis::Array keys; + auto all_keys = client->execute(command_for_keys); + RedisArray keys; auto key_type = storageTypeToKeyType(storage_type); for (auto & key : all_keys) - if (key_type == client->execute(Poco::Redis::Command("TYPE").addRedisType(key))) + if (key_type == client->execute(RedisCommand("TYPE").addRedisType(key))) keys.addRedisType(std::move(key)); if (storage_type == RedisStorageType::HASH_MAP && !keys.isNull()) { - Poco::Redis::Array hkeys; + RedisArray hkeys; for (const auto & key : keys) { - Poco::Redis::Command command_for_secondary_keys("HKEYS"); + RedisCommand command_for_secondary_keys("HKEYS"); command_for_secondary_keys.addRedisType(key); - auto secondary_keys = client->execute(command_for_secondary_keys); + auto secondary_keys = client->execute(command_for_secondary_keys); - Poco::Redis::Array primary_with_secondary; + RedisArray primary_with_secondary; primary_with_secondary.addRedisType(key); for (const auto & secondary_key : secondary_keys) + { primary_with_secondary.addRedisType(secondary_key); - - hkeys.add(std::move(primary_with_secondary)); + /// Do not store more than max_block_size values for one request. 
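                    /// A hypothetical illustration of what this loop produces: for a hash key
                    /// "user:1" with fields "name", "age", "city" and max_block_size = 2, `hkeys`
                    /// would end up holding two arrays,
                    ///     ["user:1", "name", "age"]
                    ///     ["user:1", "city"]
                    /// and RedisBlockInputStream later turns each array into a single
                    /// `HMGET user:1 <fields...>` request, so one request never asks for more
                    /// than max_block_size values. The key and field names above are examples
                    /// only, not taken from the patch.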
+ if (primary_with_secondary.size() == max_block_size + 1) + { + hkeys.add(std::move(primary_with_secondary)); + primary_with_secondary.clear(); + primary_with_secondary.addRedisType(key); + } + } + if (primary_with_secondary.size() > 1) + hkeys.add(std::move(primary_with_secondary)); } + keys = std::move(hkeys); } - return std::make_shared(client, std::move(keys), sample_block, max_block_size); + return std::make_shared(client, std::move(keys), storage_type, sample_block, max_block_size); } @@ -192,12 +203,12 @@ namespace DB if (!dict_struct.id) throw Exception{"'id' is required for selective loading", ErrorCodes::UNSUPPORTED_METHOD}; - Poco::Redis::Array keys; + RedisArray keys; for (UInt64 id : ids) keys << DB::toString(id); - return std::make_shared(client, std::move(keys), sample_block, max_block_size); + return std::make_shared(client, std::move(keys), storage_type, sample_block, max_block_size); } std::string RedisDictionarySource::toString() const @@ -205,12 +216,14 @@ namespace DB return "Redis: " + host + ':' + DB::toString(port); } - RedisStorageType::Id RedisDictionarySource::parseStorageType(const std::string & storage_type) + RedisStorageType RedisDictionarySource::parseStorageType(const std::string & storage_type_str) { - RedisStorageType::Id storage_type_id = RedisStorageType::valueOf(storage_type); - if (storage_type_id == RedisStorageType::UNKNOWN) - storage_type_id = RedisStorageType::SIMPLE; - return storage_type_id; + if (storage_type_str == "hash_map") + return RedisStorageType::HASH_MAP; + else if (!storage_type_str.empty() && storage_type_str != "simple") + throw Exception("Unknown storage type " + storage_type_str + " for Redis dictionary", ErrorCodes::INVALID_CONFIG_PARAMETER); + + return RedisStorageType::SIMPLE; } } diff --git a/dbms/src/Dictionaries/RedisDictionarySource.h b/dbms/src/Dictionaries/RedisDictionarySource.h index d56de626a9a..19ba0a00e5f 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.h +++ b/dbms/src/Dictionaries/RedisDictionarySource.h @@ -18,30 +18,20 @@ namespace Poco namespace Redis { class Client; + class Array; + class Command; } } namespace DB { - namespace RedisStorageType + enum class RedisStorageType { - enum Id - { SIMPLE, HASH_MAP, UNKNOWN - }; - - Id valueOf(const std::string & value) - { - if (value == "simple") - return SIMPLE; - if (value == "hash_map") - return HASH_MAP; - return UNKNOWN; - } - } + }; class RedisDictionarySource final : public IDictionarySource { @@ -50,10 +40,13 @@ namespace DB const std::string & host, UInt16 port, UInt8 db_index, - RedisStorageType::Id storage_type, + RedisStorageType storage_type, const Block & sample_block); public: + using RedisArray = Poco::Redis::Array; + using RedisCommand = Poco::Redis::Command; + RedisDictionarySource( const DictionaryStructure & dict_struct, const Poco::Util::AbstractConfiguration & config, @@ -90,14 +83,14 @@ namespace DB std::string toString() const override; private: - static RedisStorageType::Id parseStorageType(const std::string& storage_type); + static RedisStorageType parseStorageType(const std::string& storage_type); private: const DictionaryStructure dict_struct; const std::string host; const UInt16 port; const UInt8 db_index; - const RedisStorageType::Id storage_type; + const RedisStorageType storage_type; Block sample_block; std::shared_ptr client; From 5ec000540033478b36dede5357e1c5812ba879ed Mon Sep 17 00:00:00 2001 From: root Date: Mon, 16 Sep 2019 17:28:41 +0000 Subject: [PATCH 107/309] +UpdatableSessionBase Committer: maqroll --- 
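This patch replaces the old exception-driven redirect handling (a Poco::URIRedirection thrown out
of receiveResponse and caught in makeReadWriteBufferFromHTTP) with sessions that can rebuild
themselves: receiveResponse only reports 3xx statuses when allow_redirects is set, and
ReadWriteBufferFromHTTPBase loops over Location headers, asking an UpdatableSessionBase to build a
fresh session for each new URI until a non-redirect response arrives or max_redirects (driven by
the max_http_get_redirects setting in StorageURL) is exceeded. Below is a minimal, self-contained
sketch of that idea; it assumes plain HTTP and an absolute Location header (as the patch itself
does), and the function name and parameters are illustrative, not part of the change.

#include <Poco/Net/HTTPClientSession.h>
#include <Poco/Net/HTTPRequest.h>
#include <Poco/Net/HTTPResponse.h>
#include <Poco/URI.h>
#include <istream>
#include <iterator>
#include <stdexcept>
#include <string>

static bool isRedirectStatus(Poco::Net::HTTPResponse::HTTPStatus status)
{
    return status == Poco::Net::HTTPResponse::HTTP_MOVED_PERMANENTLY
        || status == Poco::Net::HTTPResponse::HTTP_FOUND
        || status == Poco::Net::HTTPResponse::HTTP_SEE_OTHER
        || status == Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT;
}

/// Fetches `uri`, following up to `max_redirects` redirects by rebuilding the
/// session for every new location (mirroring what UpdatableSessionBase does).
static std::string fetchFollowingRedirects(Poco::URI uri, size_t max_redirects = 10)
{
    for (size_t redirects = 0; ; ++redirects)
    {
        Poco::Net::HTTPClientSession session(uri.getHost(), uri.getPort());
        Poco::Net::HTTPRequest request(
            Poco::Net::HTTPRequest::HTTP_GET,
            uri.getPathAndQuery().empty() ? "/" : uri.getPathAndQuery());
        Poco::Net::HTTPResponse response;

        session.sendRequest(request);
        std::istream & body = session.receiveResponse(response);

        if (!isRedirectStatus(response.getStatus()))
            return std::string(std::istreambuf_iterator<char>(body), {});

        if (redirects >= max_redirects)
            throw std::runtime_error("Too many redirects while trying to access " + uri.toString());

        /// Assumes the Location header carries an absolute URI, as the patch does.
        uri = Poco::URI(response.get("Location"));
    }
}

Rebuilding the session on every hop matters because a redirect may point at a different host, so a
session (or pooled session) bound to the original endpoint cannot simply be reused; that is why the
patch gives UpdatableSession and UpdatablePooledSession their own buildNewSession overrides.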
dbms/src/IO/HTTPCommon.cpp | 8 +- dbms/src/IO/HTTPCommon.h | 4 +- dbms/src/IO/ReadWriteBufferFromHTTP.cpp | 30 +--- dbms/src/IO/ReadWriteBufferFromHTTP.h | 210 +++++++++++++++++++----- dbms/src/IO/WriteBufferFromHTTP.cpp | 2 +- dbms/src/Storages/StorageURL.cpp | 2 +- 6 files changed, 176 insertions(+), 80 deletions(-) diff --git a/dbms/src/IO/HTTPCommon.cpp b/dbms/src/IO/HTTPCommon.cpp index eb48e544190..3d877fc7568 100644 --- a/dbms/src/IO/HTTPCommon.cpp +++ b/dbms/src/IO/HTTPCommon.cpp @@ -217,17 +217,15 @@ PooledHTTPSessionPtr makePooledHTTPSession(const Poco::URI & uri, const Connecti return HTTPSessionPool::instance().getSession(uri, timeouts, per_endpoint_pool_size); } +bool isRedirect(const Poco::Net::HTTPResponse::HTTPStatus status) { return status == Poco::Net::HTTPResponse::HTTP_MOVED_PERMANENTLY || status == Poco::Net::HTTPResponse::HTTP_FOUND || status == Poco::Net::HTTPResponse::HTTP_SEE_OTHER || status == Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT; } std::istream * receiveResponse( - Poco::Net::HTTPClientSession & session, const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response) + Poco::Net::HTTPClientSession & session, const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, const bool allow_redirects) { auto istr = &session.receiveResponse(response); auto status = response.getStatus(); - if ((request.getMethod() == Poco::Net::HTTPRequest::HTTP_GET) && (status == Poco::Net::HTTPResponse::HTTP_MOVED_PERMANENTLY || status == Poco::Net::HTTPResponse::HTTP_FOUND || status == Poco::Net::HTTPResponse::HTTP_SEE_OTHER || status == Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT)) - throw Poco::URIRedirection(response.get("Location")); - - if (status != Poco::Net::HTTPResponse::HTTP_OK) + if (!(status == Poco::Net::HTTPResponse::HTTP_OK || (isRedirect(status) && allow_redirects))) { std::stringstream error_message; error_message << "Received error from remote server " << request.getURI() << ". HTTP status code: " << status << " " diff --git a/dbms/src/IO/HTTPCommon.h b/dbms/src/IO/HTTPCommon.h index dda8d2aac7e..f05efa15d36 100644 --- a/dbms/src/IO/HTTPCommon.h +++ b/dbms/src/IO/HTTPCommon.h @@ -50,11 +50,13 @@ HTTPSessionPtr makeHTTPSession(const Poco::URI & uri, const ConnectionTimeouts & /// As previous method creates session, but tooks it from pool PooledHTTPSessionPtr makePooledHTTPSession(const Poco::URI & uri, const ConnectionTimeouts & timeouts, size_t per_endpoint_pool_size); +bool isRedirect(const Poco::Net::HTTPResponse::HTTPStatus status); + /** Used to receive response (response headers and possibly body) * after sending data (request headers and possibly body). * Throws exception in case of non HTTP_OK (200) response code. * Returned istream lives in 'session' object. 
*/ std::istream * receiveResponse( - Poco::Net::HTTPClientSession & session, const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response); + Poco::Net::HTTPClientSession & session, const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, bool allow_redirects); } diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.cpp b/dbms/src/IO/ReadWriteBufferFromHTTP.cpp index ec04de7802a..89e87020012 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.cpp +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.cpp @@ -9,33 +9,5 @@ namespace ErrorCodes extern const int TOO_MANY_REDIRECTS; } - -std::unique_ptr makeReadWriteBufferFromHTTP(const Poco::URI & uri, - const std::string & method, - std::function callback, - const DB::ConnectionTimeouts & timeouts, - const DB::SettingUInt64 max_redirects) - { - auto actual_uri =uri; - UInt64 redirects = 0; - - do - { - try - { - return std::make_unique(actual_uri, method, callback, timeouts); - } - catch (Poco::URIRedirection & exc) - { - redirects++; - actual_uri = exc.uri(); - } - } while(max_redirects>redirects); - - // too many redirects.... - std::stringstream error_message; - error_message << "Too many redirects while trying to access " << uri.toString() ; - - throw Exception(error_message.str(), ErrorCodes::TOO_MANY_REDIRECTS); - } } + diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.h b/dbms/src/IO/ReadWriteBufferFromHTTP.h index 9abcd0edf03..47ab3233024 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.h +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.h @@ -27,39 +27,78 @@ namespace DB /** Perform HTTP POST request and provide response to read. */ +namespace ErrorCodes +{ + extern const int TOO_MANY_REDIRECTS; +} + +template +class UpdatableSessionBase +{ +protected: + SessionPtr session; + UInt64 redirects { 0 }; + Poco::URI initial_uri; + const ConnectionTimeouts & timeouts; + DB::SettingUInt64 max_redirects; + +public: + void buildNewSession(const Poco::URI & uri); + + explicit UpdatableSessionBase(const Poco::URI uri, + const ConnectionTimeouts & timeouts_, + SettingUInt64 max_redirects_) + : initial_uri { uri } + , timeouts { timeouts_ } + , max_redirects { max_redirects_ } + { + } + + SessionPtr getSession() + { + return session; + } + + void updateSession(const Poco::URI & uri) + { + if (redirects++ + template class ReadWriteBufferFromHTTPBase : public ReadBuffer { protected: Poco::URI uri; std::string method; - SessionPtr session; + UpdatableSessionPtr session; std::istream * istr; /// owned by session std::unique_ptr impl; + std::function out_stream_callback; + const Poco::Net::HTTPBasicCredentials & credentials; - public: - using OutStreamCallback = std::function; - - explicit ReadWriteBufferFromHTTPBase(SessionPtr session_, - Poco::URI uri_, - const std::string & method_ = {}, - OutStreamCallback out_stream_callback = {}, - const Poco::Net::HTTPBasicCredentials & credentials = {}, - size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE) - : ReadBuffer(nullptr, 0) - , uri {uri_} - , method {!method_.empty() ? method_ : out_stream_callback ? Poco::Net::HTTPRequest::HTTP_POST : Poco::Net::HTTPRequest::HTTP_GET} - , session {session_} + protected: + std::istream * call(const Poco::URI uri_, Poco::Net::HTTPResponse & response) { // With empty path poco will send "POST HTTP/1.1" its bug. 
if (uri.getPath().empty()) uri.setPath("/"); - Poco::Net::HTTPRequest request(method, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); - request.setHost(uri.getHost()); // use original, not resolved host name in header + Poco::Net::HTTPRequest request(method, uri_.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); + request.setHost(uri_.getHost()); // use original, not resolved host name in header if (out_stream_callback) request.setChunkedTransferEncoding(true); @@ -67,26 +106,70 @@ namespace detail if (!credentials.getUsername().empty()) credentials.authenticate(request); - Poco::Net::HTTPResponse response; - LOG_TRACE((&Logger::get("ReadWriteBufferFromHTTP")), "Sending request to " << uri.toString()); + auto sess = session->getSession(); + + auto & stream_out = sess->sendRequest(request); + + if (out_stream_callback) + out_stream_callback(stream_out); + try { - auto & stream_out = session->sendRequest(request); + istr = receiveResponse(*sess, request, response, true); - if (out_stream_callback) - out_stream_callback(stream_out); + return istr; - istr = receiveResponse(*session, request, response); + } + catch (const Poco::Exception & e) + { + /// We use session data storage as storage for exception text + /// Depend on it we can deduce to reconnect session or reresolve session host + sess->attachSessionData(e.message()); + throw; + } + } + public: + using OutStreamCallback = std::function; + + explicit ReadWriteBufferFromHTTPBase(UpdatableSessionPtr session_, + Poco::URI uri_, + const std::string & method_ = {}, + OutStreamCallback out_stream_callback_ = {}, + const Poco::Net::HTTPBasicCredentials & credentials_ = {}, + size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE) + : ReadBuffer(nullptr, 0) + , uri {uri_} + , method {!method_.empty() ? method_ : out_stream_callback ? 
Poco::Net::HTTPRequest::HTTP_POST : Poco::Net::HTTPRequest::HTTP_GET} + , session {session_} + , out_stream_callback {out_stream_callback_} + , credentials {credentials_} + { + Poco::Net::HTTPResponse response; + + istr = call(uri, response); + + while (isRedirect(response.getStatus())) + { + Poco::URI uri_redirect(response.get("Location")); + + session->updateSession(uri_redirect); + + istr = call(uri_redirect,response); + } + + try + { impl = std::make_unique(*istr, buffer_size_); } catch (const Poco::Exception & e) { /// We use session data storage as storage for exception text /// Depend on it we can deduce to reconnect session or reresolve session host - session->attachSessionData(e.message()); + auto sess = session->getSession(); + sess->attachSessionData(e.message()); throw; } } @@ -103,47 +186,88 @@ namespace detail }; } -class ReadWriteBufferFromHTTP : public detail::ReadWriteBufferFromHTTPBase +class UpdatableSession : public UpdatableSessionBase { - using Parent = detail::ReadWriteBufferFromHTTPBase; + using Parent = UpdatableSessionBase; + +public: + explicit UpdatableSession(const Poco::URI uri, + const ConnectionTimeouts & timeouts_, + const SettingUInt64 max_redirects_) + : Parent(uri, timeouts_, max_redirects_) + { + session = makeHTTPSession(initial_uri, timeouts); + } + + void buildNewSession(const Poco::URI uri) + { + session = makeHTTPSession(uri, timeouts); + } +}; + +class ReadWriteBufferFromHTTP : public detail::ReadWriteBufferFromHTTPBase> +{ + using Parent = detail::ReadWriteBufferFromHTTPBase>; public: explicit ReadWriteBufferFromHTTP(Poco::URI uri_, const std::string & method_ = {}, - OutStreamCallback out_stream_callback = {}, + OutStreamCallback out_stream_callback_ = {}, const ConnectionTimeouts & timeouts = {}, - const Poco::Net::HTTPBasicCredentials & credentials = {}, + const DB::SettingUInt64 max_redirects = 0, + const Poco::Net::HTTPBasicCredentials & credentials_ = {}, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE) - : Parent(makeHTTPSession(uri_, timeouts), uri_, method_, out_stream_callback, credentials, buffer_size_) + : Parent(std::make_shared(uri_, timeouts, max_redirects), uri_, method_, out_stream_callback_, credentials_, buffer_size_) { } }; -class PooledReadWriteBufferFromHTTP : public detail::ReadWriteBufferFromHTTPBase + +class UpdatablePooledSession : public UpdatableSessionBase { - using Parent = detail::ReadWriteBufferFromHTTPBase; + using Parent = UpdatableSessionBase; + +private: + size_t per_endpoint_pool_size; + +public: + explicit UpdatablePooledSession(const Poco::URI uri, + const ConnectionTimeouts & timeouts_, + const SettingUInt64 max_redirects_, + size_t per_endpoint_pool_size_) + : Parent(uri, timeouts_, max_redirects_) + , per_endpoint_pool_size { per_endpoint_pool_size_ } + { + session = makePooledHTTPSession(initial_uri, timeouts, per_endpoint_pool_size); + } + + void buildNewSession(const Poco::URI uri) + { + session = makePooledHTTPSession(uri, timeouts, per_endpoint_pool_size); + } +}; + +class PooledReadWriteBufferFromHTTP : public detail::ReadWriteBufferFromHTTPBase> +{ + using Parent = detail::ReadWriteBufferFromHTTPBase>; public: explicit PooledReadWriteBufferFromHTTP(Poco::URI uri_, const std::string & method_ = {}, - OutStreamCallback out_stream_callback = {}, - const ConnectionTimeouts & timeouts = {}, - const Poco::Net::HTTPBasicCredentials & credentials = {}, + OutStreamCallback out_stream_callback_ = {}, + const ConnectionTimeouts & timeouts_ = {}, + const Poco::Net::HTTPBasicCredentials & credentials_ = {}, 
size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, + const DB::SettingUInt64 max_redirects = 0, size_t max_connections_per_endpoint = DEFAULT_COUNT_OF_HTTP_CONNECTIONS_PER_ENDPOINT) - : Parent(makePooledHTTPSession(uri_, timeouts, max_connections_per_endpoint), + : Parent(std::make_shared(uri_, timeouts_, max_redirects, max_connections_per_endpoint), uri_, method_, - out_stream_callback, - credentials, + out_stream_callback_, + credentials_, buffer_size_) { } }; -std::unique_ptr makeReadWriteBufferFromHTTP(const Poco::URI & uri, - const std::string & method, - std::function callback, - const ConnectionTimeouts & timeouts, - const SettingUInt64 max_redirects); - } + diff --git a/dbms/src/IO/WriteBufferFromHTTP.cpp b/dbms/src/IO/WriteBufferFromHTTP.cpp index c74c74a0bd0..0a8095b960f 100644 --- a/dbms/src/IO/WriteBufferFromHTTP.cpp +++ b/dbms/src/IO/WriteBufferFromHTTP.cpp @@ -22,7 +22,7 @@ WriteBufferFromHTTP::WriteBufferFromHTTP( void WriteBufferFromHTTP::finalize() { - receiveResponse(*session, request, response); + receiveResponse(*session, request, response, false); /// TODO: Response body is ignored. } diff --git a/dbms/src/Storages/StorageURL.cpp b/dbms/src/Storages/StorageURL.cpp index ebbffd20675..074e99c533b 100644 --- a/dbms/src/Storages/StorageURL.cpp +++ b/dbms/src/Storages/StorageURL.cpp @@ -54,7 +54,7 @@ namespace const ConnectionTimeouts & timeouts) : name(name_) { - read_buf = makeReadWriteBufferFromHTTP(uri, method, callback, timeouts,context.getSettingsRef().max_http_get_redirects); + read_buf = std::make_unique(uri, method, callback, timeouts, context.getSettingsRef().max_http_get_redirects); reader = FormatFactory::instance().getInput(format, *read_buf, sample_block, context, max_block_size); } From 4df1f1bb9a217e3d15b5b7ed69f91cb415bdeceb Mon Sep 17 00:00:00 2001 From: CurtizJ Date: Tue, 17 Sep 2019 16:35:19 +0300 Subject: [PATCH 108/309] better integration test for redis dictionary (but still bad) --- .../Dictionaries/RedisDictionarySource.cpp | 8 +- dbms/tests/integration/helpers/cluster.py | 2 +- dbms/tests/integration/pytest.ini | 2 +- .../dictionary.py | 13 +- .../external_sources.py | 18 +- .../test.py | 218 +++++++++--- .../test_external_dictionaries/test_kv.py | 325 ------------------ 7 files changed, 201 insertions(+), 385 deletions(-) delete mode 100644 dbms/tests/integration/test_external_dictionaries/test_kv.py diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index 5d67dd6ae92..5957e891722 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -57,8 +57,7 @@ namespace DB } - static const size_t max_block_size = 4; - + static const size_t max_block_size = 8192; RedisDictionarySource::RedisDictionarySource( const DictionaryStructure & dict_struct_, @@ -155,13 +154,16 @@ namespace DB /// Get only keys for specified storage type. 
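        /// Roughly what happens next (a hypothetical example; the keys are illustrative):
        ///     KEYS *              ->  ["user:1", "requests_total"]
        ///     TYPE user:1         ->  "hash"    (kept for a hash_map source)
        ///     TYPE requests_total ->  "string"  (kept only for a simple source)
        /// i.e. every key in the selected db is listed and then filtered by TYPE, so that only
        /// keys matching the configured storage_type reach the block input stream.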
auto all_keys = client->execute(command_for_keys); + if (all_keys.isNull()) + return std::make_shared(client, RedisArray{}, storage_type, sample_block, max_block_size); + RedisArray keys; auto key_type = storageTypeToKeyType(storage_type); for (auto & key : all_keys) if (key_type == client->execute(RedisCommand("TYPE").addRedisType(key))) keys.addRedisType(std::move(key)); - if (storage_type == RedisStorageType::HASH_MAP && !keys.isNull()) + if (storage_type == RedisStorageType::HASH_MAP) { RedisArray hkeys; for (const auto & key : keys) diff --git a/dbms/tests/integration/helpers/cluster.py b/dbms/tests/integration/helpers/cluster.py index d3e4789d09e..30552975639 100644 --- a/dbms/tests/integration/helpers/cluster.py +++ b/dbms/tests/integration/helpers/cluster.py @@ -115,7 +115,7 @@ class ClickHouseCluster: cmd += " client" return cmd - def add_instance(self, name, config_dir=None, main_configs=[], user_configs=[], macros={}, with_zookeeper=False, with_mysql=False, with_kafka=False, clickhouse_path_dir=None, with_odbc_drivers=False, with_postgres=False, with_hdfs=False, with_mongo=False, with_redis=False, hostname=None, env_variables={}, image="yandex/clickhouse-integration-test", stay_alive=False, ipv4_address=None, ipv6_address=None): + def add_instance(self, name, config_dir=None, main_configs=[], user_configs=[], macros={}, with_zookeeper=False, with_mysql=False, with_kafka=False, clickhouse_path_dir=None, with_odbc_drivers=False, with_postgres=False, with_hdfs=False, with_mongo=False, with_redis=False, hostname=None, env_variables={}, image="yandex/clickhouse-integration-test", stay_alive=False, ipv4_address=None, ipv6_address=None, with_installed_binary=False): """Add an instance to the cluster. name - the name of the instance directory and the value of the 'instance' macro in ClickHouse. 
diff --git a/dbms/tests/integration/pytest.ini b/dbms/tests/integration/pytest.ini index de681b6e750..31364843b29 100644 --- a/dbms/tests/integration/pytest.ini +++ b/dbms/tests/integration/pytest.ini @@ -1,4 +1,4 @@ [pytest] -python_files = test*.py +python_files = test.py norecursedirs = _instances timeout = 600 diff --git a/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/dictionary.py b/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/dictionary.py index 6d53a5dfdd1..18e13fde2ad 100644 --- a/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/dictionary.py +++ b/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/dictionary.py @@ -44,6 +44,9 @@ class Row(object): for field, value in zip(fields, values): self.data[field.name] = value + def has_field(self, name): + return name in self.data + def get_value_by_name(self, name): return self.data[name] @@ -97,6 +100,7 @@ class DictionaryStructure(object): self.range_key = None self.ordinary_fields = [] self.range_fields = [] + self.has_hierarchy = False for field in fields: if field.is_key: @@ -105,6 +109,9 @@ class DictionaryStructure(object): self.range_fields.append(field) else: self.ordinary_fields.append(field) + + if field.hierarchical: + self.has_hierarchy = True if field.is_range_key: if self.range_key is not None: @@ -286,14 +293,13 @@ class DictionaryStructure(object): class Dictionary(object): - def __init__(self, name, structure, source, config_path, table_name, fields=None, values=None): + def __init__(self, name, structure, source, config_path, table_name, fields): self.name = name self.structure = copy.deepcopy(structure) self.source = copy.deepcopy(source) self.config_path = config_path self.table_name = table_name self.fields = fields - self.values = values def generate_config(self): with open(self.config_path, 'w') as result: @@ -343,3 +349,6 @@ class Dictionary(object): def is_complex(self): return self.structure.layout.is_complex + + def get_fields(self): + return self.fields diff --git a/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/external_sources.py b/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/external_sources.py index 58af8c6487b..d1503224e98 100644 --- a/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/external_sources.py +++ b/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/external_sources.py @@ -402,14 +402,20 @@ class SourceRedis(ExternalSource): def prepare(self, structure, table_name, cluster): self.client = redis.StrictRedis(host=self.internal_hostname, port=self.internal_port) self.prepared = True + self.ordered_names = structure.get_ordered_names() - def load_kv_data(self, values): + def load_data(self, data, table_name): self.client.flushdb() - if len(values[0]) == 2: - self.client.mset({value[0]: value[1] for value in values}) - else: - for value in values: - self.client.hset(value[0], value[1], value[2]) + for row in list(data): + values = [] + for name in self.ordered_names: + values.append(str(row.data[name])) + print 'values: ', values + if len(values) == 2: + self.client.set(*values) + print 'kek: ', self.client.get(values[0]) + else: + self.client.hset(*values) def compatible_with_layout(self, layout): if ( diff --git a/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/test.py b/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/test.py index 841a9124af0..01f9b15b51f 100644 --- 
a/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/test.py +++ b/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/test.py @@ -4,9 +4,10 @@ import os from helpers.cluster import ClickHouseCluster from dictionary import Field, Row, Dictionary, DictionaryStructure, Layout from external_sources import SourceMySQL, SourceClickHouse, SourceFile, SourceExecutableCache, SourceExecutableHashed -from external_sources import SourceMongo, SourceHTTP, SourceHTTPS +from external_sources import SourceMongo, SourceHTTP, SourceHTTPS, SourceRedis SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +dict_configs_path = os.path.join(SCRIPT_DIR, 'configs/dictionaries') FIELDS = { "simple": [ @@ -65,9 +66,44 @@ FIELDS = { Field("Float32_", 'Float32', default_value_for_get=555.11), Field("Float64_", 'Float64', default_value_for_get=777.11), ] - } +VALUES = { + "simple": [ + [1, 22, 333, 4444, 55555, -6, -77, + -888, -999, '550e8400-e29b-41d4-a716-446655440003', + '1973-06-28', '1985-02-28 23:43:25', 'hello', 22.543, 3332154213.4, 0], + [2, 3, 4, 5, 6, -7, -8, + -9, -10, '550e8400-e29b-41d4-a716-446655440002', + '1978-06-28', '1986-02-28 23:42:25', 'hello', 21.543, 3222154213.4, 1] + ], + "complex": [ + [1, 'world', 22, 333, 4444, 55555, -6, + -77, -888, -999, '550e8400-e29b-41d4-a716-446655440003', + '1973-06-28', '1985-02-28 23:43:25', + 'hello', 22.543, 3332154213.4], + [2, 'qwerty2', 52, 2345, 6544, 9191991, -2, + -717, -81818, -92929, '550e8400-e29b-41d4-a716-446655440007', + '1975-09-28', '2000-02-28 23:33:24', + 'my', 255.543, 3332221.44] + + ], + "ranged": [ + [1, '2019-02-10', '2019-02-01', '2019-02-28', + 22, 333, 4444, 55555, -6, -77, -888, -999, + '550e8400-e29b-41d4-a716-446655440003', + '1973-06-28', '1985-02-28 23:43:25', 'hello', + 22.543, 3332154213.4], + [2, '2019-04-10', '2019-04-01', '2019-04-28', + 11, 3223, 41444, 52515, -65, -747, -8388, -9099, + '550e8400-e29b-41d4-a716-446655440004', + '1973-06-29', '2002-02-28 23:23:25', '!!!!', + 32.543, 3332543.4] + ] +} + + + LAYOUTS = [ Layout("hashed"), Layout("cache"), @@ -91,36 +127,59 @@ SOURCES = [ DICTIONARIES = [] +# Key-value dictionaries with onle one possible field for key +SOURCES_KV = [ + SourceRedis("RedisSimple", "localhost", "6380", "redis1", "6379", "", "", storage_type="simple"), + SourceRedis("RedisHash", "localhost", "6380", "redis1", "6379", "", "", storage_type="hash_map"), +] + +DICTIONARIES_KV = [] + cluster = None node = None +def get_dict(source, layout, fields, suffix_name=''): + global dict_configs_path + + structure = DictionaryStructure(layout, fields) + dict_name = source.name + "_" + layout.name + '_' + suffix_name + dict_path = os.path.join(dict_configs_path, dict_name + '.xml') + dictionary = Dictionary(dict_name, structure, source, dict_path, "table_" + dict_name, fields) + dictionary.generate_config() + return dictionary def setup_module(module): global DICTIONARIES global cluster global node + global dict_configs_path - dict_configs_path = os.path.join(SCRIPT_DIR, 'configs/dictionaries') for f in os.listdir(dict_configs_path): os.remove(os.path.join(dict_configs_path, f)) for layout in LAYOUTS: for source in SOURCES: if source.compatible_with_layout(layout): - structure = DictionaryStructure(layout, FIELDS[layout.layout_type]) - dict_name = source.name + "_" + layout.name - dict_path = os.path.join(dict_configs_path, dict_name + '.xml') - dictionary = Dictionary(dict_name, structure, source, dict_path, "table_" + dict_name) - dictionary.generate_config() - 
DICTIONARIES.append(dictionary) + DICTIONARIES.append(get_dict(source, layout, FIELDS[layout.layout_type])) else: print "Source", source.name, "incompatible with layout", layout.name + + for layout in LAYOUTS: + field_keys = list(filter(lambda x: x.is_key, FIELDS[layout.layout_type])) + for source in SOURCES_KV: + if not source.compatible_with_layout(layout): + print "Source", source.name, "incompatible with layout", layout.name + continue + + for field in FIELDS[layout.layout_type]: + if not (field.is_key or field.is_range or field.is_range_key): + DICTIONARIES_KV.append(get_dict(source, layout, field_keys + [field], field.name)) main_configs = [] for fname in os.listdir(dict_configs_path): main_configs.append(os.path.join(dict_configs_path, fname)) cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs')) - node = cluster.add_instance('node', main_configs=main_configs, with_mysql=True, with_mongo=True) + node = cluster.add_instance('node', main_configs=main_configs, with_mysql=True, with_mongo=True, with_redis=True) cluster.add_instance('clickhouse1') @@ -128,7 +187,7 @@ def setup_module(module): def started_cluster(): try: cluster.start() - for dictionary in DICTIONARIES: + for dictionary in DICTIONARIES + DICTIONARIES_KV: print "Preparing", dictionary.name dictionary.prepare_source(cluster) print "Prepared" @@ -141,16 +200,8 @@ def started_cluster(): def test_simple_dictionaries(started_cluster): fields = FIELDS["simple"] - data = [ - Row(fields, - [1, 22, 333, 4444, 55555, -6, -77, - -888, -999, '550e8400-e29b-41d4-a716-446655440003', - '1973-06-28', '1985-02-28 23:43:25', 'hello', 22.543, 3332154213.4, 0]), - Row(fields, - [2, 3, 4, 5, 6, -7, -8, - -9, -10, '550e8400-e29b-41d4-a716-446655440002', - '1978-06-28', '1986-02-28 23:42:25', 'hello', 21.543, 3222154213.4, 1]), - ] + values = VALUES["simple"] + data = [Row(fields, vals) for vals in values] simple_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "simple"] for dct in simple_dicts: @@ -192,18 +243,8 @@ def test_simple_dictionaries(started_cluster): def test_complex_dictionaries(started_cluster): fields = FIELDS["complex"] - data = [ - Row(fields, - [1, 'world', 22, 333, 4444, 55555, -6, - -77, -888, -999, '550e8400-e29b-41d4-a716-446655440003', - '1973-06-28', '1985-02-28 23:43:25', - 'hello', 22.543, 3332154213.4]), - Row(fields, - [2, 'qwerty2', 52, 2345, 6544, 9191991, -2, - -717, -81818, -92929, '550e8400-e29b-41d4-a716-446655440007', - '1975-09-28', '2000-02-28 23:33:24', - 'my', 255.543, 3332221.44]), - ] + values = VALUES["complex"] + data = [Row(fields, vals) for vals in values] complex_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "complex"] for dct in complex_dicts: @@ -232,20 +273,8 @@ def test_complex_dictionaries(started_cluster): def test_ranged_dictionaries(started_cluster): fields = FIELDS["ranged"] - data = [ - Row(fields, - [1, '2019-02-10', '2019-02-01', '2019-02-28', - 22, 333, 4444, 55555, -6, -77, -888, -999, - '550e8400-e29b-41d4-a716-446655440003', - '1973-06-28', '1985-02-28 23:43:25', 'hello', - 22.543, 3332154213.4]), - Row(fields, - [2, '2019-04-10', '2019-04-01', '2019-04-28', - 11, 3223, 41444, 52515, -65, -747, -8388, -9099, - '550e8400-e29b-41d4-a716-446655440004', - '1973-06-29', '2002-02-28 23:23:25', '!!!!', - 32.543, 3332543.4]), - ] + values = VALUES["ranged"] + data = [Row(fields, vals) for vals in values] ranged_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "ranged"] for dct in 
ranged_dicts: @@ -264,3 +293,98 @@ def test_ranged_dictionaries(started_cluster): for query, answer in queries_with_answers: print query assert node.query(query) == str(answer) + '\n' + + +def test_key_value_simple_dictionaries(started_cluster): + fields = FIELDS["simple"] + values = VALUES["simple"] + data = [Row(fields, vals) for vals in values] + + simple_dicts = [d for d in DICTIONARIES_KV if d.structure.layout.layout_type == "simple"] + + for dct in simple_dicts: + queries_with_answers = [] + local_data = [] + for row in data: + local_fields = dct.get_fields() + local_values = [row.get_value_by_name(field.name) for field in local_fields if row.has_field(field.name)] + local_data.append(Row(local_fields, local_values)) + + dct.load_data(local_data) + + node.query("system reload dictionary {}".format(dct.name)) + + print 'name: ', dct.name + + for row in local_data: + print dct.get_fields() + for field in dct.get_fields(): + print field.name, field.is_key + if not field.is_key: + for query in dct.get_select_get_queries(field, row): + queries_with_answers.append((query, row.get_value_by_name(field.name))) + + for query in dct.get_select_has_queries(field, row): + queries_with_answers.append((query, 1)) + + for query in dct.get_select_get_or_default_queries(field, row): + queries_with_answers.append((query, field.default_value_for_get)) + + if dct.structure.has_hierarchy: + for query in dct.get_hierarchical_queries(data[0]): + queries_with_answers.append((query, [1])) + + for query in dct.get_hierarchical_queries(data[1]): + queries_with_answers.append((query, [2, 1])) + + for query in dct.get_is_in_queries(data[0], data[1]): + queries_with_answers.append((query, 0)) + + for query in dct.get_is_in_queries(data[1], data[0]): + queries_with_answers.append((query, 1)) + + for query, answer in queries_with_answers: + print query + if isinstance(answer, list): + answer = str(answer).replace(' ', '') + assert node.query(query) == str(answer) + '\n' + + +def test_key_value_complex_dictionaries(started_cluster): + fields = FIELDS["complex"] + values = VALUES["complex"] + data = [Row(fields, vals) for vals in values] + + complex_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "complex"] + for dct in complex_dicts: + dct.load_data(data) + + node.query("system reload dictionaries") + + for dct in complex_dicts: + queries_with_answers = [] + local_data = [] + for row in data: + local_fields = dct.get_fields() + local_values = [row.get_value_by_name(field.name) for field in local_fields if row.has_field(field.name)] + local_data.append(Row(local_fields, local_values)) + + dct.load_data(local_data) + + node.query("system reload dictionary {}".format(dct.name)) + + for row in local_data: + for field in dct.get_fields(): + if not field.is_key: + for query in dct.get_select_get_queries(field, row): + queries_with_answers.append((query, row.get_value_by_name(field.name))) + + for query in dct.get_select_has_queries(field, row): + queries_with_answers.append((query, 1)) + + for query in dct.get_select_get_or_default_queries(field, row): + queries_with_answers.append((query, field.default_value_for_get)) + + for query, answer in queries_with_answers: + print query + assert node.query(query) == str(answer) + '\n' diff --git a/dbms/tests/integration/test_external_dictionaries/test_kv.py b/dbms/tests/integration/test_external_dictionaries/test_kv.py deleted file mode 100644 index 69fa48d5e2e..00000000000 --- a/dbms/tests/integration/test_external_dictionaries/test_kv.py +++ /dev/null @@ 
-1,325 +0,0 @@ -import os - -import pytest -from dictionary import Field, Row, Dictionary, DictionaryStructure, Layout -from external_sources import SourceRedis, SourceAerospike - -from helpers.cluster import ClickHouseCluster - -SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) - -FIELDS = { - "simple": [ - Field("KeyField", 'UInt64', is_key=True, default_value_for_get=9999999), - Field("UInt8_", 'UInt8', default_value_for_get=55), - Field("UInt16_", 'UInt16', default_value_for_get=66), - Field("UInt32_", 'UInt32', default_value_for_get=77), - Field("UInt64_", 'UInt64', default_value_for_get=88), - Field("Int8_", 'Int8', default_value_for_get=-55), - Field("Int16_", 'Int16', default_value_for_get=-66), - Field("Int32_", 'Int32', default_value_for_get=-77), - Field("Int64_", 'Int64', default_value_for_get=-88), - Field("UUID_", 'UUID', default_value_for_get='550e8400-0000-0000-0000-000000000000'), - Field("Date_", 'Date', default_value_for_get='2018-12-30'), - Field("DateTime_", 'DateTime', default_value_for_get='2018-12-30 00:00:00'), - Field("String_", 'String', default_value_for_get='hi'), - Field("Float32_", 'Float32', default_value_for_get=555.11), - Field("Float64_", 'Float64', default_value_for_get=777.11), - Field("ParentKeyField", "UInt64", default_value_for_get=444, hierarchical=True), - ], - "complex": [ - Field("KeyField1", 'UInt64', is_key=True, default_value_for_get=9999999), - Field("KeyField2", 'String', is_key=True, default_value_for_get='xxxxxxxxx'), - Field("UInt8_", 'UInt8', default_value_for_get=55), - Field("UInt16_", 'UInt16', default_value_for_get=66), - Field("UInt32_", 'UInt32', default_value_for_get=77), - Field("UInt64_", 'UInt64', default_value_for_get=88), - Field("Int8_", 'Int8', default_value_for_get=-55), - Field("Int16_", 'Int16', default_value_for_get=-66), - Field("Int32_", 'Int32', default_value_for_get=-77), - Field("Int64_", 'Int64', default_value_for_get=-88), - Field("UUID_", 'UUID', default_value_for_get='550e8400-0000-0000-0000-000000000000'), - Field("Date_", 'Date', default_value_for_get='2018-12-30'), - Field("DateTime_", 'DateTime', default_value_for_get='2018-12-30 00:00:00'), - Field("String_", 'String', default_value_for_get='hi'), - Field("Float32_", 'Float32', default_value_for_get=555.11), - Field("Float64_", 'Float64', default_value_for_get=777.11), - ], - "ranged": [ - Field("KeyField1", 'UInt64', is_key=True), - Field("KeyField2", 'Date', is_range_key=True), - Field("StartDate", 'Date', range_hash_type='min'), - Field("EndDate", 'Date', range_hash_type='max'), - Field("UInt8_", 'UInt8', default_value_for_get=55), - Field("UInt16_", 'UInt16', default_value_for_get=66), - Field("UInt32_", 'UInt32', default_value_for_get=77), - Field("UInt64_", 'UInt64', default_value_for_get=88), - Field("Int8_", 'Int8', default_value_for_get=-55), - Field("Int16_", 'Int16', default_value_for_get=-66), - Field("Int32_", 'Int32', default_value_for_get=-77), - Field("Int64_", 'Int64', default_value_for_get=-88), - Field("UUID_", 'UUID', default_value_for_get='550e8400-0000-0000-0000-000000000000'), - Field("Date_", 'Date', default_value_for_get='2018-12-30'), - Field("DateTime_", 'DateTime', default_value_for_get='2018-12-30 00:00:00'), - Field("String_", 'String', default_value_for_get='hi'), - Field("Float32_", 'Float32', default_value_for_get=555.11), - Field("Float64_", 'Float64', default_value_for_get=777.11), - ], -} - -VALUES = { - "simple": [ - [ - 1, 22, 333, 4444, 55555, -6, -77, - -888, -999, '550e8400-e29b-41d4-a716-446655440003', - 
'1973-06-28', '1985-02-28 23:43:25', 'hello', 22.543, 3332154213.4, 0, - ], - [ - 2, 3, 4, 5, 6, -7, -8, - -9, -10, '550e8400-e29b-41d4-a716-446655440002', - '1978-06-28', '1986-02-28 23:42:25', 'hello', 21.543, 3222154213.4, 1, - ], - ], - "complex": [ - [ - 1, 'world', 22, 333, 4444, 55555, -6, - -77, -888, -999, '550e8400-e29b-41d4-a716-446655440003', - '1973-06-28', '1985-02-28 23:43:25', - 'hello', 22.543, 3332154213.4, - ], - [ - 2, 'qwerty2', 52, 2345, 6544, 9191991, -2, - -717, -81818, -92929, '550e8400-e29b-41d4-a716-446655440007', - '1975-09-28', '2000-02-28 23:33:24', - 'my', 255.543, 3332221.44, - ], - ], - "ranged": [ - [ - 1, '2019-02-10', '2019-02-01', '2019-02-28', - 22, 333, 4444, 55555, -6, -77, -888, -999, - '550e8400-e29b-41d4-a716-446655440003', - '1973-06-28', '1985-02-28 23:43:25', 'hello', - 22.543, 3332154213.4, - ], - [ - 2, '2019-04-10', '2019-04-01', '2019-04-28', - 11, 3223, 41444, 52515, -65, -747, -8388, -9099, - '550e8400-e29b-41d4-a716-446655440004', - '1973-06-29', '2002-02-28 23:23:25', '!!!!', - 32.543, 3332543.4, - ], - ], -} - -LAYOUTS = [ - Layout("flat"), - Layout("hashed"), - Layout("cache"), - Layout('complex_key_hashed_one_key'), - Layout('complex_key_hashed_two_keys'), - Layout("complex_key_cache"), - Layout("range_hashed"), -] - -SOURCES = [ - SourceRedis("RedisSimple", "localhost", "6380", "redis1", "6379", "", "", storage_type="simple"), - SourceRedis("RedisHash", "localhost", "6380", "redis1", "6379", "", "", storage_type="hash_map"), - # SourceAerospike("Aerospike", "localhost", "3000", "aerospike1", "3000", "", ""), -] - -DICTIONARIES = [] - -cluster = None -node = None - - -def setup_kv_dict(suffix, layout, fields, kv_source, dict_configs_path, values): - global DICTIONARIES - - structure = DictionaryStructure(layout, fields) - dict_name = "{}_{}_{}".format(kv_source.name, layout.name, suffix) - dict_path = os.path.join(dict_configs_path, dict_name + '.xml') - dictionary = Dictionary(dict_name, structure, kv_source, dict_path, "table_" + dict_name, fields, values) - dictionary.generate_config() - DICTIONARIES.append(dictionary) - - -def setup_module(module): - global DICTIONARIES - global cluster - global node - - dict_configs_path = os.path.join(SCRIPT_DIR, 'configs/dictionaries') - for f in os.listdir(dict_configs_path): - os.remove(os.path.join(dict_configs_path, f)) - - for layout in LAYOUTS: - for source in SOURCES: - if source.compatible_with_layout(layout): - if layout.layout_type == "simple": - fields_len = len(FIELDS["simple"]) - for i in range(fields_len - 1): - local_fields = [FIELDS["simple"][0], FIELDS["simple"][i + 1]] - local_values = [[value[0], value[i + 1]] for value in VALUES["simple"]] - setup_kv_dict(i + 1, layout, local_fields, source, dict_configs_path, local_values) - elif layout.layout_type == "complex": - fields_len = len(FIELDS["complex"]) - for i in range(fields_len - 2): - if layout.name == 'complex_key_hashed_two_keys': - local_fields = [FIELDS['complex'][0], FIELDS['complex'][1], FIELDS['complex'][i + 2]] - local_values = [[value[0], value[1], value[i + 2]] for value in VALUES["complex"]] - else: - local_fields = [FIELDS['complex'][1], FIELDS['complex'][i + 2]] - local_values = [[value[1], value[i + 2]] for value in VALUES["complex"]] - setup_kv_dict(i + 2, layout, local_fields, source, dict_configs_path, local_values) - elif layout.layout_type == "ranged": - fields_len = len(FIELDS["ranged"]) - local_fields = FIELDS["ranged"][0:5] - local_values = VALUES["ranged"][0:5] - for i in range(fields_len - 4): - 
local_fields[4] = FIELDS["ranged"][i + 4] - for j, value in enumerate(VALUES["ranged"]): - local_values[j][4] = value[i + 4] - setup_kv_dict(i + 2, layout, local_fields, source, dict_configs_path, local_values) - else: - print "Source", source.name, "incompatible with layout", layout.name - - main_configs = [] - for fname in os.listdir(dict_configs_path): - main_configs.append(os.path.join(dict_configs_path, fname)) - cluster = ClickHouseCluster(__file__, base_configs_dir=os.path.join(SCRIPT_DIR, 'configs')) - node = cluster.add_instance('node', main_configs=main_configs, with_redis=True) - cluster.add_instance('clickhouse1') - - -@pytest.fixture(scope="module") -def started_cluster(): - try: - cluster.start() - for dictionary in DICTIONARIES: - print "Preparing", dictionary.name - dictionary.prepare_source(cluster) - print "Prepared" - - yield cluster - - finally: - cluster.shutdown() - - -def prepare_data(fields, values_by_row): - return [Row(fields, values) for values in values_by_row] - - -def test_simple_kv_dictionaries(started_cluster): - simple_kv_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "simple"] - - for dct in simple_kv_dicts: - queries_with_answers = [] - fields = dct.fields - print("FIELDS AND VALUES FOR " + dct.name) - print(fields) - print(dct.values) - data = prepare_data(fields, dct.values) - dct.source.load_kv_data(dct.values) - - try: - node.query("system reload dictionary '{}'".format(dct.name)) - except Exception: - print(dct.name) - raise - - for row in data: - for field in fields: - if not field.is_key: - for query in dct.get_select_get_queries(field, row): - queries_with_answers.append((query, row.get_value_by_name(field.name))) - - for query in dct.get_select_has_queries(field, row): - queries_with_answers.append((query, 1)) - - for query in dct.get_select_get_or_default_queries(field, row): - queries_with_answers.append((query, field.default_value_for_get)) - if dct.fields[1].hierarchical: - for query in dct.get_hierarchical_queries(data[0]): - queries_with_answers.append((query, [1])) - - for query in dct.get_hierarchical_queries(data[1]): - queries_with_answers.append((query, [2, 1])) - - for query in dct.get_is_in_queries(data[0], data[1]): - queries_with_answers.append((query, 0)) - - for query in dct.get_is_in_queries(data[1], data[0]): - queries_with_answers.append((query, 1)) - - for query, answer in queries_with_answers: - if isinstance(answer, list): - answer = str(answer).replace(' ', '') - print query - assert node.query(query) == str(answer) + '\n', query - - -def test_complex_dictionaries(started_cluster): - complex_kv_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "complex"] - - for dct in complex_kv_dicts: - queries_with_answers = [] - fields = dct.fields - print("FIELDS AND VALUES FOR " + dct.name) - print(fields) - print(dct.values) - data = prepare_data(fields, dct.values) - dct.source.load_kv_data(dct.values) - - try: - node.query("system reload dictionary '{}'".format(dct.name)) - except Exception: - print(dct.name) - raise - - for row in data: - for field in fields: - if not field.is_key: - for query in dct.get_select_get_queries(field, row): - queries_with_answers.append((query, row.get_value_by_name(field.name))) - - for query in dct.get_select_has_queries(field, row): - queries_with_answers.append((query, 1)) - - for query in dct.get_select_get_or_default_queries(field, row): - queries_with_answers.append((query, field.default_value_for_get)) - - for query, answer in queries_with_answers: - 
print query - assert node.query(query) == str(answer) + '\n' - - -def test_ranged_dictionaries(started_cluster): - complex_kv_dicts = [d for d in DICTIONARIES if d.structure.layout.layout_type == "ranged"] - - for dct in complex_kv_dicts: - queries_with_answers = [] - fields = dct.fields - print("FIELDS AND VALUES FOR " + dct.name) - print(fields) - print(dct.values) - data = prepare_data(fields, dct.values) - dct.source.load_kv_data(dct.values) - - try: - node.query("system reload dictionary '{}'".format(dct.name)) - except Exception: - print(dct.name) - raise - - for row in data: - for field in fields: - if not field.is_key and not field.is_range: - for query in dct.get_select_get_queries(field, row): - queries_with_answers.append((query, row.get_value_by_name(field.name))) - - for query, answer in queries_with_answers: - print query - assert node.query(query) == str(answer) + '\n' From 4480e97f9fbbe825b13a4eb19ecd3056e750b2d5 Mon Sep 17 00:00:00 2001 From: CurtizJ Date: Tue, 17 Sep 2019 17:16:07 +0300 Subject: [PATCH 109/309] fix build --- dbms/src/Common/config.h.in | 1 - dbms/src/Dictionaries/RedisBlockInputStream.cpp | 3 +-- dbms/src/Dictionaries/RedisBlockInputStream.h | 3 --- dbms/src/Dictionaries/RedisDictionarySource.h | 2 +- .../Storages/System/StorageSystemBuildOptions.generated.cpp.in | 1 + 5 files changed, 3 insertions(+), 7 deletions(-) diff --git a/dbms/src/Common/config.h.in b/dbms/src/Common/config.h.in index ad017d3bf6b..7804068e5c4 100644 --- a/dbms/src/Common/config.h.in +++ b/dbms/src/Common/config.h.in @@ -10,4 +10,3 @@ #cmakedefine01 USE_BROTLI #cmakedefine01 USE_UNWIND #cmakedefine01 CLICKHOUSE_SPLIT_BINARY -#cmakedefine01 USE_POCO_REDIS diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index 016a13cf9e0..5b680a965a3 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -1,4 +1,4 @@ -#include +#include "RedisBlockInputStream.h" #if USE_POCO_REDIS # include @@ -17,7 +17,6 @@ # include # include "DictionaryStructure.h" -# include "RedisBlockInputStream.h" namespace DB diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.h b/dbms/src/Dictionaries/RedisBlockInputStream.h index 5034e16080b..578e644c9f8 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.h +++ b/dbms/src/Dictionaries/RedisBlockInputStream.h @@ -11,7 +11,6 @@ namespace Poco { class Array; class Client; - class RedisType; } } @@ -22,8 +21,6 @@ namespace DB { public: using RedisArray = Poco::Redis::Array; - using RedisTypePtr = Poco::Redis::RedisType::Ptr; - using RedisBulkString = Poco::Redis::BulkString; RedisBlockInputStream( const std::shared_ptr & client_, diff --git a/dbms/src/Dictionaries/RedisDictionarySource.h b/dbms/src/Dictionaries/RedisDictionarySource.h index 19ba0a00e5f..f63dd9545d2 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.h +++ b/dbms/src/Dictionaries/RedisDictionarySource.h @@ -1,6 +1,6 @@ #pragma once -#include +#include "config_core.h" #include #if USE_POCO_REDIS diff --git a/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in b/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in index 63ddfe15649..1bb87068426 100644 --- a/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in +++ b/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in @@ -61,6 +61,7 @@ const char * auto_config_build[] "USE_SSL", "@USE_SSL@", "USE_HYPERSCAN", "@USE_HYPERSCAN@", "USE_SIMDJSON", "@USE_SIMDJSON@", + 
"USE_POCO_REDIS", "@USE_POCO_REDIS", nullptr, nullptr }; From df82e4bde869e9a16d9fe0fa3a888e9ca9114ad1 Mon Sep 17 00:00:00 2001 From: CurtizJ Date: Tue, 17 Sep 2019 17:55:09 +0300 Subject: [PATCH 110/309] fix build --- dbms/src/Dictionaries/RedisBlockInputStream.cpp | 1 + dbms/src/Dictionaries/RedisBlockInputStream.h | 13 ++++++++++--- dbms/src/Dictionaries/RedisDictionarySource.cpp | 8 ++++---- .../StorageSystemBuildOptions.generated.cpp.in | 2 +- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index 5b680a965a3..daaae7d1d00 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -1,4 +1,5 @@ #include "RedisBlockInputStream.h" + #if USE_POCO_REDIS # include diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.h b/dbms/src/Dictionaries/RedisBlockInputStream.h index 578e644c9f8..448005f1ef5 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.h +++ b/dbms/src/Dictionaries/RedisBlockInputStream.h @@ -1,9 +1,13 @@ #pragma once +#include "config_core.h" #include -#include -#include -#include "RedisDictionarySource.h" + +#if USE_POCO_REDIS +# include +# include +# include "RedisDictionarySource.h" +# include namespace Poco { @@ -11,6 +15,7 @@ namespace Poco { class Array; class Client; + class RedisType; } } @@ -48,3 +53,5 @@ namespace DB }; } + +#endif diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index 5957e891722..ef901a4ea1b 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -19,10 +19,10 @@ namespace DB #if USE_POCO_REDIS return std::make_unique(dict_struct, config, config_prefix + ".redis", sample_block); #else - (void)dict_struct; - (void)config; - (void)config_prefix; - (void)sample_block; + UNUSED(dict_struct); + UNUSED(config); + UNUSED(config_prefix); + UNUSED(sample_block); throw Exception{"Dictionary source of type `redis` is disabled because poco library was built without redis support.", ErrorCodes::SUPPORT_IS_DISABLED}; #endif diff --git a/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in b/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in index 1bb87068426..25e7086c1a6 100644 --- a/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in +++ b/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in @@ -61,7 +61,7 @@ const char * auto_config_build[] "USE_SSL", "@USE_SSL@", "USE_HYPERSCAN", "@USE_HYPERSCAN@", "USE_SIMDJSON", "@USE_SIMDJSON@", - "USE_POCO_REDIS", "@USE_POCO_REDIS", + "USE_POCO_REDIS", "@USE_POCO_REDIS@", nullptr, nullptr }; From 7ccf04440a24e5f5f33a878a5c55b28310191d30 Mon Sep 17 00:00:00 2001 From: CurtizJ Date: Tue, 17 Sep 2019 20:57:48 +0300 Subject: [PATCH 111/309] better code in Redis external dictionary --- dbms/src/Common/ErrorCodes.cpp | 2 +- .../Dictionaries/RedisBlockInputStream.cpp | 93 ++++++------------- dbms/src/Dictionaries/RedisBlockInputStream.h | 4 +- .../Dictionaries/RedisDictionarySource.cpp | 28 +++--- 4 files changed, 44 insertions(+), 83 deletions(-) diff --git a/dbms/src/Common/ErrorCodes.cpp b/dbms/src/Common/ErrorCodes.cpp index c4aa1449e0f..06a967ecded 100644 --- a/dbms/src/Common/ErrorCodes.cpp +++ b/dbms/src/Common/ErrorCodes.cpp @@ -451,7 +451,7 @@ namespace ErrorCodes extern const int INVALID_TEMPLATE_FORMAT = 474; extern const int INVALID_WITH_FILL_EXPRESSION = 475; 
extern const int WITH_TIES_WITHOUT_ORDER_BY = 476; - extern const int INVALID_USAGE_OF_INPUT = 477; + extern const int INTERNAL_REDIS_ERROR = 477; extern const int KEEPER_EXCEPTION = 999; extern const int POCO_EXCEPTION = 1000; diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.cpp b/dbms/src/Dictionaries/RedisBlockInputStream.cpp index daaae7d1d00..ad3d9002b36 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.cpp +++ b/dbms/src/Dictionaries/RedisBlockInputStream.cpp @@ -26,8 +26,8 @@ namespace DB { extern const int TYPE_MISMATCH; extern const int LOGICAL_ERROR; - extern const int LIMIT_EXCEEDED; extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH; + extern const int INTERNAL_REDIS_ERROR; } @@ -49,42 +49,18 @@ namespace DB { using ValueType = ExternalResultDescription::ValueType; - bool isNullString(const Poco::Redis::RedisType::Ptr & value) - { - return value->isBulkString() && - static_cast *>(value.get())->value().isNull(); - } - - std::string getStringOrThrow(const Poco::Redis::RedisType::Ptr & value, const std::string & column_name) - { - switch (value->type()) - { - case Poco::Redis::RedisTypeTraits::TypeId: - { - const auto & bs = static_cast *>(value.get())->value(); - if (bs.isNull()) - throw Exception{"Type mismatch, expected not null String for column " + column_name, - ErrorCodes::TYPE_MISMATCH}; - return bs.value(); - } - case Poco::Redis::RedisTypeTraits::TypeId: - return static_cast *>(value.get())->value(); - default: - throw Exception{"Type mismatch, expected std::string, got type id = " + toString(value->type()) + " for column " + column_name, - ErrorCodes::TYPE_MISMATCH}; - } - } - template inline void insert(IColumn & column, const String & stringValue) { - static_cast &>(column).insertValue(parse(stringValue)); + assert_cast &>(column).insertValue(parse(stringValue)); } - void insertValue(IColumn & column, const ValueType type, const Poco::Redis::RedisType::Ptr & value, const std::string & name) + void insertValue(IColumn & column, const ValueType type, const Poco::Redis::BulkString & bulk_string) { - String stringValue = getStringOrThrow(value, name); + if (bulk_string.isNull()) + throw Exception{"Type mismatch, expected not Null String", ErrorCodes::TYPE_MISMATCH}; + String stringValue = bulk_string.value(); switch (type) { case ValueType::vtUInt8: @@ -118,16 +94,16 @@ namespace DB insert(column, stringValue); break; case ValueType::vtString: - static_cast(column).insert(parse(stringValue)); + assert_cast(column).insert(parse(stringValue)); break; case ValueType::vtDate: - static_cast(column).insertValue(parse(stringValue).getDayNum()); + assert_cast(column).insertValue(parse(stringValue).getDayNum()); break; case ValueType::vtDateTime: - static_cast(column).insertValue(static_cast(parse(stringValue))); + assert_cast(column).insertValue(static_cast(parse(stringValue))); break; case ValueType::vtUUID: - static_cast(column).insertValue(parse(stringValue)); + assert_cast(column).insertValue(parse(stringValue)); break; } } @@ -150,25 +126,21 @@ namespace DB const auto insertValueByIdx = [this, &columns](size_t idx, const auto & value) { - const auto & name = description.sample_block.getByPosition(idx).name; if (description.types[idx].second) { ColumnNullable & column_nullable = static_cast(*columns[idx]); - insertValue(column_nullable.getNestedColumn(), description.types[idx].first, value, name); + insertValue(column_nullable.getNestedColumn(), description.types[idx].first, value); column_nullable.getNullMapData().emplace_back(0); } else - 
insertValue(*columns[idx], description.types[idx].first, value, name); + insertValue(*columns[idx], description.types[idx].first, value); }; if (storage_type == RedisStorageType::HASH_MAP) { size_t num_rows = 0; - while (num_rows < max_block_size && !all_read) + for (; cursor < keys.size(); ++cursor) { - if (cursor >= keys.size()) - break; - const auto & keys_array = keys.get(cursor); if (keys_array.size() < 2) { @@ -183,24 +155,20 @@ namespace DB for (auto it = keys_array.begin(); it != keys_array.end(); ++it) command_for_values.addRedisType(*it); - ++cursor; auto values = client->execute(command_for_values); if (keys_array.size() != values.size() + 1) // 'HMGET' primary_key secondary_keys throw Exception{"Inconsistent sizes of keys and values in Redis request", ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH}; - const auto & primary_key = *keys_array.begin(); + const auto & primary_key = keys_array.get(0); for (size_t i = 0; i < values.size(); ++i) { - const auto & secondary_key = *(keys_array.begin() + i + 1); - const auto & value = *(values.begin() + i); - - if (value.isNull()) - throw Exception("Got NULL value in response from Redis", ErrorCodes::LOGICAL_ERROR); + const auto & secondary_key = keys_array.get(i + 1); + const auto & value = values.get(i); /// null string means 'no value for requested key' - if (!isNullString(value)) + if (!value.isNull()) { insertValueByIdx(0, primary_key); insertValueByIdx(1, secondary_key); @@ -214,34 +182,27 @@ namespace DB { Poco::Redis::Command command_for_values("MGET"); - // keys.size() > 0 - for (size_t i = 0; i < max_block_size && cursor < keys.size(); ++i) - { - const auto & key = *(keys.begin() + cursor); - command_for_values.addRedisType(key); - ++cursor; - } + size_t need_values = std::min(max_block_size, keys.size() - cursor); + for (size_t i = 0; i < need_values; ++i) + command_for_values.add(keys.get(cursor + i)); auto values = client->execute(command_for_values); - if (command_for_values.size() != values.size() + 1) // 'MGET' keys - throw Exception{"Inconsistent sizes of keys and values in Redis request", - ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH}; + if (values.size() != need_values) + throw Exception{"Inconsistent sizes of keys and values in Redis request", ErrorCodes::INTERNAL_REDIS_ERROR}; for (size_t i = 0; i < values.size(); ++i) { - const auto & key = *(keys.begin() + cursor - i - 1); - const auto & value = *(values.begin() + values.size() - i - 1); + const auto & key = keys.get(cursor + i); + const auto & value = values.get(i); - if (value.isNull()) - throw Exception("Got NULL value in response from Redis", ErrorCodes::LOGICAL_ERROR); - - /// null string means 'no value for requested key' - if (!isNullString(value)) + /// Null string means 'no value for requested key' + if (!value.isNull()) { insertValueByIdx(0, key); insertValueByIdx(1, value); } } + cursor += need_values; } return description.sample_block.cloneWithColumns(std::move(columns)); diff --git a/dbms/src/Dictionaries/RedisBlockInputStream.h b/dbms/src/Dictionaries/RedisBlockInputStream.h index 448005f1ef5..86448095787 100644 --- a/dbms/src/Dictionaries/RedisBlockInputStream.h +++ b/dbms/src/Dictionaries/RedisBlockInputStream.h @@ -8,14 +8,13 @@ # include # include "RedisDictionarySource.h" # include +# include namespace Poco { namespace Redis { - class Array; class Client; - class RedisType; } } @@ -26,6 +25,7 @@ namespace DB { public: using RedisArray = Poco::Redis::Array; + using RedisBulkString = Poco::Redis::BulkString; RedisBlockInputStream( const std::shared_ptr & 
client_, diff --git a/dbms/src/Dictionaries/RedisDictionarySource.cpp b/dbms/src/Dictionaries/RedisDictionarySource.cpp index ef901a4ea1b..905ae104dc0 100644 --- a/dbms/src/Dictionaries/RedisDictionarySource.cpp +++ b/dbms/src/Dictionaries/RedisDictionarySource.cpp @@ -13,7 +13,7 @@ namespace DB { auto createTableSource = [=](const DictionaryStructure & dict_struct, const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, + const String & config_prefix, Block & sample_block, const Context & /* context */) -> DictionarySourcePtr { #if USE_POCO_REDIS @@ -52,8 +52,8 @@ namespace DB namespace ErrorCodes { extern const int UNSUPPORTED_METHOD; - extern const int CANNOT_SELECT; extern const int INVALID_CONFIG_PARAMETER; + extern const int INTERNAL_REDIS_ERROR; } @@ -61,7 +61,7 @@ namespace DB RedisDictionarySource::RedisDictionarySource( const DictionaryStructure & dict_struct_, - const std::string & host_, + const String & host_, UInt16 port_, UInt8 db_index_, RedisStorageType storage_type_, @@ -81,12 +81,12 @@ namespace DB if (storage_type == RedisStorageType::HASH_MAP) { - if (!dict_struct.key.has_value()) + if (!dict_struct.key) throw Exception{"Redis source with storage type \'hash_map\' must have key", ErrorCodes::INVALID_CONFIG_PARAMETER}; - if (dict_struct.key.value().size() > 2) - throw Exception{"Redis source with complex keys having more than 2 attributes are unsupported", + if (dict_struct.key->size() != 2) + throw Exception{"Redis source with storage type \'hash_map\' requiers 2 keys", ErrorCodes::INVALID_CONFIG_PARAMETER}; // suppose key[0] is primary key, key[1] is secondary key } @@ -95,10 +95,10 @@ namespace DB { RedisCommand command("SELECT"); command << static_cast(db_index); - std::string reply = client->execute(command); + String reply = client->execute(command); if (reply != "+OK\r\n") - throw Exception{"Selecting db with index " + DB::toString(db_index) + " failed with reason " + reply, - ErrorCodes::CANNOT_SELECT}; + throw Exception{"Selecting database with index " + DB::toString(db_index) + + " failed with reason " + reply, ErrorCodes::INTERNAL_REDIS_ERROR}; } } @@ -106,7 +106,7 @@ namespace DB RedisDictionarySource::RedisDictionarySource( const DictionaryStructure & dict_struct_, const Poco::Util::AbstractConfiguration & config_, - const std::string & config_prefix_, + const String & config_prefix_, Block & sample_block_) : RedisDictionarySource( dict_struct_, @@ -132,7 +132,7 @@ namespace DB RedisDictionarySource::~RedisDictionarySource() = default; - static std::string storageTypeToKeyType(RedisStorageType type) + static String storageTypeToKeyType(RedisStorageType type) { switch (type) { @@ -160,7 +160,7 @@ namespace DB RedisArray keys; auto key_type = storageTypeToKeyType(storage_type); for (auto & key : all_keys) - if (key_type == client->execute(RedisCommand("TYPE").addRedisType(key))) + if (key_type == client->execute(RedisCommand("TYPE").addRedisType(key))) keys.addRedisType(std::move(key)); if (storage_type == RedisStorageType::HASH_MAP) @@ -213,12 +213,12 @@ namespace DB return std::make_shared(client, std::move(keys), storage_type, sample_block, max_block_size); } - std::string RedisDictionarySource::toString() const + String RedisDictionarySource::toString() const { return "Redis: " + host + ':' + DB::toString(port); } - RedisStorageType RedisDictionarySource::parseStorageType(const std::string & storage_type_str) + RedisStorageType RedisDictionarySource::parseStorageType(const String & storage_type_str) { if (storage_type_str == 
"hash_map") return RedisStorageType::HASH_MAP; From 392bdd60083a7398adda52a6f6aaf1d5f2fae2c2 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 18 Sep 2019 08:49:46 +0000 Subject: [PATCH 112/309] +UpdatableSessionBase Committer: maqroll --- dbms/src/IO/ReadWriteBufferFromHTTP.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.h b/dbms/src/IO/ReadWriteBufferFromHTTP.h index 47ab3233024..785182875a3 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.h +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.h @@ -43,7 +43,7 @@ protected: DB::SettingUInt64 max_redirects; public: - void buildNewSession(const Poco::URI & uri); + virtual void buildNewSession(const Poco::URI & uri) = 0; explicit UpdatableSessionBase(const Poco::URI uri, const ConnectionTimeouts & timeouts_, @@ -73,6 +73,10 @@ public: throw Exception(error_message.str(), ErrorCodes::TOO_MANY_REDIRECTS); } } + + virtual ~UpdatableSessionBase() + { + } }; namespace detail @@ -199,7 +203,7 @@ public: session = makeHTTPSession(initial_uri, timeouts); } - void buildNewSession(const Poco::URI uri) + void buildNewSession(const Poco::URI & uri) override { session = makeHTTPSession(uri, timeouts); } @@ -240,7 +244,7 @@ public: session = makePooledHTTPSession(initial_uri, timeouts, per_endpoint_pool_size); } - void buildNewSession(const Poco::URI uri) + void buildNewSession(const Poco::URI & uri) override { session = makePooledHTTPSession(uri, timeouts, per_endpoint_pool_size); } From 651f5b0e9ff63fe521992eb0f46cfb37cb326d6d Mon Sep 17 00:00:00 2001 From: CurtizJ Date: Wed, 18 Sep 2019 13:21:10 +0300 Subject: [PATCH 113/309] merging with master --- dbms/src/Common/ErrorCodes.cpp | 5 +---- dbms/tests/integration/helpers/cluster.py | 6 +----- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/dbms/src/Common/ErrorCodes.cpp b/dbms/src/Common/ErrorCodes.cpp index 54b155c3872..0347f2c164a 100644 --- a/dbms/src/Common/ErrorCodes.cpp +++ b/dbms/src/Common/ErrorCodes.cpp @@ -451,16 +451,13 @@ namespace ErrorCodes extern const int INVALID_TEMPLATE_FORMAT = 474; extern const int INVALID_WITH_FILL_EXPRESSION = 475; extern const int WITH_TIES_WITHOUT_ORDER_BY = 476; -<<<<<<< HEAD - extern const int INTERNAL_REDIS_ERROR = 477; -======= extern const int INVALID_USAGE_OF_INPUT = 477; extern const int UNKNOWN_POLICY = 478; extern const int UNKNOWN_DISK = 479; extern const int UNKNOWN_PROTOCOL = 480; extern const int PATH_ACCESS_DENIED = 481; extern const int DICTIONARY_ACCESS_DENIED = 482; ->>>>>>> upstream/master + extern const int INTERNAL_REDIS_ERROR = 483; extern const int KEEPER_EXCEPTION = 999; extern const int POCO_EXCEPTION = 1000; diff --git a/dbms/tests/integration/helpers/cluster.py b/dbms/tests/integration/helpers/cluster.py index 14139ba43a8..0ba48c32a8d 100644 --- a/dbms/tests/integration/helpers/cluster.py +++ b/dbms/tests/integration/helpers/cluster.py @@ -115,11 +115,7 @@ class ClickHouseCluster: cmd += " client" return cmd -<<<<<<< HEAD - def add_instance(self, name, config_dir=None, main_configs=[], user_configs=[], macros={}, with_zookeeper=False, with_mysql=False, with_kafka=False, clickhouse_path_dir=None, with_odbc_drivers=False, with_postgres=False, with_hdfs=False, with_mongo=False, with_redis=False, hostname=None, env_variables={}, image="yandex/clickhouse-integration-test", stay_alive=False, ipv4_address=None, ipv6_address=None, with_installed_binary=False): -======= - def add_instance(self, name, config_dir=None, main_configs=[], user_configs=[], macros={}, 
with_zookeeper=False, with_mysql=False, with_kafka=False, clickhouse_path_dir=None, with_odbc_drivers=False, with_postgres=False, with_hdfs=False, with_mongo=False, hostname=None, env_variables={}, image="yandex/clickhouse-integration-test", stay_alive=False, ipv4_address=None, ipv6_address=None, with_installed_binary=False, tmpfs=[]): ->>>>>>> upstream/master + def add_instance(self, name, config_dir=None, main_configs=[], user_configs=[], macros={}, with_zookeeper=False, with_mysql=False, with_kafka=False, clickhouse_path_dir=None, with_odbc_drivers=False, with_postgres=False, with_hdfs=False, with_mongo=False, with_redis=False, hostname=None, env_variables={}, image="yandex/clickhouse-integration-test", stay_alive=False, ipv4_address=None, ipv6_address=None, with_installed_binary=False, tmpfs=[]): """Add an instance to the cluster. name - the name of the instance directory and the value of the 'instance' macro in ClickHouse. From ee311ff03c3bbf6a7657f90e8f40f37be0252cb7 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 18 Sep 2019 10:38:00 +0000 Subject: [PATCH 114/309] style checks Committer: maqroll --- dbms/src/IO/ReadWriteBufferFromHTTP.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.h b/dbms/src/IO/ReadWriteBufferFromHTTP.h index 785182875a3..6d692ad1691 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.h +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.h @@ -91,8 +91,8 @@ namespace detail UpdatableSessionPtr session; std::istream * istr; /// owned by session std::unique_ptr impl; - std::function out_stream_callback; - const Poco::Net::HTTPBasicCredentials & credentials; + std::function out_stream_callback; + const Poco::Net::HTTPBasicCredentials & credentials; protected: std::istream * call(const Poco::URI uri_, Poco::Net::HTTPResponse & response) @@ -112,7 +112,7 @@ namespace detail LOG_TRACE((&Logger::get("ReadWriteBufferFromHTTP")), "Sending request to " << uri.toString()); - auto sess = session->getSession(); + auto sess = session->getSession(); auto & stream_out = sess->sendRequest(request); @@ -133,7 +133,7 @@ namespace detail sess->attachSessionData(e.message()); throw; } - } + } public: using OutStreamCallback = std::function; @@ -148,16 +148,16 @@ namespace detail , uri {uri_} , method {!method_.empty() ? method_ : out_stream_callback ? 
Poco::Net::HTTPRequest::HTTP_POST : Poco::Net::HTTPRequest::HTTP_GET} , session {session_} - , out_stream_callback {out_stream_callback_} - , credentials {credentials_} + , out_stream_callback {out_stream_callback_} + , credentials {credentials_} { Poco::Net::HTTPResponse response; - + istr = call(uri, response); while (isRedirect(response.getStatus())) { - Poco::URI uri_redirect(response.get("Location")); + Poco::URI uri_redirect(response.get("Location")); session->updateSession(uri_redirect); @@ -172,7 +172,7 @@ namespace detail { /// We use session data storage as storage for exception text /// Depend on it we can deduce to reconnect session or reresolve session host - auto sess = session->getSession(); + auto sess = session->getSession(); sess->attachSessionData(e.message()); throw; } @@ -198,7 +198,7 @@ public: explicit UpdatableSession(const Poco::URI uri, const ConnectionTimeouts & timeouts_, const SettingUInt64 max_redirects_) - : Parent(uri, timeouts_, max_redirects_) + : Parent(uri, timeouts_, max_redirects_) { session = makeHTTPSession(initial_uri, timeouts); } @@ -218,7 +218,7 @@ public: const std::string & method_ = {}, OutStreamCallback out_stream_callback_ = {}, const ConnectionTimeouts & timeouts = {}, - const DB::SettingUInt64 max_redirects = 0, + const DB::SettingUInt64 max_redirects = 0, const Poco::Net::HTTPBasicCredentials & credentials_ = {}, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE) : Parent(std::make_shared(uri_, timeouts, max_redirects), uri_, method_, out_stream_callback_, credentials_, buffer_size_) From 0bad4b4a05870dd76a23027a65af37afcfa15f71 Mon Sep 17 00:00:00 2001 From: sfod Date: Wed, 18 Sep 2019 16:08:51 +0300 Subject: [PATCH 115/309] Throw exceptions if WITH TOTALS/ROLLUP/CUBE are specified without aggregate functions --- .../Interpreters/InterpreterSelectQuery.cpp | 28 ++----------------- ...01013_totals_without_aggregation.reference | 3 ++ .../01013_totals_without_aggregation.sh | 15 ++++++++++ 3 files changed, 21 insertions(+), 25 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/01013_totals_without_aggregation.reference create mode 100755 dbms/tests/queries/0_stateless/01013_totals_without_aggregation.sh diff --git a/dbms/src/Interpreters/InterpreterSelectQuery.cpp b/dbms/src/Interpreters/InterpreterSelectQuery.cpp index 39a1976d2d4..10f71b9337e 100644 --- a/dbms/src/Interpreters/InterpreterSelectQuery.cpp +++ b/dbms/src/Interpreters/InterpreterSelectQuery.cpp @@ -1209,33 +1209,11 @@ void InterpreterSelectQuery::executeImpl(TPipeline & pipeline, const BlockInputS executeExpression(pipeline, expressions.before_order_and_select); executeDistinct(pipeline, true, expressions.selected_columns); - need_second_distinct_pass = query.distinct && pipeline.hasMixedStreams(); } - else - { - need_second_distinct_pass = query.distinct && pipeline.hasMixedStreams(); + else if (query.group_by_with_totals || query.group_by_with_rollup || query.group_by_with_cube) + throw Exception("WITH TOTALS, ROLLUP or CUBE are not supported without aggregation", ErrorCodes::LOGICAL_ERROR); - if (query.group_by_with_totals && !aggregate_final) - { - bool final = !query.group_by_with_rollup && !query.group_by_with_cube; - executeTotalsAndHaving(pipeline, expressions.has_having, expressions.before_having, aggregate_overflow_row, final); - } - - if ((query.group_by_with_rollup || query.group_by_with_cube) && !aggregate_final) - { - if (query.group_by_with_rollup) - executeRollupOrCube(pipeline, Modificator::ROLLUP); - else if (query.group_by_with_cube) - 
executeRollupOrCube(pipeline, Modificator::CUBE); - - if (expressions.has_having) - { - if (query.group_by_with_totals) - throw Exception("WITH TOTALS and WITH ROLLUP or CUBE are not supported together in presence of HAVING", ErrorCodes::NOT_IMPLEMENTED); - executeHaving(pipeline, expressions.before_having); - } - } - } + need_second_distinct_pass = query.distinct && pipeline.hasMixedStreams(); if (expressions.has_order_by) { diff --git a/dbms/tests/queries/0_stateless/01013_totals_without_aggregation.reference b/dbms/tests/queries/0_stateless/01013_totals_without_aggregation.reference new file mode 100644 index 00000000000..7614df8ec46 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01013_totals_without_aggregation.reference @@ -0,0 +1,3 @@ +ok +ok +ok diff --git a/dbms/tests/queries/0_stateless/01013_totals_without_aggregation.sh b/dbms/tests/queries/0_stateless/01013_totals_without_aggregation.sh new file mode 100755 index 00000000000..c159a73388d --- /dev/null +++ b/dbms/tests/queries/0_stateless/01013_totals_without_aggregation.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +. $CURDIR/../shell_config.sh + +EXCEPTION_SUCCESS_TEXT=ok + +# Must throw an exception +EXCEPTION_TEXT="WITH TOTALS, ROLLUP or CUBE are not supported without aggregation" +$CLICKHOUSE_CLIENT --query="SELECT 1 AS id, 'hello' AS s WITH TOTALS" 2>&1 \ + | grep -q "$EXCEPTION_TEXT" && echo "$EXCEPTION_SUCCESS_TEXT" || echo "Did not throw an exception" +$CLICKHOUSE_CLIENT --query="SELECT 1 AS id, 'hello' AS s WITH ROLLUP" 2>&1 \ + | grep -q "$EXCEPTION_TEXT" && echo "$EXCEPTION_SUCCESS_TEXT" || echo "Did not throw an exception" +$CLICKHOUSE_CLIENT --query="SELECT 1 AS id, 'hello' AS s WITH CUBE" 2>&1 \ + | grep -q "$EXCEPTION_TEXT" && echo "$EXCEPTION_SUCCESS_TEXT" || echo "Did not throw an exception" From 8175d4c2d16bf1f8f2a0e27e27fa0f2d66a1ede5 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 18 Sep 2019 13:38:17 +0000 Subject: [PATCH 116/309] style checks Committer: maqroll --- dbms/src/IO/ReadWriteBufferFromHTTP.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.h b/dbms/src/IO/ReadWriteBufferFromHTTP.h index b00df9e0c08..13e20c446c5 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.h +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.h @@ -122,7 +122,7 @@ namespace detail try { istr = receiveResponse(*sess, request, response, true); - response.getCookies(cookies); + response.getCookies(cookies); return istr; From 952d9449a20b700e664ec54d723d9f979c408733 Mon Sep 17 00:00:00 2001 From: maqroll Date: Wed, 18 Sep 2019 14:53:45 +0000 Subject: [PATCH 117/309] typo --- dbms/src/IO/ReadWriteBufferFromHTTP.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.h b/dbms/src/IO/ReadWriteBufferFromHTTP.h index 13e20c446c5..6382077dd6a 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.h +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.h @@ -147,7 +147,7 @@ namespace detail size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE) : ReadBuffer(nullptr, 0) , uri {uri_} - , method {!method_.empty() ? method_ : out_stream_callback ? Poco::Net::HTTPRequest::HTTP_POST : Poco::Net::HTTPRequest::HTTP_GET} + , method {!method_.empty() ? method_ : out_stream_callback_ ? 
Poco::Net::HTTPRequest::HTTP_POST : Poco::Net::HTTPRequest::HTTP_GET} , session {session_} , out_stream_callback {out_stream_callback_} , credentials {credentials_} From 67e1cf9b73d3cb78c00f24b3dd406c0f0014cadf Mon Sep 17 00:00:00 2001 From: Mikhail Filimonov Date: Wed, 18 Sep 2019 17:35:45 +0200 Subject: [PATCH 118/309] Adding performance test for huge pk (issue #6924) --- dbms/tests/performance/merge_tree_huge_pk.xml | 206 ++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 dbms/tests/performance/merge_tree_huge_pk.xml diff --git a/dbms/tests/performance/merge_tree_huge_pk.xml b/dbms/tests/performance/merge_tree_huge_pk.xml new file mode 100644 index 00000000000..351cc17cb6c --- /dev/null +++ b/dbms/tests/performance/merge_tree_huge_pk.xml @@ -0,0 +1,206 @@ + + loop + + + + 10 + 12000 + + + 50 + 60000 + + + + + CREATE TABLE huge_pk ENGINE = MergeTree ORDER BY ( + c001, c002, c003, c004, c005, c006, c007, c008, c009, c010, c011, c012, c013, c014, c015, c016, c017, c018, c019, c020, + c021, c022, c023, c024, c025, c026, c027, c028, c029, c030, c031, c032, c033, c034, c035, c036, c037, c038, c039, c040, + c041, c042, c043, c044, c045, c046, c047, c048, c049, c050, c051, c052, c053, c054, c055, c056, c057, c058, c059, c060, + c061, c062, c063, c064, c065, c066, c067, c068, c069, c070, c071, c072, c073, c074, c075, c076, c077, c078, c079, c080, + c081, c082, c083, c084, c085, c086, c087, c088, c089, c090, c091, c092, c093, c094, c095, c096, c097, c098, c099, c100, + c101, c102, c103, c104, c105, c106, c107, c108, c109, c110, c111, c112, c113, c114, c115, c116, c117, c118, c119, c120, + c121, c122, c123, c124, c125, c126, c127, c128, c129, c130, c131, c132, c133, c134, c135, c136, c137, c138, c139, c140, + c141, c142, c143, c144, c145, c146, c147, c148, c149, c150, c151, c152, c153, c154, c155, c156, c157, c158, c159, c160, + c161, c162, c163, c164, c165, c166, c167, c168, c169, c170, c171, c172, c173, c174, c175, c176, c177, c178, c179, c180, + c181, c182, c183, c184, c185, c186, c187, c188, c189, c190, c191, c192, c193, c194, c195, c196, c197, c198, c199, c200, + c201, c202, c203, c204, c205, c206, c207, c208, c209, c210, c211, c212, c213, c214, c215, c216, c217, c218, c219, c220, + c221, c222, c223, c224, c225, c226, c227, c228, c229, c230, c231, c232, c233, c234, c235, c236, c237, c238, c239, c240, + c241, c242, c243, c244, c245, c246, c247, c248, c249, c250, c251, c252, c253, c254, c255, c256, c257, c258, c259, c260, + c261, c262, c263, c264, c265, c266, c267, c268, c269, c270, c271, c272, c273, c274, c275, c276, c277, c278, c279, c280, + c281, c282, c283, c284, c285, c286, c287, c288, c289, c290, c291, c292, c293, c294, c295, c296, c297, c298, c299, c300, + c301, c302, c303, c304, c305, c306, c307, c308, c309, c310, c311, c312, c313, c314, c315, c316, c317, c318, c319, c320, + c321, c322, c323, c324, c325, c326, c327, c328, c329, c330, c331, c332, c333, c334, c335, c336, c337, c338, c339, c340, + c341, c342, c343, c344, c345, c346, c347, c348, c349, c350, c351, c352, c353, c354, c355, c356, c357, c358, c359, c360, + c361, c362, c363, c364, c365, c366, c367, c368, c369, c370, c371, c372, c373, c374, c375, c376, c377, c378, c379, c380, + c381, c382, c383, c384, c385, c386, c387, c388, c389, c390, c391, c392, c393, c394, c395, c396, c397, c398, c399, c400, + c401, c402, c403, c404, c405, c406, c407, c408, c409, c410, c411, c412, c413, c414, c415, c416, c417, c418, c419, c420, + c421, c422, c423, c424, c425, c426, c427, c428, c429, c430, c431, c432, c433, c434, c435, 
c436, c437, c438, c439, c440, + c441, c442, c443, c444, c445, c446, c447, c448, c449, c450, c451, c452, c453, c454, c455, c456, c457, c458, c459, c460, + c461, c462, c463, c464, c465, c466, c467, c468, c469, c470, c471, c472, c473, c474, c475, c476, c477, c478, c479, c480, + c481, c482, c483, c484, c485, c486, c487, c488, c489, c490, c491, c492, c493, c494, c495, c496, c497, c498, c499, c500, + c501, c502, c503, c504, c505, c506, c507, c508, c509, c510, c511, c512, c513, c514, c515, c516, c517, c518, c519, c520, + c521, c522, c523, c524, c525, c526, c527, c528, c529, c530, c531, c532, c533, c534, c535, c536, c537, c538, c539, c540, + c541, c542, c543, c544, c545, c546, c547, c548, c549, c550, c551, c552, c553, c554, c555, c556, c557, c558, c559, c560, + c561, c562, c563, c564, c565, c566, c567, c568, c569, c570, c571, c572, c573, c574, c575, c576, c577, c578, c579, c580, + c581, c582, c583, c584, c585, c586, c587, c588, c589, c590, c591, c592, c593, c594, c595, c596, c597, c598, c599, c600, + c601, c602, c603, c604, c605, c606, c607, c608, c609, c610, c611, c612, c613, c614, c615, c616, c617, c618, c619, c620, + c621, c622, c623, c624, c625, c626, c627, c628, c629, c630, c631, c632, c633, c634, c635, c636, c637, c638, c639, c640, + c641, c642, c643, c644, c645, c646, c647, c648, c649, c650, c651, c652, c653, c654, c655, c656, c657, c658, c659, c660, + c661, c662, c663, c664, c665, c666, c667, c668, c669, c670, c671, c672, c673, c674, c675, c676, c677, c678, c679, c680, + c681, c682, c683, c684, c685, c686, c687, c688, c689, c690, c691, c692, c693, c694, c695, c696, c697, c698, c699, c700) + AS SELECT + rand64( 1) % 5 as c001, rand64( 2) % 5 as c002, rand64( 3) % 5 as c003, rand64( 4) % 5 as c004, rand64( 5) % 5 as c005, + rand64( 6) % 5 as c006, rand64( 7) % 5 as c007, rand64( 8) % 5 as c008, rand64( 9) % 5 as c009, rand64( 10) % 5 as c010, + rand64( 11) % 5 as c011, rand64( 12) % 5 as c012, rand64( 13) % 5 as c013, rand64( 14) % 5 as c014, rand64( 15) % 5 as c015, + rand64( 16) % 5 as c016, rand64( 17) % 5 as c017, rand64( 18) % 5 as c018, rand64( 19) % 5 as c019, rand64( 20) % 5 as c020, + rand64( 21) % 5 as c021, rand64( 22) % 5 as c022, rand64( 23) % 5 as c023, rand64( 24) % 5 as c024, rand64( 25) % 5 as c025, + rand64( 26) % 5 as c026, rand64( 27) % 5 as c027, rand64( 28) % 5 as c028, rand64( 29) % 5 as c029, rand64( 30) % 5 as c030, + rand64( 31) % 5 as c031, rand64( 32) % 5 as c032, rand64( 33) % 5 as c033, rand64( 34) % 5 as c034, rand64( 35) % 5 as c035, + rand64( 36) % 5 as c036, rand64( 37) % 5 as c037, rand64( 38) % 5 as c038, rand64( 39) % 5 as c039, rand64( 40) % 5 as c040, + rand64( 41) % 5 as c041, rand64( 42) % 5 as c042, rand64( 43) % 5 as c043, rand64( 44) % 5 as c044, rand64( 45) % 5 as c045, + rand64( 46) % 5 as c046, rand64( 47) % 5 as c047, rand64( 48) % 5 as c048, rand64( 49) % 5 as c049, rand64( 50) % 5 as c050, + rand64( 51) % 5 as c051, rand64( 52) % 5 as c052, rand64( 53) % 5 as c053, rand64( 54) % 5 as c054, rand64( 55) % 5 as c055, + rand64( 56) % 5 as c056, rand64( 57) % 5 as c057, rand64( 58) % 5 as c058, rand64( 59) % 5 as c059, rand64( 60) % 5 as c060, + rand64( 61) % 5 as c061, rand64( 62) % 5 as c062, rand64( 63) % 5 as c063, rand64( 64) % 5 as c064, rand64( 65) % 5 as c065, + rand64( 66) % 5 as c066, rand64( 67) % 5 as c067, rand64( 68) % 5 as c068, rand64( 69) % 5 as c069, rand64( 70) % 5 as c070, + rand64( 71) % 5 as c071, rand64( 72) % 5 as c072, rand64( 73) % 5 as c073, rand64( 74) % 5 as c074, rand64( 75) % 5 as c075, + rand64( 76) % 5 as c076, 
rand64( 77) % 5 as c077, rand64( 78) % 5 as c078, rand64( 79) % 5 as c079, rand64( 80) % 5 as c080, + rand64( 81) % 5 as c081, rand64( 82) % 5 as c082, rand64( 83) % 5 as c083, rand64( 84) % 5 as c084, rand64( 85) % 5 as c085, + rand64( 86) % 5 as c086, rand64( 87) % 5 as c087, rand64( 88) % 5 as c088, rand64( 89) % 5 as c089, rand64( 90) % 5 as c090, + rand64( 91) % 5 as c091, rand64( 92) % 5 as c092, rand64( 93) % 5 as c093, rand64( 94) % 5 as c094, rand64( 95) % 5 as c095, + rand64( 96) % 5 as c096, rand64( 97) % 5 as c097, rand64( 98) % 5 as c098, rand64( 99) % 5 as c099, rand64(100) % 5 as c100, + rand64(101) % 5 as c101, rand64(102) % 5 as c102, rand64(103) % 5 as c103, rand64(104) % 5 as c104, rand64(105) % 5 as c105, + rand64(106) % 5 as c106, rand64(107) % 5 as c107, rand64(108) % 5 as c108, rand64(109) % 5 as c109, rand64(110) % 5 as c110, + rand64(111) % 5 as c111, rand64(112) % 5 as c112, rand64(113) % 5 as c113, rand64(114) % 5 as c114, rand64(115) % 5 as c115, + rand64(116) % 5 as c116, rand64(117) % 5 as c117, rand64(118) % 5 as c118, rand64(119) % 5 as c119, rand64(120) % 5 as c120, + rand64(121) % 5 as c121, rand64(122) % 5 as c122, rand64(123) % 5 as c123, rand64(124) % 5 as c124, rand64(125) % 5 as c125, + rand64(126) % 5 as c126, rand64(127) % 5 as c127, rand64(128) % 5 as c128, rand64(129) % 5 as c129, rand64(130) % 5 as c130, + rand64(131) % 5 as c131, rand64(132) % 5 as c132, rand64(133) % 5 as c133, rand64(134) % 5 as c134, rand64(135) % 5 as c135, + rand64(136) % 5 as c136, rand64(137) % 5 as c137, rand64(138) % 5 as c138, rand64(139) % 5 as c139, rand64(140) % 5 as c140, + rand64(141) % 5 as c141, rand64(142) % 5 as c142, rand64(143) % 5 as c143, rand64(144) % 5 as c144, rand64(145) % 5 as c145, + rand64(146) % 5 as c146, rand64(147) % 5 as c147, rand64(148) % 5 as c148, rand64(149) % 5 as c149, rand64(150) % 5 as c150, + rand64(151) % 5 as c151, rand64(152) % 5 as c152, rand64(153) % 5 as c153, rand64(154) % 5 as c154, rand64(155) % 5 as c155, + rand64(156) % 5 as c156, rand64(157) % 5 as c157, rand64(158) % 5 as c158, rand64(159) % 5 as c159, rand64(160) % 5 as c160, + rand64(161) % 5 as c161, rand64(162) % 5 as c162, rand64(163) % 5 as c163, rand64(164) % 5 as c164, rand64(165) % 5 as c165, + rand64(166) % 5 as c166, rand64(167) % 5 as c167, rand64(168) % 5 as c168, rand64(169) % 5 as c169, rand64(170) % 5 as c170, + rand64(171) % 5 as c171, rand64(172) % 5 as c172, rand64(173) % 5 as c173, rand64(174) % 5 as c174, rand64(175) % 5 as c175, + rand64(176) % 5 as c176, rand64(177) % 5 as c177, rand64(178) % 5 as c178, rand64(179) % 5 as c179, rand64(180) % 5 as c180, + rand64(181) % 5 as c181, rand64(182) % 5 as c182, rand64(183) % 5 as c183, rand64(184) % 5 as c184, rand64(185) % 5 as c185, + rand64(186) % 5 as c186, rand64(187) % 5 as c187, rand64(188) % 5 as c188, rand64(189) % 5 as c189, rand64(190) % 5 as c190, + rand64(191) % 5 as c191, rand64(192) % 5 as c192, rand64(193) % 5 as c193, rand64(194) % 5 as c194, rand64(195) % 5 as c195, + rand64(196) % 5 as c196, rand64(197) % 5 as c197, rand64(198) % 5 as c198, rand64(199) % 5 as c199, rand64(200) % 5 as c200, + rand64(201) % 5 as c201, rand64(202) % 5 as c202, rand64(203) % 5 as c203, rand64(204) % 5 as c204, rand64(205) % 5 as c205, + rand64(206) % 5 as c206, rand64(207) % 5 as c207, rand64(208) % 5 as c208, rand64(209) % 5 as c209, rand64(210) % 5 as c210, + rand64(211) % 5 as c211, rand64(212) % 5 as c212, rand64(213) % 5 as c213, rand64(214) % 5 as c214, rand64(215) % 5 as c215, + rand64(216) % 5 as 
c216, rand64(217) % 5 as c217, rand64(218) % 5 as c218, rand64(219) % 5 as c219, rand64(220) % 5 as c220, + rand64(221) % 5 as c221, rand64(222) % 5 as c222, rand64(223) % 5 as c223, rand64(224) % 5 as c224, rand64(225) % 5 as c225, + rand64(226) % 5 as c226, rand64(227) % 5 as c227, rand64(228) % 5 as c228, rand64(229) % 5 as c229, rand64(230) % 5 as c230, + rand64(231) % 5 as c231, rand64(232) % 5 as c232, rand64(233) % 5 as c233, rand64(234) % 5 as c234, rand64(235) % 5 as c235, + rand64(236) % 5 as c236, rand64(237) % 5 as c237, rand64(238) % 5 as c238, rand64(239) % 5 as c239, rand64(240) % 5 as c240, + rand64(241) % 5 as c241, rand64(242) % 5 as c242, rand64(243) % 5 as c243, rand64(244) % 5 as c244, rand64(245) % 5 as c245, + rand64(246) % 5 as c246, rand64(247) % 5 as c247, rand64(248) % 5 as c248, rand64(249) % 5 as c249, rand64(250) % 5 as c250, + rand64(251) % 5 as c251, rand64(252) % 5 as c252, rand64(253) % 5 as c253, rand64(254) % 5 as c254, rand64(255) % 5 as c255, + rand64(256) % 5 as c256, rand64(257) % 5 as c257, rand64(258) % 5 as c258, rand64(259) % 5 as c259, rand64(260) % 5 as c260, + rand64(261) % 5 as c261, rand64(262) % 5 as c262, rand64(263) % 5 as c263, rand64(264) % 5 as c264, rand64(265) % 5 as c265, + rand64(266) % 5 as c266, rand64(267) % 5 as c267, rand64(268) % 5 as c268, rand64(269) % 5 as c269, rand64(270) % 5 as c270, + rand64(271) % 5 as c271, rand64(272) % 5 as c272, rand64(273) % 5 as c273, rand64(274) % 5 as c274, rand64(275) % 5 as c275, + rand64(276) % 5 as c276, rand64(277) % 5 as c277, rand64(278) % 5 as c278, rand64(279) % 5 as c279, rand64(280) % 5 as c280, + rand64(281) % 5 as c281, rand64(282) % 5 as c282, rand64(283) % 5 as c283, rand64(284) % 5 as c284, rand64(285) % 5 as c285, + rand64(286) % 5 as c286, rand64(287) % 5 as c287, rand64(288) % 5 as c288, rand64(289) % 5 as c289, rand64(290) % 5 as c290, + rand64(291) % 5 as c291, rand64(292) % 5 as c292, rand64(293) % 5 as c293, rand64(294) % 5 as c294, rand64(295) % 5 as c295, + rand64(296) % 5 as c296, rand64(297) % 5 as c297, rand64(298) % 5 as c298, rand64(299) % 5 as c299, rand64(300) % 5 as c300, + rand64(301) % 5 as c301, rand64(302) % 5 as c302, rand64(303) % 5 as c303, rand64(304) % 5 as c304, rand64(305) % 5 as c305, + rand64(306) % 5 as c306, rand64(307) % 5 as c307, rand64(308) % 5 as c308, rand64(309) % 5 as c309, rand64(310) % 5 as c310, + rand64(311) % 5 as c311, rand64(312) % 5 as c312, rand64(313) % 5 as c313, rand64(314) % 5 as c314, rand64(315) % 5 as c315, + rand64(316) % 5 as c316, rand64(317) % 5 as c317, rand64(318) % 5 as c318, rand64(319) % 5 as c319, rand64(320) % 5 as c320, + rand64(321) % 5 as c321, rand64(322) % 5 as c322, rand64(323) % 5 as c323, rand64(324) % 5 as c324, rand64(325) % 5 as c325, + rand64(326) % 5 as c326, rand64(327) % 5 as c327, rand64(328) % 5 as c328, rand64(329) % 5 as c329, rand64(330) % 5 as c330, + rand64(331) % 5 as c331, rand64(332) % 5 as c332, rand64(333) % 5 as c333, rand64(334) % 5 as c334, rand64(335) % 5 as c335, + rand64(336) % 5 as c336, rand64(337) % 5 as c337, rand64(338) % 5 as c338, rand64(339) % 5 as c339, rand64(340) % 5 as c340, + rand64(341) % 5 as c341, rand64(342) % 5 as c342, rand64(343) % 5 as c343, rand64(344) % 5 as c344, rand64(345) % 5 as c345, + rand64(346) % 5 as c346, rand64(347) % 5 as c347, rand64(348) % 5 as c348, rand64(349) % 5 as c349, rand64(350) % 5 as c350, + rand64(351) % 5 as c351, rand64(352) % 5 as c352, rand64(353) % 5 as c353, rand64(354) % 5 as c354, rand64(355) % 5 as c355, + rand64(356) % 5 
as c356, rand64(357) % 5 as c357, rand64(358) % 5 as c358, rand64(359) % 5 as c359, rand64(360) % 5 as c360, + rand64(361) % 5 as c361, rand64(362) % 5 as c362, rand64(363) % 5 as c363, rand64(364) % 5 as c364, rand64(365) % 5 as c365, + rand64(366) % 5 as c366, rand64(367) % 5 as c367, rand64(368) % 5 as c368, rand64(369) % 5 as c369, rand64(370) % 5 as c370, + rand64(371) % 5 as c371, rand64(372) % 5 as c372, rand64(373) % 5 as c373, rand64(374) % 5 as c374, rand64(375) % 5 as c375, + rand64(376) % 5 as c376, rand64(377) % 5 as c377, rand64(378) % 5 as c378, rand64(379) % 5 as c379, rand64(380) % 5 as c380, + rand64(381) % 5 as c381, rand64(382) % 5 as c382, rand64(383) % 5 as c383, rand64(384) % 5 as c384, rand64(385) % 5 as c385, + rand64(386) % 5 as c386, rand64(387) % 5 as c387, rand64(388) % 5 as c388, rand64(389) % 5 as c389, rand64(390) % 5 as c390, + rand64(391) % 5 as c391, rand64(392) % 5 as c392, rand64(393) % 5 as c393, rand64(394) % 5 as c394, rand64(395) % 5 as c395, + rand64(396) % 5 as c396, rand64(397) % 5 as c397, rand64(398) % 5 as c398, rand64(399) % 5 as c399, rand64(400) % 5 as c400, + rand64(401) % 5 as c401, rand64(402) % 5 as c402, rand64(403) % 5 as c403, rand64(404) % 5 as c404, rand64(405) % 5 as c405, + rand64(406) % 5 as c406, rand64(407) % 5 as c407, rand64(408) % 5 as c408, rand64(409) % 5 as c409, rand64(410) % 5 as c410, + rand64(411) % 5 as c411, rand64(412) % 5 as c412, rand64(413) % 5 as c413, rand64(414) % 5 as c414, rand64(415) % 5 as c415, + rand64(416) % 5 as c416, rand64(417) % 5 as c417, rand64(418) % 5 as c418, rand64(419) % 5 as c419, rand64(420) % 5 as c420, + rand64(421) % 5 as c421, rand64(422) % 5 as c422, rand64(423) % 5 as c423, rand64(424) % 5 as c424, rand64(425) % 5 as c425, + rand64(426) % 5 as c426, rand64(427) % 5 as c427, rand64(428) % 5 as c428, rand64(429) % 5 as c429, rand64(430) % 5 as c430, + rand64(431) % 5 as c431, rand64(432) % 5 as c432, rand64(433) % 5 as c433, rand64(434) % 5 as c434, rand64(435) % 5 as c435, + rand64(436) % 5 as c436, rand64(437) % 5 as c437, rand64(438) % 5 as c438, rand64(439) % 5 as c439, rand64(440) % 5 as c440, + rand64(441) % 5 as c441, rand64(442) % 5 as c442, rand64(443) % 5 as c443, rand64(444) % 5 as c444, rand64(445) % 5 as c445, + rand64(446) % 5 as c446, rand64(447) % 5 as c447, rand64(448) % 5 as c448, rand64(449) % 5 as c449, rand64(450) % 5 as c450, + rand64(451) % 5 as c451, rand64(452) % 5 as c452, rand64(453) % 5 as c453, rand64(454) % 5 as c454, rand64(455) % 5 as c455, + rand64(456) % 5 as c456, rand64(457) % 5 as c457, rand64(458) % 5 as c458, rand64(459) % 5 as c459, rand64(460) % 5 as c460, + rand64(461) % 5 as c461, rand64(462) % 5 as c462, rand64(463) % 5 as c463, rand64(464) % 5 as c464, rand64(465) % 5 as c465, + rand64(466) % 5 as c466, rand64(467) % 5 as c467, rand64(468) % 5 as c468, rand64(469) % 5 as c469, rand64(470) % 5 as c470, + rand64(471) % 5 as c471, rand64(472) % 5 as c472, rand64(473) % 5 as c473, rand64(474) % 5 as c474, rand64(475) % 5 as c475, + rand64(476) % 5 as c476, rand64(477) % 5 as c477, rand64(478) % 5 as c478, rand64(479) % 5 as c479, rand64(480) % 5 as c480, + rand64(481) % 5 as c481, rand64(482) % 5 as c482, rand64(483) % 5 as c483, rand64(484) % 5 as c484, rand64(485) % 5 as c485, + rand64(486) % 5 as c486, rand64(487) % 5 as c487, rand64(488) % 5 as c488, rand64(489) % 5 as c489, rand64(490) % 5 as c490, + rand64(491) % 5 as c491, rand64(492) % 5 as c492, rand64(493) % 5 as c493, rand64(494) % 5 as c494, rand64(495) % 5 as c495, + rand64(496) % 
5 as c496, rand64(497) % 5 as c497, rand64(498) % 5 as c498, rand64(499) % 5 as c499, rand64(500) % 5 as c500, + rand64(501) % 5 as c501, rand64(502) % 5 as c502, rand64(503) % 5 as c503, rand64(504) % 5 as c504, rand64(505) % 5 as c505, + rand64(506) % 5 as c506, rand64(507) % 5 as c507, rand64(508) % 5 as c508, rand64(509) % 5 as c509, rand64(510) % 5 as c510, + rand64(511) % 5 as c511, rand64(512) % 5 as c512, rand64(513) % 5 as c513, rand64(514) % 5 as c514, rand64(515) % 5 as c515, + rand64(516) % 5 as c516, rand64(517) % 5 as c517, rand64(518) % 5 as c518, rand64(519) % 5 as c519, rand64(520) % 5 as c520, + rand64(521) % 5 as c521, rand64(522) % 5 as c522, rand64(523) % 5 as c523, rand64(524) % 5 as c524, rand64(525) % 5 as c525, + rand64(526) % 5 as c526, rand64(527) % 5 as c527, rand64(528) % 5 as c528, rand64(529) % 5 as c529, rand64(530) % 5 as c530, + rand64(531) % 5 as c531, rand64(532) % 5 as c532, rand64(533) % 5 as c533, rand64(534) % 5 as c534, rand64(535) % 5 as c535, + rand64(536) % 5 as c536, rand64(537) % 5 as c537, rand64(538) % 5 as c538, rand64(539) % 5 as c539, rand64(540) % 5 as c540, + rand64(541) % 5 as c541, rand64(542) % 5 as c542, rand64(543) % 5 as c543, rand64(544) % 5 as c544, rand64(545) % 5 as c545, + rand64(546) % 5 as c546, rand64(547) % 5 as c547, rand64(548) % 5 as c548, rand64(549) % 5 as c549, rand64(550) % 5 as c550, + rand64(551) % 5 as c551, rand64(552) % 5 as c552, rand64(553) % 5 as c553, rand64(554) % 5 as c554, rand64(555) % 5 as c555, + rand64(556) % 5 as c556, rand64(557) % 5 as c557, rand64(558) % 5 as c558, rand64(559) % 5 as c559, rand64(560) % 5 as c560, + rand64(561) % 5 as c561, rand64(562) % 5 as c562, rand64(563) % 5 as c563, rand64(564) % 5 as c564, rand64(565) % 5 as c565, + rand64(566) % 5 as c566, rand64(567) % 5 as c567, rand64(568) % 5 as c568, rand64(569) % 5 as c569, rand64(570) % 5 as c570, + rand64(571) % 5 as c571, rand64(572) % 5 as c572, rand64(573) % 5 as c573, rand64(574) % 5 as c574, rand64(575) % 5 as c575, + rand64(576) % 5 as c576, rand64(577) % 5 as c577, rand64(578) % 5 as c578, rand64(579) % 5 as c579, rand64(580) % 5 as c580, + rand64(581) % 5 as c581, rand64(582) % 5 as c582, rand64(583) % 5 as c583, rand64(584) % 5 as c584, rand64(585) % 5 as c585, + rand64(586) % 5 as c586, rand64(587) % 5 as c587, rand64(588) % 5 as c588, rand64(589) % 5 as c589, rand64(590) % 5 as c590, + rand64(591) % 5 as c591, rand64(592) % 5 as c592, rand64(593) % 5 as c593, rand64(594) % 5 as c594, rand64(595) % 5 as c595, + rand64(596) % 5 as c596, rand64(597) % 5 as c597, rand64(598) % 5 as c598, rand64(599) % 5 as c599, rand64(600) % 5 as c600, + rand64(601) % 5 as c601, rand64(602) % 5 as c602, rand64(603) % 5 as c603, rand64(604) % 5 as c604, rand64(605) % 5 as c605, + rand64(606) % 5 as c606, rand64(607) % 5 as c607, rand64(608) % 5 as c608, rand64(609) % 5 as c609, rand64(610) % 5 as c610, + rand64(611) % 5 as c611, rand64(612) % 5 as c612, rand64(613) % 5 as c613, rand64(614) % 5 as c614, rand64(615) % 5 as c615, + rand64(616) % 5 as c616, rand64(617) % 5 as c617, rand64(618) % 5 as c618, rand64(619) % 5 as c619, rand64(620) % 5 as c620, + rand64(621) % 5 as c621, rand64(622) % 5 as c622, rand64(623) % 5 as c623, rand64(624) % 5 as c624, rand64(625) % 5 as c625, + rand64(626) % 5 as c626, rand64(627) % 5 as c627, rand64(628) % 5 as c628, rand64(629) % 5 as c629, rand64(630) % 5 as c630, + rand64(631) % 5 as c631, rand64(632) % 5 as c632, rand64(633) % 5 as c633, rand64(634) % 5 as c634, rand64(635) % 5 as c635, + rand64(636) 
% 5 as c636, rand64(637) % 5 as c637, rand64(638) % 5 as c638, rand64(639) % 5 as c639, rand64(640) % 5 as c640, + rand64(641) % 5 as c641, rand64(642) % 5 as c642, rand64(643) % 5 as c643, rand64(644) % 5 as c644, rand64(645) % 5 as c645, + rand64(646) % 5 as c646, rand64(647) % 5 as c647, rand64(648) % 5 as c648, rand64(649) % 5 as c649, rand64(650) % 5 as c650, + rand64(651) % 5 as c651, rand64(652) % 5 as c652, rand64(653) % 5 as c653, rand64(654) % 5 as c654, rand64(655) % 5 as c655, + rand64(656) % 5 as c656, rand64(657) % 5 as c657, rand64(658) % 5 as c658, rand64(659) % 5 as c659, rand64(660) % 5 as c660, + rand64(661) % 5 as c661, rand64(662) % 5 as c662, rand64(663) % 5 as c663, rand64(664) % 5 as c664, rand64(665) % 5 as c665, + rand64(666) % 5 as c666, rand64(667) % 5 as c667, rand64(668) % 5 as c668, rand64(669) % 5 as c669, rand64(670) % 5 as c670, + rand64(671) % 5 as c671, rand64(672) % 5 as c672, rand64(673) % 5 as c673, rand64(674) % 5 as c674, rand64(675) % 5 as c675, + rand64(676) % 5 as c676, rand64(677) % 5 as c677, rand64(678) % 5 as c678, rand64(679) % 5 as c679, rand64(680) % 5 as c680, + rand64(681) % 5 as c681, rand64(682) % 5 as c682, rand64(683) % 5 as c683, rand64(684) % 5 as c684, rand64(685) % 5 as c685, + rand64(686) % 5 as c686, rand64(687) % 5 as c687, rand64(688) % 5 as c688, rand64(689) % 5 as c689, rand64(690) % 5 as c690, + rand64(691) % 5 as c691, rand64(692) % 5 as c692, rand64(693) % 5 as c693, rand64(694) % 5 as c694, rand64(695) % 5 as c695, + rand64(696) % 5 as c696, rand64(697) % 5 as c697, rand64(698) % 5 as c698, rand64(699) % 5 as c699, rand64(700) % 5 as c700, + rand64(701) % 5 as c701 + FROM system.numbers + LIMIT 1048576 + + + + 10]]> + + 10]]> + + 10]]> + + DROP TABLE IF EXISTS huge_pk + \ No newline at end of file From db470be07706157056fc3e3d05e013cd49d39423 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 18 Sep 2019 23:07:23 +0300 Subject: [PATCH 119/309] Update merge_tree_huge_pk.xml --- dbms/tests/performance/merge_tree_huge_pk.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/tests/performance/merge_tree_huge_pk.xml b/dbms/tests/performance/merge_tree_huge_pk.xml index 351cc17cb6c..e39ff7501f7 100644 --- a/dbms/tests/performance/merge_tree_huge_pk.xml +++ b/dbms/tests/performance/merge_tree_huge_pk.xml @@ -203,4 +203,4 @@ 10]]> DROP TABLE IF EXISTS huge_pk - \ No newline at end of file + From 0dc3866c36d2306a53acb9286916c422ea2c8c22 Mon Sep 17 00:00:00 2001 From: Zhichang Yu Date: Tue, 17 Sep 2019 14:34:08 +0800 Subject: [PATCH 120/309] added bitmapSubsetLimit --- .../AggregateFunctionGroupBitmapData.h | 46 +++++++++++++++++-- dbms/src/Functions/FunctionsBitmap.cpp | 1 + dbms/src/Functions/FunctionsBitmap.h | 37 +++++++++++++-- .../00829_bitmap_function.reference | 8 ++++ .../0_stateless/00829_bitmap_function.sql | 19 ++++++++ .../functions/bitmap_functions.md | 26 +++++++++++ .../functions/bitmap_functions.md | 26 +++++++++++ 7 files changed, 154 insertions(+), 9 deletions(-) diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h index 220493a918c..e1fbd092490 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h @@ -467,11 +467,10 @@ public: return count; if (isSmall()) { - std::vector ans; for (const auto & x : small) { T val = x.getValue(); - if ((UInt32)val >= range_start && (UInt32)val < range_end) + if (UInt32(val) 
>= range_start && UInt32(val) < range_end) { r1.add(val); count++; @@ -483,10 +482,47 @@ public: roaring_uint32_iterator_t iterator; roaring_init_iterator(rb, &iterator); roaring_move_uint32_iterator_equalorlarger(&iterator, range_start); - while (iterator.has_value) + while (iterator.has_value && UInt32(iterator.current_value) < range_end) + { + r1.add(iterator.current_value); + roaring_advance_uint32_iterator(&iterator); + count++; + } + } + return count; + } + + /** + * Return new set of the smallest `limit` values in set which is no less than `range_start`. + */ + UInt64 rb_limit(UInt32 range_start, UInt32 limit, RoaringBitmapWithSmallSet& r1) const + { + UInt64 count = 0; + if (isSmall()) + { + std::vector ans; + for (const auto & x : small) + { + T val = x.getValue(); + if (UInt32(val) >= range_start) + { + ans.push_back(val); + } + } + sort(ans.begin(), ans.end()); + if (limit > ans.size()) + limit = ans.size(); + for (size_t i=0; i= range_end) - break; r1.add(iterator.current_value); roaring_advance_uint32_iterator(&iterator); count++; diff --git a/dbms/src/Functions/FunctionsBitmap.cpp b/dbms/src/Functions/FunctionsBitmap.cpp index b24e9cdbd9f..62faf49d2b3 100644 --- a/dbms/src/Functions/FunctionsBitmap.cpp +++ b/dbms/src/Functions/FunctionsBitmap.cpp @@ -10,6 +10,7 @@ void registerFunctionsBitmap(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); diff --git a/dbms/src/Functions/FunctionsBitmap.h b/dbms/src/Functions/FunctionsBitmap.h index ad4f16b16ef..740dc5bafe4 100644 --- a/dbms/src/Functions/FunctionsBitmap.h +++ b/dbms/src/Functions/FunctionsBitmap.h @@ -34,6 +34,9 @@ namespace ErrorCodes * Return subset in specified range (not include the range_end): * bitmapSubsetInRange: bitmap,integer,integer -> bitmap * + * Return subset of the smallest `limit` values in set which is no smaller than `range_start`. 
+ * bitmapSubsetInRange: bitmap,integer,integer -> bitmap + * * Two bitmap and calculation: * bitmapAnd: bitmap,bitmap -> bitmap * @@ -250,12 +253,13 @@ private: } }; -class FunctionBitmapSubsetInRange : public IFunction +template +class FunctionBitmapSubset : public IFunction { public: - static constexpr auto name = "bitmapSubsetInRange"; + static constexpr auto name = Impl::name; - static FunctionPtr create(const Context &) { return std::make_shared(); } + static FunctionPtr create(const Context &) { return std::make_shared>(); } String getName() const override { return name; } @@ -357,12 +361,37 @@ private: col_to->insertDefault(); AggregateFunctionGroupBitmapData & bd2 = *reinterpret_cast *>(col_to->getData()[i]); - bd0.rbs.rb_range(range_start, range_end, bd2.rbs); + Impl::apply(bd0, range_start, range_end, bd2); } block.getByPosition(result).column = std::move(col_to); } }; +struct BitmapSubsetInRangeImpl +{ +public: + static constexpr auto name = "bitmapSubsetInRange"; + template + static void apply(const AggregateFunctionGroupBitmapData & bd0, UInt32 range_start, UInt32 range_end, AggregateFunctionGroupBitmapData & bd2) + { + bd0.rbs.rb_range(range_start, range_end, bd2.rbs); + } +}; + +struct BitmapSubsetLimitImpl +{ +public: + static constexpr auto name = "bitmapSubsetLimit"; + template + static void apply(const AggregateFunctionGroupBitmapData & bd0, UInt32 range_start, UInt32 range_end, AggregateFunctionGroupBitmapData & bd2) + { + bd0.rbs.rb_limit(range_start, range_end, bd2.rbs); + } +}; + +using FunctionBitmapSubsetInRange = FunctionBitmapSubset; +using FunctionBitmapSubsetLimit = FunctionBitmapSubset; + template class FunctionBitmapSelfCardinalityImpl : public IFunction { diff --git a/dbms/tests/queries/0_stateless/00829_bitmap_function.reference b/dbms/tests/queries/0_stateless/00829_bitmap_function.reference index 3edcd0e1214..b5b6c1e0d49 100644 --- a/dbms/tests/queries/0_stateless/00829_bitmap_function.reference +++ b/dbms/tests/queries/0_stateless/00829_bitmap_function.reference @@ -67,6 +67,14 @@ [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33] [30,31,32,33,100] [100] +[] +[] +[1,5,7,9] +[] +[5,7,9] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500] +[30,31,32,33,100,200,500] +[100,200,500] 4294967295 4294967295 4294967295 diff --git a/dbms/tests/queries/0_stateless/00829_bitmap_function.sql b/dbms/tests/queries/0_stateless/00829_bitmap_function.sql index 82e1030c036..f12fe4c8218 100644 --- a/dbms/tests/queries/0_stateless/00829_bitmap_function.sql +++ b/dbms/tests/queries/0_stateless/00829_bitmap_function.sql @@ -212,6 +212,25 @@ select bitmapToArray(bitmapSubsetInRange(bitmapBuild([ 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33, 100,200,500]), toUInt32(100), toUInt32(200))); +-- bitmapSubsetLimit: +---- Empty +SELECT bitmapToArray(bitmapSubsetLimit(bitmapBuild(emptyArrayUInt32()), toUInt32(0), toUInt32(10))); +SELECT bitmapToArray(bitmapSubsetLimit(bitmapBuild(emptyArrayUInt16()), toUInt32(0), toUInt32(10))); +---- Small +select bitmapToArray(bitmapSubsetLimit(bitmapBuild([1,5,7,9]), toUInt32(0), toUInt32(4))); +select bitmapToArray(bitmapSubsetLimit(bitmapBuild([1,5,7,9]), toUInt32(10), toUInt32(10))); +select bitmapToArray(bitmapSubsetLimit(bitmapBuild([1,5,7,9]), toUInt32(3), toUInt32(7))); +---- Large +select bitmapToArray(bitmapSubsetLimit(bitmapBuild([ + 
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33, + 100,200,500]), toUInt32(0), toUInt32(100))); +select bitmapToArray(bitmapSubsetLimit(bitmapBuild([ + 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33, + 100,200,500]), toUInt32(30), toUInt32(200))); +select bitmapToArray(bitmapSubsetLimit(bitmapBuild([ + 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33, + 100,200,500]), toUInt32(100), toUInt32(200))); + -- bitmapMin: ---- Empty SELECT bitmapMin(bitmapBuild(emptyArrayUInt8())); diff --git a/docs/en/query_language/functions/bitmap_functions.md b/docs/en/query_language/functions/bitmap_functions.md index fdc2e8a7a0d..3d328359c88 100644 --- a/docs/en/query_language/functions/bitmap_functions.md +++ b/docs/en/query_language/functions/bitmap_functions.md @@ -82,6 +82,32 @@ SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11, └───────────────────┘ ``` +## bitmapSubsetLimit {#bitmap_functions-bitmapsubsetlimit} + +Return subset of the smallest `limit` values in set which is no less than `range_start`. + +``` +bitmapSubsetLimit(bitmap, range_start, limit) +``` + +**Parameters** + +- `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). +- `range_start` – range start point. Type: [UInt32](../../data_types/int_uint.md). +- `limit` – subset cardinality upper limit. Type: [UInt32](../../data_types/int_uint.md). + +**Example** + +``` sql +SELECT bitmapToArray(bitmapSubsetLimit(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res +``` + +``` +┌─res───────────────────────┐ +│ [30,31,32,33,100,200,500] │ +└───────────────────────────┘ +``` + ## bitmapContains {#bitmap_functions-bitmapcontains} Checks whether the bitmap contains an element. diff --git a/docs/zh/query_language/functions/bitmap_functions.md b/docs/zh/query_language/functions/bitmap_functions.md index b727a4aba16..1fbfd3c8fe7 100644 --- a/docs/zh/query_language/functions/bitmap_functions.md +++ b/docs/zh/query_language/functions/bitmap_functions.md @@ -77,6 +77,32 @@ SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11, └───────────────────┘ ``` +## bitmapSubsetLimit + +将位图指定范围(起始点和数目上限)转换为另一个位图。 + +``` +bitmapSubsetLimit(bitmap, range_start, limit) +``` + +**参数** + +- `bitmap` – 位图对象. +- `range_start` – 范围起始点(含). +- `limit` – 子位图基数上限. 
+ +**示例** + +``` sql +SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res +``` + +``` +┌─res───────────────────────┐ +│ [30,31,32,33,100,200,500] │ +└───────────────────────────┘ +``` + ## bitmapContains 检查位图是否包含指定元素。 From 3765084358a269f0678c43275a1a68787bd9eaf3 Mon Sep 17 00:00:00 2001 From: maqroll Date: Thu, 19 Sep 2019 07:33:54 +0000 Subject: [PATCH 121/309] Proper ReadWriteBufferFromHTTP constructor args --- dbms/src/Storages/MergeTree/DataPartsExchange.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dbms/src/Storages/MergeTree/DataPartsExchange.cpp b/dbms/src/Storages/MergeTree/DataPartsExchange.cpp index 1ab4c92042f..073f33441ac 100644 --- a/dbms/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/dbms/src/Storages/MergeTree/DataPartsExchange.cpp @@ -219,6 +219,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( timeouts, creds, DBMS_DEFAULT_BUFFER_SIZE, + 0, /* no redirects */ data_settings->replicated_max_parallel_fetches_for_host }; From bd1ce56797fb08b524a045b9d29452aa431a3821 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Thu, 19 Sep 2019 09:34:33 +0000 Subject: [PATCH 122/309] Tests decomposition. --- .../configs/min_chunk_size.xml | 3 + .../tests/integration/test_storage_s3/test.py | 255 ++++++++++-------- .../test_storage_s3/test_server.py | 121 ++++++++- 3 files changed, 259 insertions(+), 120 deletions(-) create mode 100644 dbms/tests/integration/test_storage_s3/configs/min_chunk_size.xml diff --git a/dbms/tests/integration/test_storage_s3/configs/min_chunk_size.xml b/dbms/tests/integration/test_storage_s3/configs/min_chunk_size.xml new file mode 100644 index 00000000000..f61fcd2c5c9 --- /dev/null +++ b/dbms/tests/integration/test_storage_s3/configs/min_chunk_size.xml @@ -0,0 +1,3 @@ + + 1000000 + diff --git a/dbms/tests/integration/test_storage_s3/test.py b/dbms/tests/integration/test_storage_s3/test.py index b975c4c92d5..2013daa6ae6 100644 --- a/dbms/tests/integration/test_storage_s3/test.py +++ b/dbms/tests/integration/test_storage_s3/test.py @@ -1,19 +1,3 @@ -import pytest - -from helpers.cluster import ClickHouseCluster - -@pytest.fixture(scope="module") -def started_cluster(): - try: - cluster = ClickHouseCluster(__file__) - instance = cluster.add_instance('dummy') - cluster.start() - yield cluster - - finally: - cluster.shutdown() - - import httplib import json import logging @@ -21,115 +5,152 @@ import os import time import traceback +import pytest + +from helpers.cluster import ClickHouseCluster + logging.getLogger().setLevel(logging.INFO) logging.getLogger().addHandler(logging.StreamHandler()) -def test_simple(started_cluster): - instance = started_cluster.instances['dummy'] - instance.copy_file_to_container(os.path.join(os.path.dirname(__file__), 'test_server.py'), 'test_server.py') - communication_port = 10000 - bucket = 'abc' - instance.exec_in_container(['python', 'test_server.py', str(communication_port), bucket], detach=True) - def get_data(): - conn = httplib.HTTPConnection(started_cluster.instances['dummy'].ip_address, communication_port) - conn.request("GET", "/") - r = conn.getresponse() - raw_data = r.read() - conn.close() - return json.loads(raw_data) +def get_communication_data(started_cluster): + conn = httplib.HTTPConnection(started_cluster.instances['dummy'].ip_address, started_cluster.communication_port) + conn.request("GET", "/") + r = conn.getresponse() + raw_data = r.read() + 
conn.close() + return json.loads(raw_data) - format = 'column1 UInt32, column2 UInt32, column3 UInt32' - values = '(1, 2, 3), (3, 2, 1), (78, 43, 45)' - other_values = '(1, 1, 1), (1, 1, 1), (11, 11, 11)' - for i in range(10): - try: - data = get_data() - redirecting_to_http_port = data['redirecting_to_http_port'] - preserving_data_port = data['preserving_data_port'] - redirecting_preserving_data_port = data['redirecting_preserving_data_port'] - except: - logging.error(traceback.format_exc()) - time.sleep(0.5) - else: - break - else: - assert False, 'Could not initialize mock server' - mock_host = started_cluster.instances['dummy'].ip_address +def put_communication_data(started_cluster, body): + conn = httplib.HTTPConnection(started_cluster.instances['dummy'].ip_address, started_cluster.communication_port) + conn.request("PUT", "/", body) + r = conn.getresponse() + conn.close() - def run_query(query): - logging.info('Running query "{}"...'.format(query)) - result = instance.query(query) - logging.info('Query finished') - return result - - - prepare_put_queries = [ - "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(mock_host, preserving_data_port, bucket, format, values), - ] - - queries = [ - "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(mock_host, redirecting_to_http_port, format), - ] - - put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(mock_host, preserving_data_port, bucket, format, values) - - redirect_put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(mock_host, redirecting_preserving_data_port, bucket, format, other_values) - - check_queries = [ - "select *, column1*column2*column3 from s3('http://{}:{}/{}/test.csv', 'CSV', '{}')".format(mock_host, preserving_data_port, bucket, format), - ] - + +@pytest.fixture(scope="module") +def started_cluster(): try: - logging.info('Phase 1') - for query in prepare_put_queries: - run_query(query) - - logging.info('Phase 2') - for query in queries: - stdout = run_query(query) - assert list(map(str.split, stdout.splitlines())) == [ - ['42', '87', '44', '160776'], - ['55', '33', '81', '147015'], - ['1', '0', '9', '0'], - ] - - logging.info('Phase 3') - query = put_query - run_query(query) - data = get_data() - received_data_completed = data['received_data_completed'] - received_data = data['received_data'] - finalize_data = data['finalize_data'] - finalize_data_query = data['finalize_data_query'] - assert received_data[-1].decode() == '1,2,3\n3,2,1\n78,43,45\n' - assert received_data_completed - assert finalize_data == '1hello-etag' - assert finalize_data_query == 'uploadId=TEST' - - logging.info('Phase 4') - query = redirect_put_query - run_query(query) - - for query in check_queries: - logging.info(query) - stdout = run_query(query) - assert list(map(str.split, stdout.splitlines())) == [ - ['1', '1', '1', '1'], - ['1', '1', '1', '1'], - ['11', '11', '11', '1331'], - ] - data = get_data() - received_data = data['received_data'] - assert received_data[-1].decode() == '1,1,1\n1,1,1\n11,11,11\n' + cluster = ClickHouseCluster(__file__) + instance = cluster.add_instance('dummy', config_dir="configs", main_configs=['configs/min_chunk_size.xml']) + cluster.start() - # FIXME tests for multipart - - except: - logging.error(traceback.format_exc()) - raise + cluster.communication_port = 10000 + instance.copy_file_to_container(os.path.join(os.path.dirname(__file__), 
'test_server.py'), 'test_server.py') + cluster.bucket = 'abc' + instance.exec_in_container(['python', 'test_server.py', str(cluster.communication_port), cluster.bucket], detach=True) + cluster.mock_host = instance.ip_address - else: - logging.info('Done') + for i in range(10): + try: + data = get_communication_data(cluster) + cluster.redirecting_to_http_port = data['redirecting_to_http_port'] + cluster.preserving_data_port = data['preserving_data_port'] + cluster.multipart_preserving_data_port = data['multipart_preserving_data_port'] + cluster.redirecting_preserving_data_port = data['redirecting_preserving_data_port'] + except: + logging.error(traceback.format_exc()) + time.sleep(0.5) + else: + break + else: + assert False, 'Could not initialize mock server' + + yield cluster + + finally: + cluster.shutdown() + + +def run_query(instance, query, stdin=None): + logging.info('Running query "{}"...'.format(query)) + result = instance.query(query, stdin=stdin) + logging.info('Query finished') + return result + +def test_get_with_redirect(started_cluster): + instance = started_cluster.instances['dummy'] + format = 'column1 UInt32, column2 UInt32, column3 UInt32' + + put_communication_data(started_cluster, '=== Get with redirect test ===') + query = "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(started_cluster.mock_host, started_cluster.redirecting_to_http_port, format) + stdout = run_query(instance, query) + assert list(map(str.split, stdout.splitlines())) == [ + ['42', '87', '44', '160776'], + ['55', '33', '81', '147015'], + ['1', '0', '9', '0'], + ] + +def test_put(started_cluster): + instance = started_cluster.instances['dummy'] + format = 'column1 UInt32, column2 UInt32, column3 UInt32' + + logging.info('Phase 3') + put_communication_data(started_cluster, '=== Put test ===') + values = '(1, 2, 3), (3, 2, 1), (78, 43, 45)' + put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(started_cluster.mock_host, started_cluster.preserving_data_port, started_cluster.bucket, format, values) + run_query(instance, put_query) + data = get_communication_data(started_cluster) + received_data_completed = data['received_data_completed'] + received_data = data['received_data'] + finalize_data = data['finalize_data'] + finalize_data_query = data['finalize_data_query'] + assert received_data[-1].decode() == '1,2,3\n3,2,1\n78,43,45\n' + assert received_data_completed + assert finalize_data == '1hello-etag' + assert finalize_data_query == 'uploadId=TEST' + +def test_put_csv(started_cluster): + instance = started_cluster.instances['dummy'] + format = 'column1 UInt32, column2 UInt32, column3 UInt32' + + put_communication_data(started_cluster, '=== Put test CSV ===') + put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') format CSV".format(started_cluster.mock_host, started_cluster.preserving_data_port, started_cluster.bucket, format) + csv_data = '8,9,16\n11,18,13\n22,14,2\n' + run_query(instance, put_query, stdin=csv_data) + data = get_communication_data(started_cluster) + received_data_completed = data['received_data_completed'] + received_data = data['received_data'] + finalize_data = data['finalize_data'] + finalize_data_query = data['finalize_data_query'] + assert received_data[-1].decode() == csv_data + assert received_data_completed + assert finalize_data == '1hello-etag' + assert finalize_data_query == 'uploadId=TEST' + +def test_put_with_redirect(started_cluster): + instance = 
started_cluster.instances['dummy'] + format = 'column1 UInt32, column2 UInt32, column3 UInt32' + + put_communication_data(started_cluster, '=== Put with redirect test ===') + other_values = '(1, 1, 1), (1, 1, 1), (11, 11, 11)' + query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(started_cluster.mock_host, started_cluster.redirecting_preserving_data_port, started_cluster.bucket, format, other_values) + run_query(instance, query) + + query = "select *, column1*column2*column3 from s3('http://{}:{}/{}/test.csv', 'CSV', '{}')".format(started_cluster.mock_host, started_cluster.preserving_data_port, started_cluster.bucket, format) + stdout = run_query(instance, query) + assert list(map(str.split, stdout.splitlines())) == [ + ['1', '1', '1', '1'], + ['1', '1', '1', '1'], + ['11', '11', '11', '1331'], + ] + data = get_communication_data(started_cluster) + received_data = data['received_data'] + assert received_data[-1].decode() == '1,1,1\n1,1,1\n11,11,11\n' + +def test_multipart_put(started_cluster): + instance = started_cluster.instances['dummy'] + format = 'column1 UInt32, column2 UInt32, column3 UInt32' + + put_communication_data(started_cluster, '=== Multipart test ===') + long_data = [[i, i+1, i+2] for i in range(100000)] + long_values = ''.join([ '{},{},{}\n'.format(x,y,z) for x, y, z in long_data ]) + put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') format CSV".format(started_cluster.mock_host, started_cluster.multipart_preserving_data_port, started_cluster.bucket, format) + run_query(instance, put_query, stdin=long_values) + data = get_communication_data(started_cluster) + assert 'multipart_received_data' in data + received_data = data['multipart_received_data'] + assert received_data[-1].decode() == ''.join([ '{},{},{}\n'.format(x, y, z) for x, y, z in long_data ]) + assert 1 < data['multipart_parts'] < 10000 diff --git a/dbms/tests/integration/test_storage_s3/test_server.py b/dbms/tests/integration/test_storage_s3/test_server.py index bc22b0df085..3c10445566a 100644 --- a/dbms/tests/integration/test_storage_s3/test_server.py +++ b/dbms/tests/integration/test_storage_s3/test_server.py @@ -20,6 +20,8 @@ import socket import sys import threading import time +import uuid +import xml.etree.ElementTree logging.getLogger().setLevel(logging.INFO) @@ -43,13 +45,20 @@ def GetFreeTCPPortsAndIP(n): [ s.close() for s in sockets ] return result, addr -(redirecting_to_http_port, simple_server_port, preserving_data_port, redirecting_preserving_data_port), localhost = GetFreeTCPPortsAndIP(4) +( + redirecting_to_http_port, + simple_server_port, + preserving_data_port, + multipart_preserving_data_port, + redirecting_preserving_data_port +), localhost = GetFreeTCPPortsAndIP(5) + data = { 'redirecting_to_http_port': redirecting_to_http_port, 'preserving_data_port': preserving_data_port, + 'multipart_preserving_data_port': multipart_preserving_data_port, 'redirecting_preserving_data_port': redirecting_preserving_data_port, } -redirecting_host = localhost class SimpleHTTPServerHandler(BaseHTTPRequestHandler): @@ -113,7 +122,7 @@ class PreservingDataHandler(BaseHTTPRequestHandler): def do_POST(self): self.send_response(200) query = urlparse.urlparse(self.path).query - logging.info('POST ' + query) + logging.info('PreservingDataHandler POST ?' 
+ query) if query == 'uploads': post_data = r''' TEST'''.encode() @@ -161,6 +170,104 @@ class PreservingDataHandler(BaseHTTPRequestHandler): self.finish() +class MultipartPreservingDataHandler(BaseHTTPRequestHandler): + protocol_version = 'HTTP/1.1' + + def parse_request(self): + result = BaseHTTPRequestHandler.parse_request(self) + # Adaptation to Python 3. + if sys.version_info.major == 2 and result == True: + expect = self.headers.get('Expect', "") + if (expect.lower() == "100-continue" and self.protocol_version >= "HTTP/1.1" and self.request_version >= "HTTP/1.1"): + if not self.handle_expect_100(): + return False + return result + + def send_response_only(self, code, message=None): + if message is None: + if code in self.responses: + message = self.responses[code][0] + else: + message = '' + if self.request_version != 'HTTP/0.9': + self.wfile.write("%s %d %s\r\n" % (self.protocol_version, code, message)) + + def handle_expect_100(self): + logging.info('Received Expect-100') + self.send_response_only(100) + self.end_headers() + return True + + def do_POST(self): + query = urlparse.urlparse(self.path).query + logging.info('MultipartPreservingDataHandler POST ?' + query) + if query == 'uploads': + self.send_response(200) + post_data = r''' +TEST'''.encode() + self.send_header('Content-length', str(len(post_data))) + self.send_header('Content-type', 'text/plain') + self.end_headers() + self.wfile.write(post_data) + else: + try: + assert query == 'uploadId=TEST' + logging.info('Content-Length = ' + self.headers.get('Content-Length')) + post_data = self.rfile.read(int(self.headers.get('Content-Length'))) + root = xml.etree.ElementTree.fromstring(post_data) + assert root.tag == 'CompleteMultipartUpload' + assert len(root) > 1 + content = '' + for i, part in enumerate(root): + assert part.tag == 'Part' + assert len(part) == 2 + assert part[0].tag == 'PartNumber' + assert part[1].tag == 'ETag' + assert int(part[0].text) == i + 1 + content += self.server.storage['@'+part[1].text] + data.setdefault('multipart_received_data', []).append(content) + data['multipart_parts'] = len(root) + self.send_response(200) + self.send_header('Content-type', 'text/plain') + self.end_headers() + logging.info('Sending 200') + except: + logging.error('Sending 500') + self.send_response(500) + self.finish() + + def do_PUT(self): + uid = uuid.uuid4() + self.send_response(200) + self.send_header('Content-type', 'text/plain') + self.send_header('ETag', str(uid)) + self.end_headers() + query = urlparse.urlparse(self.path).query + path = urlparse.urlparse(self.path).path + logging.info('Content-Length = ' + self.headers.get('Content-Length')) + logging.info('PUT ' + query) + assert self.headers.get('Content-Length') + assert self.headers['Expect'] == '100-continue' + put_data = self.rfile.read() + data.setdefault('received_data', []).append(put_data) + logging.info('PUT to {}'.format(path)) + self.server.storage['@'+str(uid)] = put_data + self.finish() + + def do_GET(self): + path = urlparse.urlparse(self.path).path + if path in self.server.storage: + self.send_response(200) + self.send_header('Content-type', 'text/plain') + self.send_header('Content-length', str(len(self.server.storage[path]))) + self.end_headers() + self.wfile.write(self.server.storage[path]) + else: + self.send_response(404) + self.end_headers() + self.finish() + + class RedirectingPreservingDataHandler(BaseHTTPRequestHandler): protocol_version = 'HTTP/1.1' @@ -229,12 +336,20 @@ class CommunicationServerHandler(BaseHTTPRequestHandler): 
self.wfile.write(json.dumps(data)) self.finish() + def do_PUT(self): + self.send_response(200) + self.end_headers() + logging.info(self.rfile.read()) + self.finish() + servers = [] servers.append(HTTPServer((localhost, communication_port), CommunicationServerHandler)) servers.append(HTTPServer((localhost, redirecting_to_http_port), RedirectingToHTTPHandler)) servers.append(HTTPServer((localhost, preserving_data_port), PreservingDataHandler)) servers[-1].storage = {} +servers.append(HTTPServer((localhost, multipart_preserving_data_port), MultipartPreservingDataHandler)) +servers[-1].storage = {} servers.append(HTTPServer((localhost, simple_server_port), SimpleHTTPServerHandler)) servers.append(HTTPServer((localhost, redirecting_preserving_data_port), RedirectingPreservingDataHandler)) jobs = [ threading.Thread(target=server.serve_forever) for server in servers ] From 283e09d2a550bb7f2012b5519b0e211548f7c1ad Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Thu, 19 Sep 2019 10:02:45 +0000 Subject: [PATCH 123/309] Removed trash logging and fixed query parameters in multipart uploads. --- dbms/src/IO/ReadBufferFromS3.cpp | 2 +- dbms/src/IO/WriteBufferFromS3.cpp | 10 +++------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/dbms/src/IO/ReadBufferFromS3.cpp b/dbms/src/IO/ReadBufferFromS3.cpp index e26f683cdd4..852e601b9ab 100644 --- a/dbms/src/IO/ReadBufferFromS3.cpp +++ b/dbms/src/IO/ReadBufferFromS3.cpp @@ -32,7 +32,7 @@ ReadBufferFromS3::ReadBufferFromS3(Poco::URI uri_, if (!credentials.getUsername().empty()) credentials.authenticate(*request); - LOG_TRACE((&Logger::get("ReadWriteBufferFromS3")), "Sending request to " << uri.toString()); + LOG_TRACE((&Logger::get("ReadBufferFromS3")), "Sending request to " << uri.toString()); session->sendRequest(*request); diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 4e574a11c0b..181fba7f63f 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -49,9 +49,6 @@ void WriteBufferFromS3::nextImpl() if (!offset()) return; - - LOG_TRACE((&Logger::get("WriteBufferFromS3")), "nextImpl(), offset() == " << offset()); - temporary_buffer->write(working_buffer.begin(), offset()); last_part_size += offset(); @@ -68,11 +65,9 @@ void WriteBufferFromS3::nextImpl() void WriteBufferFromS3::finalize() { - LOG_TRACE((&Logger::get("WriteBufferFromS3")), "finalize()"); temporary_buffer->finish(); if (!buffer_string.empty()) { - LOG_TRACE((&Logger::get("WriteBufferFromS3")), "finalize(), writing last part"); writePart(buffer_string); } @@ -101,7 +96,8 @@ void WriteBufferFromS3::initiate() HTTPSessionPtr session; std::istream * istr = nullptr; /// owned by session Poco::URI initiate_uri = uri; - initiate_uri.setRawQuery("uploads"); // FIXME find how to leave user params as is + initiate_uri.setRawQuery("uploads"); + initiate_uri.setQueryParameters(uri.getQueryParameters()); for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT; ++i) { @@ -225,7 +221,7 @@ void WriteBufferFromS3::complete() String data; WriteBufferFromString buffer(data); - writeString("", buffer); // FIXME move to Poco::XML maybe?? + writeString("", buffer); for (size_t i = 0; i < part_tags.size(); ++i) { writeString("", buffer); From a88d795f0fece799c401dfcf969f9656470da46a Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Thu, 19 Sep 2019 10:15:59 +0000 Subject: [PATCH 124/309] Proper passing of query parameters in initiate multipart upload. 
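Copy the query parameters of the original URI onto the initiate-multipart URI one by one instead of replacing the whole query string, presumably so that the bare `uploads` marker set via setRawQuery() is not clobbered. A minimal sketch of the intended construction (names taken from the hunk below; the range-based loop is an illustration, the committed change uses an explicit iterator loop):

    // Build ".../key?uploads&<original parameters>" for the CreateMultipartUpload request.
    Poco::URI initiate_uri = uri;
    initiate_uri.setRawQuery("uploads");
    for (const auto & param : uri.getQueryParameters())
        initiate_uri.addQueryParameter(param.first, param.second);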
--- dbms/src/IO/WriteBufferFromS3.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 181fba7f63f..82f3290f24d 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -90,14 +90,18 @@ WriteBufferFromS3::~WriteBufferFromS3() void WriteBufferFromS3::initiate() { - // https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadInitiate.html + // See https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadInitiate.html Poco::Net::HTTPResponse response; std::unique_ptr request; HTTPSessionPtr session; std::istream * istr = nullptr; /// owned by session Poco::URI initiate_uri = uri; initiate_uri.setRawQuery("uploads"); - initiate_uri.setQueryParameters(uri.getQueryParameters()); + auto params = uri.getQueryParameters(); + for (QueryParameters::const_iterator it = params.begin(); it != params.end(); ++it) + { + initiate_uri.addQueryParameter(it->first, it->second); + } for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT; ++i) { @@ -148,7 +152,7 @@ void WriteBufferFromS3::initiate() void WriteBufferFromS3::writePart(const String & data) { - // https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadUploadPart.html + // See https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadUploadPart.html Poco::Net::HTTPResponse response; std::unique_ptr request; HTTPSessionPtr session; @@ -211,7 +215,7 @@ void WriteBufferFromS3::writePart(const String & data) void WriteBufferFromS3::complete() { - // https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadComplete.html + // See https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadComplete.html Poco::Net::HTTPResponse response; std::unique_ptr request; HTTPSessionPtr session; From a5d6bd9e017a1f73da4ad14ed1a1eae2df68976e Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Thu, 19 Sep 2019 10:16:39 +0000 Subject: [PATCH 125/309] Proper passing of query parameters in initiate multipart upload. 
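Follow-up to the previous commit: spell the iterator type as `auto`. The explicit `QueryParameters::const_iterator` relies on an unqualified name that is presumably not visible in this scope (it is a typedef nested in Poco::URI). The loop is assumed to end up as sketched below; the actual hunk follows:

    auto params = uri.getQueryParameters();
    for (auto it = params.begin(); it != params.end(); ++it)
        initiate_uri.addQueryParameter(it->first, it->second);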
--- dbms/src/IO/WriteBufferFromS3.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 82f3290f24d..a40230e9878 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -98,7 +98,7 @@ void WriteBufferFromS3::initiate() Poco::URI initiate_uri = uri; initiate_uri.setRawQuery("uploads"); auto params = uri.getQueryParameters(); - for (QueryParameters::const_iterator it = params.begin(); it != params.end(); ++it) + for (auto it = params.begin(); it != params.end(); ++it) { initiate_uri.addQueryParameter(it->first, it->second); } From 12f7361095171cf152529534b466e18d8984f89c Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Thu, 19 Sep 2019 10:25:31 +0000 Subject: [PATCH 126/309] Removed unused headers from `ReadBufferFromS3.h` --- dbms/src/IO/ReadBufferFromS3.cpp | 2 ++ dbms/src/IO/ReadBufferFromS3.h | 13 +------------ 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/dbms/src/IO/ReadBufferFromS3.cpp b/dbms/src/IO/ReadBufferFromS3.cpp index 852e601b9ab..0091930c92e 100644 --- a/dbms/src/IO/ReadBufferFromS3.cpp +++ b/dbms/src/IO/ReadBufferFromS3.cpp @@ -1,5 +1,7 @@ #include +#include + #include diff --git a/dbms/src/IO/ReadBufferFromS3.h b/dbms/src/IO/ReadBufferFromS3.h index ec53a24c5a6..ffc0c5c0ab1 100644 --- a/dbms/src/IO/ReadBufferFromS3.h +++ b/dbms/src/IO/ReadBufferFromS3.h @@ -1,23 +1,12 @@ #pragma once -#include #include -#include + #include #include #include -#include -#include -#include #include -#include -#include -#include #include -#include -#include -#include -#include namespace DB From 0ee6f6231a4f8b514d3abe40ccb8ea875fc235d7 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Thu, 19 Sep 2019 10:33:34 +0000 Subject: [PATCH 127/309] Added comments about redirecting requests and got rid of magic values. 
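Each retry loop in ReadBufferFromS3 and WriteBufferFromS3 now checks the named Poco status code instead of the literal 307 and documents why the redirect is followed. In every affected place the check reads roughly as below (a sketch assembled from the hunks that follow, not new logic):

    istr = &session->receiveResponse(response);

    // Handle 307 Temporary Redirect in order to allow request redirection
    // See https://docs.aws.amazon.com/AmazonS3/latest/dev/Redirects.html
    if (response.getStatus() != Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT)
        break;

    auto location_iterator = response.find("Location");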
--- dbms/src/IO/ReadBufferFromS3.cpp | 4 +++- dbms/src/IO/WriteBufferFromS3.cpp | 12 +++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/dbms/src/IO/ReadBufferFromS3.cpp b/dbms/src/IO/ReadBufferFromS3.cpp index 0091930c92e..7fcb7a0ca41 100644 --- a/dbms/src/IO/ReadBufferFromS3.cpp +++ b/dbms/src/IO/ReadBufferFromS3.cpp @@ -40,7 +40,9 @@ ReadBufferFromS3::ReadBufferFromS3(Poco::URI uri_, istr = &session->receiveResponse(response); - if (response.getStatus() != 307) + // Handle 307 Temporary Redirect in order to allow request redirection + // See https://docs.aws.amazon.com/AmazonS3/latest/dev/Redirects.html + if (response.getStatus() != Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT) break; auto location_iterator = response.find("Location"); diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index a40230e9878..5b6f9fdff4c 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -123,7 +123,9 @@ void WriteBufferFromS3::initiate() istr = &session->receiveResponse(response); - if (response.getStatus() != 307) + // Handle 307 Temporary Redirect in order to allow request redirection + // See https://docs.aws.amazon.com/AmazonS3/latest/dev/Redirects.html + if (response.getStatus() != Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT) break; auto location_iterator = response.find("Location"); @@ -193,7 +195,9 @@ void WriteBufferFromS3::writePart(const String & data) istr = &session->receiveResponse(response); - if (response.getStatus() != 307) + // Handle 307 Temporary Redirect in order to allow request redirection + // See https://docs.aws.amazon.com/AmazonS3/latest/dev/Redirects.html + if (response.getStatus() != Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT) break; auto location_iterator = response.find("Location"); @@ -264,7 +268,9 @@ void WriteBufferFromS3::complete() istr = &session->receiveResponse(response); - if (response.getStatus() != 307) + // Handle 307 Temporary Redirect in order to allow request redirection + // See https://docs.aws.amazon.com/AmazonS3/latest/dev/Redirects.html + if (response.getStatus() != Poco::Net::HTTPResponse::HTTP_TEMPORARY_REDIRECT) break; auto location_iterator = response.find("Location"); From b30dee621bc5f40bd89024d9bb81debc0f7ea0ae Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 18 Sep 2019 21:27:18 +0300 Subject: [PATCH 128/309] Search for ld.gold (since it is more common then just "gold") --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 53635ed3a05..9d82da6838a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,7 +139,7 @@ endif () string(REGEX MATCH "-?[0-9]+(.[0-9]+)?$" COMPILER_POSTFIX ${CMAKE_CXX_COMPILER}) find_program (LLD_PATH NAMES "lld${COMPILER_POSTFIX}" "lld") -find_program (GOLD_PATH NAMES "gold") +find_program (GOLD_PATH NAMES "ld.gold" "gold") if (COMPILER_CLANG AND LLD_PATH AND NOT LINKER_NAME) set (LINKER_NAME "lld") From 2062bedaf03a30ed0c1c84b893827efa6e38e9eb Mon Sep 17 00:00:00 2001 From: fenglv Date: Fri, 20 Sep 2019 00:46:41 +0800 Subject: [PATCH 129/309] Add repeat function --- .../src/Functions/registerFunctionsString.cpp | 2 + dbms/src/Functions/repeat.cpp | 236 ++++++++++++++++++ 2 files changed, 238 insertions(+) create mode 100644 dbms/src/Functions/repeat.cpp diff --git a/dbms/src/Functions/registerFunctionsString.cpp b/dbms/src/Functions/registerFunctionsString.cpp index e55e43ba2f9..1f4219b18f2 100644 --- 
a/dbms/src/Functions/registerFunctionsString.cpp +++ b/dbms/src/Functions/registerFunctionsString.cpp @@ -4,6 +4,7 @@ namespace DB { class FunctionFactory; +void registerFunctionRepeat(FunctionFactory &); void registerFunctionEmpty(FunctionFactory &); void registerFunctionNotEmpty(FunctionFactory &); void registerFunctionLength(FunctionFactory &); @@ -34,6 +35,7 @@ void registerFunctionTryBase64Decode(FunctionFactory &); void registerFunctionsString(FunctionFactory & factory) { + registerFunctionRepeat(factory); registerFunctionEmpty(factory); registerFunctionNotEmpty(factory); registerFunctionLength(factory); diff --git a/dbms/src/Functions/repeat.cpp b/dbms/src/Functions/repeat.cpp new file mode 100644 index 00000000000..5c821e498ca --- /dev/null +++ b/dbms/src/Functions/repeat.cpp @@ -0,0 +1,236 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + namespace ErrorCodes + { + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + } + + struct RepeatImpl + { + static void vectorNonConstStr( + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets, + const UInt64 repeatTime) + { + UInt64 data_size = 0; + res_offsets.assign(offsets); + for (UInt64 i = 0; i < offsets.size(); ++i) + { + data_size += (offsets[i] - offsets[i - 1] - 1) * repeatTime + 1; + res_offsets[i] = data_size; + } + res_data.resize(data_size); + for (UInt64 i = 0; i < res_offsets.size(); ++i) + { + array(data.data() + offsets[i - 1], res_data.data() + res_offsets[i - 1], offsets[i] - offsets[i - 1], repeatTime); + } + } + + static void + vectorConst(const String & copy_str, const UInt64 repeatTime, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) + { + UInt64 data_size = copy_str.size() * repeatTime + 1; + res_data.resize(data_size); + res_offsets.resize_fill(1, data_size); + array((UInt8 *)copy_str.data(), res_data.data(), copy_str.size() + 1, repeatTime); + } + + template + static void vectorNonConst( + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets, + const PaddedPODArray & col_num) + { + UInt64 data_size = 0; + res_offsets.assign(offsets); + for (UInt64 i = 0; i < col_num.size(); ++i) + { + data_size += (offsets[i] - offsets[i - 1] - 1) * col_num[i] + 1; + res_offsets[i] = data_size; + } + res_data.resize(data_size); + for (UInt64 i = 0; i < col_num.size(); ++i) + { + array(data.data() + offsets[i - 1], res_data.data() + res_offsets[i - 1], offsets[i] - offsets[i - 1], col_num[i]); + } + } + + + template + static void vectorNonConstInteger( + const String & copy_str, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets, const PaddedPODArray & col_num) + { + UInt64 data_size = 0; + res_offsets.resize(col_num.size()); + UInt64 str_size = copy_str.size(); + for (UInt64 i = 0; i < col_num.size(); ++i) + { + data_size += str_size * col_num[i] + 1; + res_offsets[i] = data_size; + } + res_data.resize(data_size); + for (UInt64 i = 0; i < col_num.size(); ++i) + { + array((UInt8 *)copy_str.data(), res_data.data() + res_offsets[i - 1], str_size + 1, col_num[i]); + } + } + + private: + template + static void array(const UInt8 * src, UInt8 * dst, const UInt64 size, T repeatTime) + { + UInt64 i = 0; + do + { + memcpy(dst, src, size - 1); + dst += size - 1; + ++i; + } while (i < repeatTime); + *dst = 0; + } + }; + + + template 
+ class FunctionRepeatImpl : public IFunction + { + template + static bool castType(const IDataType * type, F && f) + { + return castTypeToEither< + DataTypeUInt8, + DataTypeUInt16, + DataTypeUInt32, + DataTypeUInt64>(type, std::forward(f)); + } + + public: + static constexpr auto name = "repeat"; + static FunctionPtr create(const Context &) { return std::make_shared(); } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 2; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isString(arguments[0])) + throw Exception( + "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + if (!isUnsignedInteger(arguments[1])) + throw Exception( + "Illegal type " + arguments[1]->getName() + " of argument of function 1" + getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + return arguments[0]; + } + + bool useDefaultImplementationForConstants() const override { return true; } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override + { + const ColumnPtr strcolumn = block.getByPosition(arguments[0]).column; + const ColumnPtr numcolumn = block.getByPosition(arguments[1]).column; + + if (const ColumnString * col = checkAndGetColumn(strcolumn.get())) + { + if (const ColumnConst * scale_column_num = checkAndGetColumn(numcolumn.get())) + { + Field scale_field_num = scale_column_num->getField(); + UInt64 repeat_time = scale_field_num.get(); + auto col_res = ColumnString::create(); + Impl::vectorNonConstStr( + col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), repeat_time); + block.getByPosition(result).column = std::move(col_res); + } + else if (!castType( + block.getByPosition(arguments[1]).type.get(), [&](const auto & type) { + using DataType = std::decay_t; + using T0 = typename DataType::FieldType; + const ColumnVector * colnum = checkAndGetColumn>(numcolumn.get()); + if (col->size() > 1 && colnum->size() > 1 && col->size() != colnum->size()) + throw Exception( + "Column size doesn't match of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN); + auto col_res = ColumnString::create(); + if (colnum->size() == 1 && col->size() >= 1) + { + UInt64 repeat_time = colnum->get64(0); + Impl::vectorNonConstStr( + col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), repeat_time); + } + else + { + Impl::vectorNonConst( + col->getChars(), + col->getOffsets(), + col_res->getChars(), + col_res->getOffsets(), + colnum->getData()); + } + block.getByPosition(result).column = std::move(col_res); + return 0; + })) + ; + else + throw Exception( + "Illegal column " + block.getByPosition(arguments[1]).column->getName() + " of argument of function2 " + + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else if (const ColumnConst * scale_column_str = checkAndGetColumn(strcolumn.get())) + { + Field scale_field_str = scale_column_str->getField(); + String copy_str = scale_field_str.get(); + if (const ColumnConst * scale_column_num = checkAndGetColumn(numcolumn.get())) + { + Field scale_field_num = scale_column_num->getField(); + UInt64 repeat_time = scale_field_num.get(); + auto col_res = ColumnString::create(); + Impl::vectorConst(copy_str, repeat_time, col_res->getChars(), col_res->getOffsets()); + block.getByPosition(result).column = std::move(col_res); + } + else if (!castType(block.getByPosition(arguments[1]).type.get(), [&](const auto & type) { + 
using DataType = std::decay_t; + using T0 = typename DataType::FieldType; + const ColumnVector * colnum = checkAndGetColumn>(numcolumn.get()); + auto col_res = ColumnString::create(); + Impl::vectorNonConstInteger(copy_str, col_res->getChars(), col_res->getOffsets(), colnum->getData()); + block.getByPosition(result).column = std::move(col_res); + return 0; + })) + ; + else + throw Exception( + "Illegal column " + block.getByPosition(arguments[1]).column->getName() + " of argument of function2 " + + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else + throw Exception( + "Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + }; + + using FunctionRepeat = FunctionRepeatImpl; + + void registerFunctionRepeat(FunctionFactory & factory) + { + factory.registerFunction(); + } +} From b2a3db1048629ba5bf04103ba1c8b99130691d44 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 18 Sep 2019 00:25:19 +0300 Subject: [PATCH 130/309] Ignore indices for CREATE TABLE .. AS .. if storage do not support them --- dbms/src/Interpreters/InterpreterCreateQuery.cpp | 4 +++- .../0_stateless/01011_test_create_as_skip_indices.reference | 0 .../0_stateless/01011_test_create_as_skip_indices.sql | 5 +++++ 3 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 dbms/tests/queries/0_stateless/01011_test_create_as_skip_indices.reference create mode 100644 dbms/tests/queries/0_stateless/01011_test_create_as_skip_indices.sql diff --git a/dbms/src/Interpreters/InterpreterCreateQuery.cpp b/dbms/src/Interpreters/InterpreterCreateQuery.cpp index 59ae2fc2b72..f2189a59866 100644 --- a/dbms/src/Interpreters/InterpreterCreateQuery.cpp +++ b/dbms/src/Interpreters/InterpreterCreateQuery.cpp @@ -2,6 +2,7 @@ #include +#include #include #include @@ -416,7 +417,8 @@ ColumnsDescription InterpreterCreateQuery::setProperties( else if (!create.as_table.empty()) { columns = as_storage->getColumns(); - indices = as_storage->getIndices(); + if (create.storage && endsWith(create.storage->engine->name, "MergeTree")) + indices = as_storage->getIndices(); constraints = as_storage->getConstraints(); } else if (create.select) diff --git a/dbms/tests/queries/0_stateless/01011_test_create_as_skip_indices.reference b/dbms/tests/queries/0_stateless/01011_test_create_as_skip_indices.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/queries/0_stateless/01011_test_create_as_skip_indices.sql b/dbms/tests/queries/0_stateless/01011_test_create_as_skip_indices.sql new file mode 100644 index 00000000000..b702fc3654c --- /dev/null +++ b/dbms/tests/queries/0_stateless/01011_test_create_as_skip_indices.sql @@ -0,0 +1,5 @@ +SET allow_experimental_data_skipping_indices=1; +CREATE TABLE foo (key int, INDEX i1 key TYPE minmax GRANULARITY 1) Engine=MergeTree() ORDER BY key; +CREATE TABLE as_foo AS foo; +CREATE TABLE dist (key int, INDEX i1 key TYPE minmax GRANULARITY 1) Engine=Distributed(test_shard_localhost, currentDatabase(), 'foo'); -- { serverError 36 } +CREATE TABLE dist_as_foo Engine=Distributed(test_shard_localhost, currentDatabase(), 'foo') AS foo; From 7309e813bcaff58f3fe5a320cf0e33440565283d Mon Sep 17 00:00:00 2001 From: stavrolia Date: Fri, 20 Sep 2019 14:26:00 +0300 Subject: [PATCH 131/309] cleanup hdfs docs --- dbms/src/IO/HDFSCommon.cpp | 4 +- dbms/src/IO/WriteBufferFromHDFS.cpp | 7 ++- dbms/src/Storages/StorageFile.cpp | 2 +- .../integration/test_storage_hdfs/test.py | 44 ++++++++++----- 
docs/en/operations/table_engines/hdfs.md | 52 ++++++++++++++++++ .../en/query_language/table_functions/file.md | 47 +++++++++++++--- .../en/query_language/table_functions/hdfs.md | 49 ++++++++++++++--- docs/ru/operations/table_engines/hdfs.md | 54 ++++++++++++++++++- .../ru/query_language/table_functions/file.md | 48 ++++++++++++++--- .../ru/query_language/table_functions/hdfs.md | 14 ++--- 10 files changed, 278 insertions(+), 43 deletions(-) diff --git a/dbms/src/IO/HDFSCommon.cpp b/dbms/src/IO/HDFSCommon.cpp index 1c9980105f0..2bd420131b0 100644 --- a/dbms/src/IO/HDFSCommon.cpp +++ b/dbms/src/IO/HDFSCommon.cpp @@ -17,8 +17,8 @@ HDFSBuilderPtr createHDFSBuilder(const std::string & uri_str) const Poco::URI uri(uri_str); auto & host = uri.getHost(); auto port = uri.getPort(); - auto & path = uri.getPath(); - if (host.empty() || path.empty()) + const std::string path = "//"; + if (host.empty()) throw Exception("Illegal HDFS URI: " + uri.toString(), ErrorCodes::BAD_ARGUMENTS); HDFSBuilderPtr builder(hdfsNewBuilder()); diff --git a/dbms/src/IO/WriteBufferFromHDFS.cpp b/dbms/src/IO/WriteBufferFromHDFS.cpp index 2cd83ee6479..9733d761ee4 100644 --- a/dbms/src/IO/WriteBufferFromHDFS.cpp +++ b/dbms/src/IO/WriteBufferFromHDFS.cpp @@ -15,6 +15,7 @@ namespace ErrorCodes extern const int NETWORK_ERROR; extern const int CANNOT_OPEN_FILE; extern const int CANNOT_FSYNC; +extern const int BAD_ARGUMENTS; } @@ -32,10 +33,12 @@ struct WriteBufferFromHDFS::WriteBufferFromHDFSImpl { const size_t begin_of_path = hdfs_uri.find('/', hdfs_uri.find("//") + 2); const std::string path = hdfs_uri.substr(begin_of_path); - if (path.find("*?{") != std::string::npos) + if (path.find_first_of("*?{") != std::string::npos) throw Exception("URI '" + hdfs_uri + "' contains globs, so the table is in readonly mode", ErrorCodes::CANNOT_OPEN_FILE); - fout = hdfsOpenFile(fs.get(), path.c_str(), O_WRONLY, 0, 0, 0); + if (!hdfsExists(fs.get(), path.c_str())) + throw Exception("File: " + path + " is already exists", ErrorCodes::BAD_ARGUMENTS); + fout = hdfsOpenFile(fs.get(), path.c_str(), O_WRONLY, 0, 0, 0); /// O_WRONLY meaning create or overwrite i.e., implies O_TRUNCAT here if (fout == nullptr) { diff --git a/dbms/src/Storages/StorageFile.cpp b/dbms/src/Storages/StorageFile.cpp index d5d8cd0856e..79534c1c6b0 100644 --- a/dbms/src/Storages/StorageFile.cpp +++ b/dbms/src/Storages/StorageFile.cpp @@ -282,7 +282,7 @@ public: else { if (storage.paths.size() != 1) - throw Exception("Table '" + storage.table_name + "' is in readonly mode", ErrorCodes::DATABASE_ACCESS_DENIED); + throw Exception("Table '" + storage.table_name + "' is in readonly mode because of globs in filepath", ErrorCodes::DATABASE_ACCESS_DENIED); write_buf = std::make_unique(storage.paths[0], DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_APPEND | O_CREAT); } diff --git a/dbms/tests/integration/test_storage_hdfs/test.py b/dbms/tests/integration/test_storage_hdfs/test.py index 55ef98f6fde..cf4205115ff 100644 --- a/dbms/tests/integration/test_storage_hdfs/test.py +++ b/dbms/tests/integration/test_storage_hdfs/test.py @@ -29,7 +29,6 @@ def started_cluster(): def test_read_write_storage(started_cluster): hdfs_api = HDFSApi("root") - hdfs_api.write_data("/simple_storage", "1\tMark\t72.53\n") node1.query("create table SimpleHDFSStorage (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/simple_storage', 'TSV')") node1.query("insert into SimpleHDFSStorage values (1, 'Mark', 72.53)") @@ -39,19 +38,40 @@ def test_read_write_storage(started_cluster): def 
test_read_write_storage_with_globs(started_cluster): hdfs_api = HDFSApi("root") - for i in ["1", "2", "3"]: - hdfs_api.write_data("/storage" + i, i + "\tMark\t72.53\n") - assert hdfs_api.read_data("/storage" + i) == i + "\tMark\t72.53\n" - node1.query("create table HDFSStorageWithRange (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/storage{1..5}', 'TSV')") node1.query("create table HDFSStorageWithEnum (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/storage{1,2,3,4,5}', 'TSV')") node1.query("create table HDFSStorageWithQuestionMark (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/storage?', 'TSV')") node1.query("create table HDFSStorageWithAsterisk (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/storage*', 'TSV')") - assert node1.query("select count(*) from HDFSStorageWithRange") == '3\n' - assert node1.query("select count(*) from HDFSStorageWithEnum") == '3\n' - assert node1.query("select count(*) from HDFSStorageWithQuestionMark") == '3\n' - assert node1.query("select count(*) from HDFSStorageWithAsterisk") == '3\n' + for i in ["1", "2", "3"]: + hdfs_api.write_data("/storage" + i, i + "\tMark\t72.53\n") + assert hdfs_api.read_data("/storage" + i) == i + "\tMark\t72.53\n" + + assert node1.query("select count(*) from HDFSStorageWithRange") == "3\n" + assert node1.query("select count(*) from HDFSStorageWithEnum") == "3\n" + assert node1.query("select count(*) from HDFSStorageWithQuestionMark") == "3\n" + assert node1.query("select count(*) from HDFSStorageWithAsterisk") == "3\n" + + try: + node1.query("insert into HDFSStorageWithEnum values (1, 'NEW', 4.2)") + assert False, "Exception have to be thrown" + except Exception as ex: + print ex + assert "in readonly mode" in str(ex) + + try: + node1.query("insert into HDFSStorageWithQuestionMark values (1, 'NEW', 4.2)") + assert False, "Exception have to be thrown" + except Exception as ex: + print ex + assert "in readonly mode" in str(ex) + + try: + node1.query("insert into HDFSStorageWithAsterisk values (1, 'NEW', 4.2)") + assert False, "Exception have to be thrown" + except Exception as ex: + print ex + assert "in readonly mode" in str(ex) def test_read_write_table(started_cluster): hdfs_api = HDFSApi("root") @@ -78,18 +98,18 @@ def test_bad_hdfs_uri(started_cluster): node1.query("create table BadStorage1 (id UInt32, name String, weight Float64) ENGINE = HDFS('hads:hgsdfs100500:9000/other_storage', 'TSV')") except Exception as ex: print ex - assert 'Illegal HDFS URI' in str(ex) + assert "Illegal HDFS URI" in str(ex) try: node1.query("create table BadStorage2 (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs100500:9000/other_storage', 'TSV')") except Exception as ex: print ex - assert 'Unable to create builder to connect to HDFS' in str(ex) + assert "Unable to create builder to connect to HDFS" in str(ex) try: node1.query("create table BadStorage3 (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/<>', 'TSV')") except Exception as ex: print ex - assert 'Unable to open HDFS file' in str(ex) + assert "Unable to open HDFS file" in str(ex) def test_globs_in_read_table(started_cluster): hdfs_api = HDFSApi("root") diff --git a/docs/en/operations/table_engines/hdfs.md b/docs/en/operations/table_engines/hdfs.md index 652ca43b176..efab476aded 100644 --- a/docs/en/operations/table_engines/hdfs.md +++ b/docs/en/operations/table_engines/hdfs.md @@ -13,6 +13,7 @@ The `format` parameter specifies one of the 
available file formats. To perform `SELECT` queries, the format must be supported for input, and to perform `INSERT` queries -- for output. The available formats are listed in the [Formats](../../interfaces/formats.md#formats) section. +The path part of `URI` may contain globs. In this case the table would be readonly. **Example:** @@ -48,4 +49,55 @@ SELECT * FROM hdfs_engine_table LIMIT 2 - Indexes. - Replication. +**Globs in path** + +Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern. Listing of files determines during `SELECT` (not at `CREATE` moment). + +- `*` — Substitutes any number of any characters including none. +- `?` — Substitutes any single character. +- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Substitutes any number in range from N to M including both borders. + + Constructions with `{}` are similar to the [remote table function](../../query_language/table_functions/remote.md)). + +**Example** + +1. Suppose we have several files in TSV format with the following URIs on HDFS: + +- 'hdfs://hdfs1:9000/some_dir/some_file_1' +- 'hdfs://hdfs1:9000/some_dir/some_file_2' +- 'hdfs://hdfs1:9000/some_dir/some_file_3' +- 'hdfs://hdfs1:9000/another_dir/some_file_1' +- 'hdfs://hdfs1:9000/another_dir/some_file_2' +- 'hdfs://hdfs1:9000/another_dir/some_file_3' + +2. There are several ways to make a table consisting of all six files: + +```sql +CREATE TABLE table_with_range (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV') +``` + +Another way: + +```sql +CREATE TABLE table_with_question_mark (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_?', 'TSV') +``` + +Table consists of all the files in both directories (all files should satisfy format and schema described in query): + +```sql +CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV') +``` + +!!! warning + If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. + +**Example** + +Create table with files named `file000`, `file001`, ... , `file999`: + +```sql +CREARE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV') +``` + [Original article](https://clickhouse.yandex/docs/en/operations/table_engines/hdfs/) diff --git a/docs/en/query_language/table_functions/file.md b/docs/en/query_language/table_functions/file.md index 0cb1f0d36bf..de5e679a7b8 100644 --- a/docs/en/query_language/table_functions/file.md +++ b/docs/en/query_language/table_functions/file.md @@ -1,7 +1,7 @@ # file -Creates a table from a file. +Creates a table from a file. This table function is similar to [url](url.md) and [hdfs](hdfs.md) ones. ``` file(path, format, structure) @@ -53,14 +53,49 @@ SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 U **Globs in path** -- `*` — Matches any number of any characters including none. -- `?` — Matches any single character. -- `{some_string,another_string,yet_another_one}` — Matches any of strings `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Matches any number in range from N to M including both borders. +Multiple path components can have globs. 
For being processed file should exists and matches to the whole path pattern (not only suffix or prefix). +- `*` — Substitutes any number of any characters including none. +- `?` — Substitutes any single character. +- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Substitutes any number in range from N to M including both borders. + + Constructions with `{}` are similar to the [remote table function](../../query_language/table_functions/remote.md)). + +**Example** + +1. Suppose we have several files with the following relative paths: + +- 'some_dir/some_file_1' +- 'some_dir/some_file_2' +- 'some_dir/some_file_3' +- 'another_dir/some_file_1' +- 'another_dir/some_file_2' +- 'another_dir/some_file_3' + +2. Query the amount of rows in these files: + +```sql +SELECT count(*) +FROM file('{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32') +``` + +3. Query the amount of rows in all files of these two directories: + +```sql +SELECT count(*) +FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32') +``` !!! warning If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. -Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern. +**Example** + +Query the data from files named `file000`, `file001`, ... , `file999`: + +```sql +SELECT count(*) +FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32') +``` [Original article](https://clickhouse.yandex/docs/en/query_language/table_functions/file/) diff --git a/docs/en/query_language/table_functions/hdfs.md b/docs/en/query_language/table_functions/hdfs.md index cce9b308101..a438f9a9ae6 100644 --- a/docs/en/query_language/table_functions/hdfs.md +++ b/docs/en/query_language/table_functions/hdfs.md @@ -1,7 +1,7 @@ # hdfs -Creates a table from a file in HDFS. +Creates a table from files in HDFS. This table function is similar to [url](url.md) and [file](file.md) ones. ``` hdfs(URI, format, structure) @@ -36,14 +36,51 @@ LIMIT 2 **Globs in path** -- `*` — Matches any number of any characters including none. -- `?` — Matches any single character. -- `{some_string,another_string,yet_another_one}` — Matches any of strings `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Matches any number in range from N to M including both borders. +Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern (not only suffix or prefix). + +- `*` — Substitutes any number of any characters including none. +- `?` — Substitutes any single character. +- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Substitutes any number in range from N to M including both borders. + + + Constructions with `{}` are similar to the [remote table function](../../query_language/table_functions/remote.md)). + +**Example** + +1. Suppose that we have several files with following URIs on HDFS: + +- 'hdfs://hdfs1:9000/some_dir/some_file_1' +- 'hdfs://hdfs1:9000/some_dir/some_file_2' +- 'hdfs://hdfs1:9000/some_dir/some_file_3' +- 'hdfs://hdfs1:9000/another_dir/some_file_1' +- 'hdfs://hdfs1:9000/another_dir/some_file_2' +- 'hdfs://hdfs1:9000/another_dir/some_file_3' + +2. 
Query the amount of rows in these files: + +```sql +SELECT count(*) +FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32') +``` + +3. Query the amount of rows in all files of these two directories: + +```sql +SELECT count(*) +FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV', 'name String, value UInt32') +``` !!! warning If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. -Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern. +**Example** + +Query the data from files named `file000`, `file001`, ... , `file999`: + +```sql +SELECT count(*) +FROM hdfs('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32') +``` [Original article](https://clickhouse.yandex/docs/en/query_language/table_functions/hdfs/) diff --git a/docs/ru/operations/table_engines/hdfs.md b/docs/ru/operations/table_engines/hdfs.md index 3f42c9ec447..fd7006d1d9a 100644 --- a/docs/ru/operations/table_engines/hdfs.md +++ b/docs/ru/operations/table_engines/hdfs.md @@ -1,6 +1,6 @@ # HDFS {#table_engines-hdfs} -Управляет данными в HDFS. Данный движок похож на движок [File](file.md) и на движок [URL](url.md). +Управляет данными в HDFS. Данный движок похож на движки [File](file.md) и [URL](url.md). ## Использование движка @@ -10,6 +10,7 @@ ENGINE = HDFS(URI, format) В параметр `URI` нужно передавать полный URI файла в HDFS. Параметр `format` должен быть таким, который ClickHouse может использовать и в запросах `INSERT`, и в запросах `SELECT`. Полный список поддерживаемых форматов смотрите в разделе [Форматы](../../interfaces/formats.md#formats). +Часть URI с путем файла может содержать шаблоны. В этом случае таблица может использоваться только для чтения. **Пример:** @@ -45,4 +46,55 @@ SELECT * FROM hdfs_engine_table LIMIT 2 - индексы; - репликация. +**Шаблоны в пути** + +Шаблоны могут содержаться в нескольких компонентах пути. Обрабатываются только существующие файлы, название которых целиком удовлетворяет шаблону (не только суффиксом или префиксом). + +- `*` — Заменяет любое количество любых символов, включая отсутствие символов. +- `?` — Заменяет ровно один любой символ. +- `{some_string,another_string,yet_another_one}` — Заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Заменяет любое число в интервале от `N` до `M` включительно. + +Конструкция с `{}` аналогична табличной функции [remote](remote.md). + +**Пример** + +1. Предположим, у нас есть несколько файлов со следующими URI в HDFS: + +- 'hdfs://hdfs1:9000/some_dir/some_file_1' +- 'hdfs://hdfs1:9000/some_dir/some_file_2' +- 'hdfs://hdfs1:9000/some_dir/some_file_3' +- 'hdfs://hdfs1:9000/another_dir/some_file_1' +- 'hdfs://hdfs1:9000/another_dir/some_file_2' +- 'hdfs://hdfs1:9000/another_dir/some_file_3' + +2. 
Есть несколько возможностей создать таблицу, состояющую из этих шести файлов: + +```sql +CREATE TABLE table_with_range (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_{1..3}', 'TSV') +``` + +Другой способ: + +```sql +CREATE TABLE table_with_question_mark (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/some_file_?', 'TSV') +``` + +Таблица, состоящая из всех файлов в обеих директориях (все файлы должны удовлетворять формату и схеме, указанной в запросе): + +```sql +CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV') +``` + +!!! warning + Если список файлов содержит числовые интервалы с ведущими нулями, используйте конструкцию с фигурными скобочками для каждой цифры или используйте `?`. + +**Example** + +Создадим таблицу с именами `file000`, `file001`, ... , `file999`: + +```sql +CREARE TABLE big_table (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV') +``` + [Оригинальная статья](https://clickhouse.yandex/docs/ru/operations/table_engines/hdfs/) diff --git a/docs/ru/query_language/table_functions/file.md b/docs/ru/query_language/table_functions/file.md index 9fc82b151b8..6fbcebceba0 100644 --- a/docs/ru/query_language/table_functions/file.md +++ b/docs/ru/query_language/table_functions/file.md @@ -1,7 +1,7 @@ # file -Создаёт таблицу из файла. +Создаёт таблицу из файла. Данная табличная функция похожа на табличные функции [file](file.md) и [hdfs](hdfs.md). ``` file(path, format, structure) @@ -45,16 +45,50 @@ LIMIT 2 └─────────┴─────────┴─────────┘ ``` -**Шаблоны в пути файла** +Шаблоны могут содержаться в нескольких компонентах пути. Обрабатываются только существующие файлы, название которых целиком удовлетворяет шаблону (не только суффиксом или префиксом). -- `*` — Матчит любое количество любых символов, включая отсутствие символов. -- `?` — Матчит ровно один любой символ. -- `{some_string,another_string,yet_another_one}` — Матчит любую из строк `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Матчит любое число в интервале от `N` до `M` включительно. +- `*` — Заменяет любое количество любых символов, включая отсутствие символов. +- `?` — Заменяет ровно один любой символ. +- `{some_string,another_string,yet_another_one}` — Заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Заменяет любое число в интервале от `N` до `M` включительно. + +Конструкция с `{}` аналогична табличной функции [remote](remote.md). + +**Пример** + +1. Предположим у нас есть несколько файлов со следующими относительными путями: + +- 'some_dir/some_file_1' +- 'some_dir/some_file_2' +- 'some_dir/some_file_3' +- 'another_dir/some_file_1' +- 'another_dir/some_file_2' +- 'another_dir/some_file_3' + +2. Запросим количество строк в этих файлах: + +```sql +SELECT count(*) +FROM file('{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32') +``` + +3. Запросим количество строк во всех файлах этих двух директорий: + +```sql +SELECT count(*) +FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32') +``` !!! warning Если ваш список файлов содержит интервал с ведущими нулями, используйте конструкцию с фигурными скобками для каждой цифры по отдельности или используйте `?`. -Шаблоны могут содержаться в разных частях пути. Обрабатываться будут ровно те файлы, которые и удовлетворяют всему шаблону пути, и существуют в файловой системе. 
+**Пример** + +Запрос данных из файлов с именами `file000`, `file001`, ... , `file999`: + +```sql +SELECT count(*) +FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32') +``` [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/table_functions/file/) diff --git a/docs/ru/query_language/table_functions/hdfs.md b/docs/ru/query_language/table_functions/hdfs.md index ae881edea35..58f0accc19d 100644 --- a/docs/ru/query_language/table_functions/hdfs.md +++ b/docs/ru/query_language/table_functions/hdfs.md @@ -1,7 +1,7 @@ # hdfs -Создаёт таблицу из файла в HDFS. +Создаёт таблицу из файла в HDFS. Данная табличная функция похожа на табличные функции [url](url.md) и [file](file.md). ``` hdfs(URI, format, structure) @@ -33,12 +33,14 @@ LIMIT 2 └─────────┴─────────┴─────────┘ ``` -**Шаблоны в пути файла** +**Шаблоны в пути** -- `*` — Матчит любое количество любых символов, включая отсутствие символов. -- `?` — Матчит ровно один любой символ. -- `{some_string,another_string,yet_another_one}` — Матчит любую из строк `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Матчит любое число в интервале от `N` до `M` включительно. +- `*` — Заменяет любое количество любых символов, включая отсутствие символов. +- `?` — Заменяет ровно один любой символ. +- `{some_string,another_string,yet_another_one}` — Заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — Заменяет любое число в интервале от `N` до `M` включительно. + +Конструкция с `{}` аналогична табличной функции [remote](remote.md). !!! warning Если ваш список файлов содержит интервал с ведущими нулями, используйте конструкцию с фигурными скобками для каждой цифры по отдельности или используйте `?`. From ca36e3f861eb36b9ea928d80672b2ca8fa9f6901 Mon Sep 17 00:00:00 2001 From: maqroll Date: Fri, 20 Sep 2019 12:36:09 +0000 Subject: [PATCH 132/309] Attach error to force ip refresh --- dbms/src/IO/ReadWriteBufferFromHTTP.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.h b/dbms/src/IO/ReadWriteBufferFromHTTP.h index 6382077dd6a..62db3c22a2c 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.h +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.h @@ -114,13 +114,13 @@ namespace detail auto sess = session->getSession(); - auto & stream_out = sess->sendRequest(request); - - if (out_stream_callback) - out_stream_callback(stream_out); - try { + auto & stream_out = sess->sendRequest(request); + + if (out_stream_callback) + out_stream_callback(stream_out); + istr = receiveResponse(*sess, request, response, true); response.getCookies(cookies); From bf2654e0930f465b929fd28bfd43be5307d8b235 Mon Sep 17 00:00:00 2001 From: sfod Date: Fri, 20 Sep 2019 17:39:26 +0300 Subject: [PATCH 133/309] Use sql file to test changes --- .../01013_totals_without_aggregation.reference | 10 +++++++--- .../01013_totals_without_aggregation.sh | 15 --------------- .../01013_totals_without_aggregation.sql | 6 ++++++ 3 files changed, 13 insertions(+), 18 deletions(-) delete mode 100755 dbms/tests/queries/0_stateless/01013_totals_without_aggregation.sh create mode 100755 dbms/tests/queries/0_stateless/01013_totals_without_aggregation.sql diff --git a/dbms/tests/queries/0_stateless/01013_totals_without_aggregation.reference b/dbms/tests/queries/0_stateless/01013_totals_without_aggregation.reference index 7614df8ec46..6dddf22a467 100644 --- a/dbms/tests/queries/0_stateless/01013_totals_without_aggregation.reference +++ 
b/dbms/tests/queries/0_stateless/01013_totals_without_aggregation.reference @@ -1,3 +1,7 @@ -ok -ok -ok +11 + +11 +12 +12 +13 +13 diff --git a/dbms/tests/queries/0_stateless/01013_totals_without_aggregation.sh b/dbms/tests/queries/0_stateless/01013_totals_without_aggregation.sh deleted file mode 100755 index c159a73388d..00000000000 --- a/dbms/tests/queries/0_stateless/01013_totals_without_aggregation.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash - -CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -. $CURDIR/../shell_config.sh - -EXCEPTION_SUCCESS_TEXT=ok - -# Must throw an exception -EXCEPTION_TEXT="WITH TOTALS, ROLLUP or CUBE are not supported without aggregation" -$CLICKHOUSE_CLIENT --query="SELECT 1 AS id, 'hello' AS s WITH TOTALS" 2>&1 \ - | grep -q "$EXCEPTION_TEXT" && echo "$EXCEPTION_SUCCESS_TEXT" || echo "Did not throw an exception" -$CLICKHOUSE_CLIENT --query="SELECT 1 AS id, 'hello' AS s WITH ROLLUP" 2>&1 \ - | grep -q "$EXCEPTION_TEXT" && echo "$EXCEPTION_SUCCESS_TEXT" || echo "Did not throw an exception" -$CLICKHOUSE_CLIENT --query="SELECT 1 AS id, 'hello' AS s WITH CUBE" 2>&1 \ - | grep -q "$EXCEPTION_TEXT" && echo "$EXCEPTION_SUCCESS_TEXT" || echo "Did not throw an exception" diff --git a/dbms/tests/queries/0_stateless/01013_totals_without_aggregation.sql b/dbms/tests/queries/0_stateless/01013_totals_without_aggregation.sql new file mode 100755 index 00000000000..bed393b63d3 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01013_totals_without_aggregation.sql @@ -0,0 +1,6 @@ +SELECT 11 AS n GROUP BY n WITH TOTALS; +SELECT 12 AS n GROUP BY n WITH ROLLUP; +SELECT 13 AS n GROUP BY n WITH CUBE; +SELECT 1 AS n WITH TOTALS; -- { serverError 49 } +SELECT 1 AS n WITH ROLLUP; -- { serverError 49 } +SELECT 1 AS n WITH CUBE; -- { serverError 49 } From bc9bcfdb7ee79de92e16e47dc26196b6f93ad061 Mon Sep 17 00:00:00 2001 From: sfod Date: Fri, 20 Sep 2019 17:42:30 +0300 Subject: [PATCH 134/309] Fix tests which use WITH TOTALS without aggregation --- .../00378_json_quote_64bit_integers.reference | 20 +++++++++---------- .../00378_json_quote_64bit_integers.sql | 12 +++++------ .../00937_template_output_format.reference | 4 ++-- .../00937_template_output_format.sql | 2 +- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/dbms/tests/queries/0_stateless/00378_json_quote_64bit_integers.reference b/dbms/tests/queries/0_stateless/00378_json_quote_64bit_integers.reference index 22395188fe6..49c937e09df 100644 --- a/dbms/tests/queries/0_stateless/00378_json_quote_64bit_integers.reference +++ b/dbms/tests/queries/0_stateless/00378_json_quote_64bit_integers.reference @@ -48,10 +48,10 @@ { "i0": "0", "u0": "0", - "ip": "0", - "in": "0", - "up": "0", - "arr": [], + "ip": "9223372036854775807", + "in": "-9223372036854775808", + "up": "18446744073709551615", + "arr": ["0"], "tuple": ["0","0"] }, @@ -119,7 +119,7 @@ ["0", "0", "9223372036854775807", "-9223372036854775808", "18446744073709551615", ["0"], ["0","0"]] ], - "totals": ["0","0","0","0","0",[],["0","0"]], + "totals": ["0","0","9223372036854775807","-9223372036854775808","18446744073709551615",["0"],["0","0"]], "extremes": { @@ -180,10 +180,10 @@ { "i0": 0, "u0": 0, - "ip": 0, - "in": 0, - "up": 0, - "arr": [], + "ip": 9223372036854775807, + "in": -9223372036854775808, + "up": 18446744073709551615, + "arr": [0], "tuple": [0,0] }, @@ -251,7 +251,7 @@ [0, 0, 9223372036854775807, -9223372036854775808, 18446744073709551615, [0], [0,0]] ], - "totals": [0,0,0,0,0,[],[0,0]], + "totals": 
[0,0,9223372036854775807,-9223372036854775808,18446744073709551615,[0],[0,0]], "extremes": { diff --git a/dbms/tests/queries/0_stateless/00378_json_quote_64bit_integers.sql b/dbms/tests/queries/0_stateless/00378_json_quote_64bit_integers.sql index 261a044c711..2d99202a8ac 100644 --- a/dbms/tests/queries/0_stateless/00378_json_quote_64bit_integers.sql +++ b/dbms/tests/queries/0_stateless/00378_json_quote_64bit_integers.sql @@ -2,11 +2,11 @@ SET output_format_write_statistics = 0; SET extremes = 1; SET output_format_json_quote_64bit_integers = 1; -SELECT toInt64(0) as i0, toUInt64(0) as u0, toInt64(9223372036854775807) as ip, toInt64(-9223372036854775808) as in, toUInt64(18446744073709551615) as up, [toInt64(0)] as arr, (toUInt64(0), toUInt64(0)) as tuple WITH TOTALS FORMAT JSON; -SELECT toInt64(0) as i0, toUInt64(0) as u0, toInt64(9223372036854775807) as ip, toInt64(-9223372036854775808) as in, toUInt64(18446744073709551615) as up, [toInt64(0)] as arr, (toUInt64(0), toUInt64(0)) as tuple WITH TOTALS FORMAT JSONCompact; -SELECT toInt64(0) as i0, toUInt64(0) as u0, toInt64(9223372036854775807) as ip, toInt64(-9223372036854775808) as in, toUInt64(18446744073709551615) as up, [toInt64(0)] as arr, (toUInt64(0), toUInt64(0)) as tuple WITH TOTALS FORMAT JSONEachRow; +SELECT toInt64(0) as i0, toUInt64(0) as u0, toInt64(9223372036854775807) as ip, toInt64(-9223372036854775808) as in, toUInt64(18446744073709551615) as up, [toInt64(0)] as arr, (toUInt64(0), toUInt64(0)) as tuple GROUP BY i0, u0, ip, in, up, arr, tuple WITH TOTALS FORMAT JSON; +SELECT toInt64(0) as i0, toUInt64(0) as u0, toInt64(9223372036854775807) as ip, toInt64(-9223372036854775808) as in, toUInt64(18446744073709551615) as up, [toInt64(0)] as arr, (toUInt64(0), toUInt64(0)) as tuple GROUP BY i0, u0, ip, in, up, arr, tuple WITH TOTALS FORMAT JSONCompact; +SELECT toInt64(0) as i0, toUInt64(0) as u0, toInt64(9223372036854775807) as ip, toInt64(-9223372036854775808) as in, toUInt64(18446744073709551615) as up, [toInt64(0)] as arr, (toUInt64(0), toUInt64(0)) as tuple GROUP BY i0, u0, ip, in, up, arr, tuple WITH TOTALS FORMAT JSONEachRow; SET output_format_json_quote_64bit_integers = 0; -SELECT toInt64(0) as i0, toUInt64(0) as u0, toInt64(9223372036854775807) as ip, toInt64(-9223372036854775808) as in, toUInt64(18446744073709551615) as up, [toInt64(0)] as arr, (toUInt64(0), toUInt64(0)) as tuple WITH TOTALS FORMAT JSON; -SELECT toInt64(0) as i0, toUInt64(0) as u0, toInt64(9223372036854775807) as ip, toInt64(-9223372036854775808) as in, toUInt64(18446744073709551615) as up, [toInt64(0)] as arr, (toUInt64(0), toUInt64(0)) as tuple WITH TOTALS FORMAT JSONCompact; -SELECT toInt64(0) as i0, toUInt64(0) as u0, toInt64(9223372036854775807) as ip, toInt64(-9223372036854775808) as in, toUInt64(18446744073709551615) as up, [toInt64(0)] as arr, (toUInt64(0), toUInt64(0)) as tuple WITH TOTALS FORMAT JSONEachRow; \ No newline at end of file +SELECT toInt64(0) as i0, toUInt64(0) as u0, toInt64(9223372036854775807) as ip, toInt64(-9223372036854775808) as in, toUInt64(18446744073709551615) as up, [toInt64(0)] as arr, (toUInt64(0), toUInt64(0)) as tuple GROUP BY i0, u0, ip, in, up, arr, tuple WITH TOTALS FORMAT JSON; +SELECT toInt64(0) as i0, toUInt64(0) as u0, toInt64(9223372036854775807) as ip, toInt64(-9223372036854775808) as in, toUInt64(18446744073709551615) as up, [toInt64(0)] as arr, (toUInt64(0), toUInt64(0)) as tuple GROUP BY i0, u0, ip, in, up, arr, tuple WITH TOTALS FORMAT JSONCompact; +SELECT toInt64(0) as i0, toUInt64(0) as u0, 
toInt64(9223372036854775807) as ip, toInt64(-9223372036854775808) as in, toUInt64(18446744073709551615) as up, [toInt64(0)] as arr, (toUInt64(0), toUInt64(0)) as tuple GROUP BY i0, u0, ip, in, up, arr, tuple WITH TOTALS FORMAT JSONEachRow; diff --git a/dbms/tests/queries/0_stateless/00937_template_output_format.reference b/dbms/tests/queries/0_stateless/00937_template_output_format.reference index c4cfb4ed3a4..c7b81d262ae 100644 --- a/dbms/tests/queries/0_stateless/00937_template_output_format.reference +++ b/dbms/tests/queries/0_stateless/00937_template_output_format.reference @@ -2,8 +2,8 @@ n: "123", s1: qwe,rty, s2: 'as"df\'gh', s3: "", s4: "zx cv bn m", d: 2016-01-01, n: 123 ; n: "456", s1: as"df\'gh, s2: '', s3: "zx\ncv\tbn m", s4: "qwe,rty", d: 2016-01-02, n: 456 ; -n: "9876543210", s1: , s2: 'zx\ncv\tbn m', s3: "qwe,rty", s4: "as""df'gh", d: 2016-01-03, n: 9876543210 ; -n: "789", s1: zx\ncv\tbn m, s2: 'qwe,rty', s3: "as\"df'gh", s4: "", d: 2016-01-04, n: 789 +n: "789", s1: zx\ncv\tbn m, s2: 'qwe,rty', s3: "as\"df'gh", s4: "", d: 2016-01-04, n: 789 ; +n: "9876543210", s1: , s2: 'zx\ncv\tbn m', s3: "qwe,rty", s4: "as""df'gh", d: 2016-01-03, n: 9876543210 ------ n: "0", s1: , s2: '', s3: "", s4: "", d: 0000-00-00, n: 0 ------ diff --git a/dbms/tests/queries/0_stateless/00937_template_output_format.sql b/dbms/tests/queries/0_stateless/00937_template_output_format.sql index 7a981c641da..40312272ccb 100644 --- a/dbms/tests/queries/0_stateless/00937_template_output_format.sql +++ b/dbms/tests/queries/0_stateless/00937_template_output_format.sql @@ -3,7 +3,7 @@ CREATE TABLE template (s1 String, s2 String, `s 3` String, "s 4" String, n UInt6 INSERT INTO template VALUES ('qwe,rty', 'as"df''gh', '', 'zx\ncv\tbn m', 123, '2016-01-01'),('as"df''gh', '', 'zx\ncv\tbn m', 'qwe,rty', 456, '2016-01-02'),('', 'zx\ncv\tbn m', 'qwe,rty', 'as"df''gh', 9876543210, '2016-01-03'),('zx\ncv\tbn m', 'qwe,rty', 'as"df''gh', '', 789, '2016-01-04'); -SELECT * FROM template WITH TOTALS LIMIT 4 FORMAT Template SETTINGS +SELECT * FROM template GROUP BY s1, s2, `s 3`, "s 4", n, d WITH TOTALS ORDER BY n LIMIT 4 FORMAT Template SETTINGS extremes = 1, format_schema = '{prefix} \n${data:None}\n------\n${totals:}\n------\n${min}\n------\n${max}\n${rows:Escaped} rows\nbefore limit ${rows_before_limit:XML}\nread ${rows_read:Escaped} $$ suffix $$', format_schema_rows = 'n:\t${n:JSON}, s1:\t${s1:Escaped}, s2:\t${s2:Quoted}, s3:\t${`s 3`:JSON}, s4:\t${"s 4":CSV}, d:\t${d:Escaped}, n:\t${n:Raw}\t', From b64bab29ed1a329f6f7414f742c2a3a94f4bcf77 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Fri, 20 Sep 2019 22:45:25 +0800 Subject: [PATCH 135/309] Outline methods to get needed instantiations. 
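The commit subject hints at the point of the move: with the wrapper methods defined in DataTypeLowCardinality.cpp, next to the serializeImpl / deserializeImpl helpers they call (presumably templates, given the "needed instantiations" wording), the instantiations those wrappers need are emitted in that one translation unit. As a rough, hypothetical illustration of the header/source outlining pattern (the names below are invented for the sketch, not the actual ClickHouse classes):

    // widget.h: only the declaration stays in the header.
    #pragma once
    #include <ostream>

    class Widget
    {
    public:
        void serializeText(std::ostream & out) const;   // outlined: defined in widget.cpp

    private:
        template <typename Writer>
        void writeImpl(std::ostream & out, Writer && writer) const;
    };

    // widget.cpp: the templated helper is defined and implicitly instantiated here, once.
    #include "widget.h"

    template <typename Writer>
    void Widget::writeImpl(std::ostream & out, Writer && writer) const
    {
        writer(out);
    }

    void Widget::serializeText(std::ostream & out) const
    {
        writeImpl(out, [](std::ostream & os) { os << "Widget\n"; });
    }
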
--- dbms/src/DataTypes/DataTypeLowCardinality.cpp | 68 +++++++++++++++ dbms/src/DataTypes/DataTypeLowCardinality.h | 83 ++++--------------- 2 files changed, 82 insertions(+), 69 deletions(-) diff --git a/dbms/src/DataTypes/DataTypeLowCardinality.cpp b/dbms/src/DataTypes/DataTypeLowCardinality.cpp index b0f9d312773..57ff63483c1 100644 --- a/dbms/src/DataTypes/DataTypeLowCardinality.cpp +++ b/dbms/src/DataTypes/DataTypeLowCardinality.cpp @@ -742,6 +742,74 @@ void DataTypeLowCardinality::deserializeBinary(Field & field, ReadBuffer & istr) dictionary_type->deserializeBinary(field, istr); } +void DataTypeLowCardinality::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const +{ + serializeImpl(column, row_num, &IDataType::serializeBinary, ostr); +} +void DataTypeLowCardinality::deserializeBinary(IColumn & column, ReadBuffer & istr) const +{ + deserializeImpl(column, &IDataType::deserializeBinary, istr); +} + +void DataTypeLowCardinality::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &IDataType::serializeAsTextEscaped, ostr, settings); +} + +void DataTypeLowCardinality::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &IDataType::deserializeAsTextEscaped, istr, settings); +} + +void DataTypeLowCardinality::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &IDataType::serializeAsTextQuoted, ostr, settings); +} + +void DataTypeLowCardinality::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &IDataType::deserializeAsTextQuoted, istr, settings); +} + +void DataTypeLowCardinality::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &IDataType::deserializeAsTextEscaped, istr, settings); +} + +void DataTypeLowCardinality::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &IDataType::serializeAsTextCSV, ostr, settings); +} + +void DataTypeLowCardinality::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &IDataType::deserializeAsTextCSV, istr, settings); +} + +void DataTypeLowCardinality::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &IDataType::serializeAsText, ostr, settings); +} + +void DataTypeLowCardinality::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &IDataType::serializeAsTextJSON, ostr, settings); +} +void DataTypeLowCardinality::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeImpl(column, &IDataType::deserializeAsTextJSON, istr, settings); +} + +void DataTypeLowCardinality::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeImpl(column, row_num, &IDataType::serializeAsTextXML, ostr, settings); +} + +void DataTypeLowCardinality::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, 
size_t & value_index) const +{ + serializeImpl(column, row_num, &IDataType::serializeProtobuf, protobuf, value_index); +} + void DataTypeLowCardinality::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const { if (allow_add_row) diff --git a/dbms/src/DataTypes/DataTypeLowCardinality.h b/dbms/src/DataTypes/DataTypeLowCardinality.h index 8e6e12fadba..638af7bb968 100644 --- a/dbms/src/DataTypes/DataTypeLowCardinality.h +++ b/dbms/src/DataTypes/DataTypeLowCardinality.h @@ -51,75 +51,20 @@ public: void serializeBinary(const Field & field, WriteBuffer & ostr) const override; void deserializeBinary(Field & field, ReadBuffer & istr) const override; - - void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override - { - serializeImpl(column, row_num, &IDataType::serializeBinary, ostr); - } - void deserializeBinary(IColumn & column, ReadBuffer & istr) const override - { - deserializeImpl(column, &IDataType::deserializeBinary, istr); - } - - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - serializeImpl(column, row_num, &IDataType::serializeAsTextEscaped, ostr, settings); - } - - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override - { - deserializeImpl(column, &IDataType::deserializeAsTextEscaped, istr, settings); - } - - void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - serializeImpl(column, row_num, &IDataType::serializeAsTextQuoted, ostr, settings); - } - - void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override - { - deserializeImpl(column, &IDataType::deserializeAsTextQuoted, istr, settings); - } - - void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override - { - deserializeImpl(column, &IDataType::deserializeAsTextEscaped, istr, settings); - } - - void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - serializeImpl(column, row_num, &IDataType::serializeAsTextCSV, ostr, settings); - } - - void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override - { - deserializeImpl(column, &IDataType::deserializeAsTextCSV, istr, settings); - } - - void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - serializeImpl(column, row_num, &IDataType::serializeAsText, ostr, settings); - } - - void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - serializeImpl(column, row_num, &IDataType::serializeAsTextJSON, ostr, settings); - } - void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override - { - deserializeImpl(column, &IDataType::deserializeAsTextJSON, istr, settings); - } - - void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override - { - serializeImpl(column, row_num, &IDataType::serializeAsTextXML, ostr, settings); - } - - void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override - { - serializeImpl(column, 
row_num, &IDataType::serializeProtobuf, protobuf, value_index); - } - + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override; void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override; MutableColumnPtr createColumn() const override; From 7726130303494380f6b28db6c1f87a7fa0c54578 Mon Sep 17 00:00:00 2001 From: chertus Date: Fri, 20 Sep 2019 19:01:19 +0300 Subject: [PATCH 136/309] fix multiple joins aliasing for order by and group by --- .../JoinToSubqueryTransformVisitor.cpp | 21 ++++++----- .../00847_multiple_join_same_column.sql | 28 +++++++++++---- .../00882_multiple_join_no_alias.reference | 8 +++++ .../00882_multiple_join_no_alias.sql | 35 +++++++++++++++++++ 4 files changed, 77 insertions(+), 15 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/00882_multiple_join_no_alias.reference create mode 100644 dbms/tests/queries/0_stateless/00882_multiple_join_no_alias.sql diff --git a/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp b/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp index c6e72b4d252..b60e6533921 100644 --- a/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp +++ b/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp @@ -194,14 +194,14 @@ struct ColumnAliasesMatcher } }; - static bool needChildVisit(ASTPtr & node, const ASTPtr &) + static bool needChildVisit(const ASTPtr & node, const ASTPtr &) { if (node->as()) return false; return true; } - static void visit(ASTPtr & ast, Data & data) + static void visit(const ASTPtr & ast, Data & data) { if (auto * t = ast->as()) visit(*t, ast, data); @@ -210,8 +210,9 @@ struct ColumnAliasesMatcher throw Exception("Multiple JOIN do not support asterisks for complex queries yet", ErrorCodes::NOT_IMPLEMENTED); } - static void visit(ASTIdentifier & node, ASTPtr &, Data & data) + static void visit(const ASTIdentifier & const_node, const ASTPtr &, 
Data & data) { + ASTIdentifier & node = const_cast(const_node); /// we know it's not const if (node.isShort()) return; @@ -375,7 +376,7 @@ using RewriteVisitor = InDepthNodeVisitor; using SetSubqueryAliasMatcher = OneTypeMatcher; using SetSubqueryAliasVisitor = InDepthNodeVisitor; using ExtractAsterisksVisitor = ExtractAsterisksMatcher::Visitor; -using ColumnAliasesVisitor = InDepthNodeVisitor; +using ColumnAliasesVisitor = ConstInDepthNodeVisitor; using AppendSemanticMatcher = OneTypeMatcher; using AppendSemanticVisitor = InDepthNodeVisitor; @@ -403,15 +404,19 @@ void JoinToSubqueryTransformMatcher::visit(ASTSelectQuery & select, ASTPtr & ast if (select.select()) { aliases_data.public_names = true; - ColumnAliasesVisitor(aliases_data).visit(select.refSelect()); + ColumnAliasesVisitor(aliases_data).visit(select.select()); aliases_data.public_names = false; } if (select.where()) - ColumnAliasesVisitor(aliases_data).visit(select.refWhere()); + ColumnAliasesVisitor(aliases_data).visit(select.where()); if (select.prewhere()) - ColumnAliasesVisitor(aliases_data).visit(select.refPrewhere()); + ColumnAliasesVisitor(aliases_data).visit(select.prewhere()); + if (select.orderBy()) + ColumnAliasesVisitor(aliases_data).visit(select.orderBy()); + if (select.groupBy()) + ColumnAliasesVisitor(aliases_data).visit(select.groupBy()); if (select.having()) - ColumnAliasesVisitor(aliases_data).visit(select.refHaving()); + ColumnAliasesVisitor(aliases_data).visit(select.having()); /// JOIN sections for (auto & child : select.tables()->children) diff --git a/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.sql b/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.sql index d444655a6ce..44b3fe202d3 100644 --- a/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.sql +++ b/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.sql @@ -16,30 +16,44 @@ left join y on (y.a = s.a and y.b = s.b) format Vertical; select t.a, s.b, s.a, s.b, y.a, y.b from t left join s on (t.a = s.a and s.b = t.b) -left join y on (y.a = s.a and y.b = s.b) format PrettyCompactNoEscapes; +left join y on (y.a = s.a and y.b = s.b) +order by t.a +format PrettyCompactNoEscapes; select t.a as t_a from t -left join s on s.a = t_a format PrettyCompactNoEscapes; +left join s on s.a = t_a +order by t.a +format PrettyCompactNoEscapes; select t.a, s.a as s_a from t left join s on s.a = t.a -left join y on y.b = s.b format PrettyCompactNoEscapes; +left join y on y.b = s.b +order by t.a +format PrettyCompactNoEscapes; select t.a, t.a, t.b as t_b from t left join s on t.a = s.a -left join y on y.b = s.b format PrettyCompactNoEscapes; +left join y on y.b = s.b +order by t.a +format PrettyCompactNoEscapes; select s.a, s.a, s.b as s_b, s.b from t left join s on s.a = t.a -left join y on s.b = y.b format PrettyCompactNoEscapes; +left join y on s.b = y.b +order by t.a +format PrettyCompactNoEscapes; select y.a, y.a, y.b as y_b, y.b from t left join s on s.a = t.a -left join y on y.b = s.b format PrettyCompactNoEscapes; +left join y on y.b = s.b +order by t.a +format PrettyCompactNoEscapes; select t.a, t.a as t_a, s.a, s.a as s_a, y.a, y.a as y_a from t left join s on t.a = s.a -left join y on y.b = s.b format PrettyCompactNoEscapes; +left join y on y.b = s.b +order by t.a +format PrettyCompactNoEscapes; drop table t; drop table s; diff --git a/dbms/tests/queries/0_stateless/00882_multiple_join_no_alias.reference b/dbms/tests/queries/0_stateless/00882_multiple_join_no_alias.reference new file mode 100644 index 
00000000000..a3723bc9976 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00882_multiple_join_no_alias.reference @@ -0,0 +1,8 @@ +1 1 1 1 +0 0 0 0 +0 +1 +1 1 1 1 1 1 +2 2 0 0 0 0 +2 2 0 +1 1 1 diff --git a/dbms/tests/queries/0_stateless/00882_multiple_join_no_alias.sql b/dbms/tests/queries/0_stateless/00882_multiple_join_no_alias.sql new file mode 100644 index 00000000000..bd3a2a19913 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00882_multiple_join_no_alias.sql @@ -0,0 +1,35 @@ +drop table if exists t; +drop table if exists s; +drop table if exists y; + +create table t(a Int64, b Int64) engine = Memory; +create table s(a Int64, b Int64) engine = Memory; +create table y(a Int64, b Int64) engine = Memory; + +insert into t values (1,1), (2,2); +insert into s values (1,1); +insert into y values (1,1); + +select s.a, s.a, s.b as s_b, s.b from t +left join s on s.a = t.a +left join y on s.b = y.b +order by t.a; + +select max(s.a) from t +left join s on s.a = t.a +left join y on s.b = y.b +group by t.a; + +select t.a, t.a as t_a, s.a, s.a as s_a, y.a, y.a as y_a from t +left join s on t.a = s.a +left join y on y.b = s.b +order by t.a; + +select t.a, t.a as t_a, max(s.a) from t +left join s on t.a = s.a +left join y on y.b = s.b +group by t.a; + +drop table t; +drop table s; +drop table y; From 88adbb72d9a0de4724234c5e997927621c3a0975 Mon Sep 17 00:00:00 2001 From: millb Date: Fri, 20 Sep 2019 19:21:05 +0300 Subject: [PATCH 137/309] Fixed Function Hex for Float32 and Float64 --- dbms/src/Functions/FunctionsCoding.h | 53 ++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/dbms/src/Functions/FunctionsCoding.h b/dbms/src/Functions/FunctionsCoding.h index ed34863765d..adaaa875a71 100644 --- a/dbms/src/Functions/FunctionsCoding.h +++ b/dbms/src/Functions/FunctionsCoding.h @@ -948,7 +948,7 @@ public: if (!which.isStringOrFixedString() && !which.isDateOrDateTime() - && !which.isUInt()) + && !which.isUInt() && !which.isFloat()) throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); @@ -1021,6 +1021,53 @@ public: } } + template + bool tryExecuteFloat(const IColumn * col, ColumnPtr & col_res) + { + const ColumnVector * col_vec = checkAndGetColumn>(col); + + static constexpr size_t MAX_FLOAT_HEX_LENGTH = sizeof(T) * 2 + 1; /// Including trailing zero byte. + + if (col_vec) + { + auto col_str = ColumnString::create(); + ColumnString::Chars & out_vec = col_str->getChars(); + ColumnString::Offsets & out_offsets = col_str->getOffsets(); + + const typename ColumnVector::Container & in_vec = col_vec->getData(); + + size_t size = in_vec.size(); + out_offsets.resize(size); + out_vec.resize(size * 3 + MAX_FLOAT_HEX_LENGTH); /// 3 is length of one byte in hex plus zero byte. + + size_t pos = 0; + for (size_t i = 0; i < size; ++i) + { + /// Manual exponential growth, so as not to rely on the linear amortized work time of `resize` (no one guarantees it). 
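+                /// The float's bytes are hex-encoded in their in-memory (little-endian) order,
+                /// e.g. hex(1.0) for a Float64 gives 000000000000F03F rather than 3FF0000000000000.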
+ if (pos + MAX_FLOAT_HEX_LENGTH > out_vec.size()) + out_vec.resize(out_vec.size() * 2 + MAX_FLOAT_HEX_LENGTH); + + char * begin = reinterpret_cast(&out_vec[pos]); + char * end = begin; + + const UInt8 * in_pos = reinterpret_cast(&in_vec[i]); + executeOneString(in_pos, in_pos + sizeof(in_vec[i]), end); + + pos += end - begin; + out_offsets[i] = pos; + } + + out_vec.resize(pos); + + col_res = std::move(col_str); + return true; + } + else + { + return false; + } + } + void executeOneString(const UInt8 * pos, const UInt8 * end, char *& out) { while (pos < end) @@ -1135,7 +1182,9 @@ public: tryExecuteUInt(column, res_column) || tryExecuteUInt(column, res_column) || tryExecuteString(column, res_column) || - tryExecuteFixedString(column, res_column)) + tryExecuteFixedString(column, res_column) || + tryExecuteFloat(column, res_column) || + tryExecuteFloat(column, res_column)) return; throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName() From 4f24512ba4e4cda82eedf4a3a08df562bb682f17 Mon Sep 17 00:00:00 2001 From: millb Date: Fri, 20 Sep 2019 19:40:06 +0300 Subject: [PATCH 138/309] Test added. --- dbms/tests/queries/0_stateless/01013_hex_float.reference | 7 +++++++ dbms/tests/queries/0_stateless/01013_hex_float.sql | 7 +++++++ 2 files changed, 14 insertions(+) create mode 100644 dbms/tests/queries/0_stateless/01013_hex_float.reference create mode 100644 dbms/tests/queries/0_stateless/01013_hex_float.sql diff --git a/dbms/tests/queries/0_stateless/01013_hex_float.reference b/dbms/tests/queries/0_stateless/01013_hex_float.reference new file mode 100644 index 00000000000..ac428aa6bea --- /dev/null +++ b/dbms/tests/queries/0_stateless/01013_hex_float.reference @@ -0,0 +1,7 @@ +000000000000F03F +0000000000405940 +00C84E676DC1AB43 +2342920CA19CC73B +7DC39425AD49B254 +2C616D8C9DF0423F +BA490C022BFF5EC0 diff --git a/dbms/tests/queries/0_stateless/01013_hex_float.sql b/dbms/tests/queries/0_stateless/01013_hex_float.sql new file mode 100644 index 00000000000..e6da504657f --- /dev/null +++ b/dbms/tests/queries/0_stateless/01013_hex_float.sql @@ -0,0 +1,7 @@ +SELECT hex(1.0); +SELECT hex(101.); +SELECT hex(1e+18); +SELECT hex(1e-20); +SELECT hex(1e+100); +SELECT hex(0.000578); +SELECt hex(-123.987); From 967e00a8da20256c70644424a9dd83389c8728b6 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Fri, 20 Sep 2019 19:50:13 +0300 Subject: [PATCH 139/309] It works! if compiled with: CC=clang CXX=clang++ cmake -Wno-dev . 
-Bbuild -GNinja -DCMAKE_AR:FILEPATH=x86_64-apple-darwin-ar -DCMAKE_RANLIB:FILEPATH=x86_64-apple-darwin-ranlib -DCMAKE_SYSTEM_NAME=Darwin -DSDK_PATH=MacOSX10.14.sdk -DLINKER_NAME=x86_64-apple-darwin-ld -DUSE_SNAPPY=OFF -DENABLE_SSL=OFF -DENABLE_PROTOBUF=OFF -DENABLE_PARQUET=OFF -DENABLE_READLINE=OFF -DENABLE_ICU=OFF -DENABLE_FASTOPS=OFF --- contrib/zlib-ng | 2 +- dbms/src/Common/ZooKeeper/ZooKeeper.cpp | 4 +++- dbms/src/Core/MySQLProtocol.cpp | 8 +++++++- dbms/src/Core/MySQLProtocol.h | 6 ++++++ dbms/src/Formats/FormatFactory.cpp | 4 ++++ dbms/src/Functions/registerFunctionsIntrospection.cpp | 6 ++++++ dbms/src/Processors/Formats/Impl/MySQLOutputFormat.cpp | 4 ++++ dbms/src/Processors/Formats/Impl/MySQLOutputFormat.h | 6 ++++++ 8 files changed, 37 insertions(+), 3 deletions(-) diff --git a/contrib/zlib-ng b/contrib/zlib-ng index cb43e7fa08e..cff0f500d93 160000 --- a/contrib/zlib-ng +++ b/contrib/zlib-ng @@ -1 +1 @@ -Subproject commit cb43e7fa08ec29fd76d84e3bb35258a0f0bf3df3 +Subproject commit cff0f500d9399d7cd3b9461a693d211e4b86fcc9 diff --git a/dbms/src/Common/ZooKeeper/ZooKeeper.cpp b/dbms/src/Common/ZooKeeper/ZooKeeper.cpp index f60085195ed..ae5b548f7e5 100644 --- a/dbms/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/dbms/src/Common/ZooKeeper/ZooKeeper.cpp @@ -158,7 +158,9 @@ struct ZooKeeperArgs } /// Shuffle the hosts to distribute the load among ZooKeeper nodes. - std::shuffle(hosts_strings.begin(), hosts_strings.end(), thread_local_rng); + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(hosts_strings.begin(), hosts_strings.end(), g); for (auto & host : hosts_strings) { diff --git a/dbms/src/Core/MySQLProtocol.cpp b/dbms/src/Core/MySQLProtocol.cpp index a133fc64e24..21e29cf8e6a 100644 --- a/dbms/src/Core/MySQLProtocol.cpp +++ b/dbms/src/Core/MySQLProtocol.cpp @@ -1,10 +1,14 @@ +#include "MySQLProtocol.h" + +#if USE_SSL + #include #include #include #include + #include #include -#include "MySQLProtocol.h" namespace DB::MySQLProtocol @@ -100,3 +104,5 @@ size_t getLengthEncodedStringSize(const String & s) } } + +#endif // USE_SSL diff --git a/dbms/src/Core/MySQLProtocol.h b/dbms/src/Core/MySQLProtocol.h index 029d7ded18a..2ac255cca34 100644 --- a/dbms/src/Core/MySQLProtocol.h +++ b/dbms/src/Core/MySQLProtocol.h @@ -1,5 +1,9 @@ #pragma once +#include "config_core.h" + +#if USE_SSL + #include #include #include @@ -1075,3 +1079,5 @@ private: } } + +#endif // USE_SSL diff --git a/dbms/src/Formats/FormatFactory.cpp b/dbms/src/Formats/FormatFactory.cpp index d26f6c64f33..446f23b77dc 100644 --- a/dbms/src/Formats/FormatFactory.cpp +++ b/dbms/src/Formats/FormatFactory.cpp @@ -264,7 +264,9 @@ void registerOutputFormatProcessorXML(FormatFactory & factory); void registerOutputFormatProcessorODBCDriver(FormatFactory & factory); void registerOutputFormatProcessorODBCDriver2(FormatFactory & factory); void registerOutputFormatProcessorNull(FormatFactory & factory); +#if USE_SSL void registerOutputFormatProcessorMySQLWrite(FormatFactory & factory); +#endif /// Input only formats. 
void registerInputFormatProcessorCapnProto(FormatFactory & factory); @@ -312,7 +314,9 @@ FormatFactory::FormatFactory() registerOutputFormatProcessorODBCDriver(*this); registerOutputFormatProcessorODBCDriver2(*this); registerOutputFormatProcessorNull(*this); +#if USE_SSL registerOutputFormatProcessorMySQLWrite(*this); +#endif } FormatFactory & FormatFactory::instance() diff --git a/dbms/src/Functions/registerFunctionsIntrospection.cpp b/dbms/src/Functions/registerFunctionsIntrospection.cpp index 700a568d822..187f32e5319 100644 --- a/dbms/src/Functions/registerFunctionsIntrospection.cpp +++ b/dbms/src/Functions/registerFunctionsIntrospection.cpp @@ -1,3 +1,5 @@ +#include + namespace DB { @@ -10,10 +12,14 @@ void registerFunctionTrap(FunctionFactory & factory); void registerFunctionsIntrospection(FunctionFactory & factory) { +#if defined (OS_LINUX) registerFunctionAddressToSymbol(factory); registerFunctionDemangle(factory); registerFunctionAddressToLine(factory); registerFunctionTrap(factory); +#else + UNUSED(factory); +#endif } } diff --git a/dbms/src/Processors/Formats/Impl/MySQLOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/MySQLOutputFormat.cpp index f7ba96a63bd..bf5a0324690 100644 --- a/dbms/src/Processors/Formats/Impl/MySQLOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/MySQLOutputFormat.cpp @@ -1,5 +1,7 @@ #include +#if USE_SSL + #include #include #include @@ -116,3 +118,5 @@ void registerOutputFormatProcessorMySQLWrite(FormatFactory & factory) } } + +#endif // USE_SSL diff --git a/dbms/src/Processors/Formats/Impl/MySQLOutputFormat.h b/dbms/src/Processors/Formats/Impl/MySQLOutputFormat.h index 39d04818dee..5793c044fed 100644 --- a/dbms/src/Processors/Formats/Impl/MySQLOutputFormat.h +++ b/dbms/src/Processors/Formats/Impl/MySQLOutputFormat.h @@ -1,5 +1,9 @@ #pragma once +#include "config_core.h" + +#if USE_SSL + #include #include @@ -40,3 +44,5 @@ private: }; } + +#endif From 8cb0e58bc7624915eea4cef014c035f5c325ee6e Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Fri, 20 Sep 2019 20:34:59 +0300 Subject: [PATCH 140/309] Fix build for all targets --- dbms/programs/odbc-bridge/CMakeLists.txt | 8 +++++--- dbms/src/Common/tests/symbol_index.cpp | 4 ++-- dbms/src/Interpreters/tests/hash_map_string_3.cpp | 12 ++++++------ .../CMakeLists.txt | 1 + .../zookeeper-adjust-block-numbers-to-parts/main.cpp | 2 +- 5 files changed, 15 insertions(+), 12 deletions(-) diff --git a/dbms/programs/odbc-bridge/CMakeLists.txt b/dbms/programs/odbc-bridge/CMakeLists.txt index 73574f8dc2e..d32c87033ff 100644 --- a/dbms/programs/odbc-bridge/CMakeLists.txt +++ b/dbms/programs/odbc-bridge/CMakeLists.txt @@ -33,9 +33,11 @@ endif () clickhouse_program_add_library(odbc-bridge) -# clickhouse-odbc-bridge is always a separate binary. -# Reason: it must not export symbols from SSL, mariadb-client, etc. to not break ABI compatibility with ODBC drivers. -set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--no-export-dynamic") +if (OS_LINUX) + # clickhouse-odbc-bridge is always a separate binary. + # Reason: it must not export symbols from SSL, mariadb-client, etc. to not break ABI compatibility with ODBC drivers. + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--no-export-dynamic") +endif () add_executable(clickhouse-odbc-bridge odbc-bridge.cpp) set_target_properties(clickhouse-odbc-bridge PROPERTIES RUNTIME_OUTPUT_DIRECTORY ..) 
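The hunks below also tighten the __SSE4_1__ guards in hash_map_string_3.cpp to __SSE4_2__; the guarded hashes include ones built on the CRC32 intrinsics, which belong to SSE4.2 rather than SSE4.1, which is presumably why the guard is changed. A minimal, self-contained sketch of that guard-with-fallback pattern (the function name and the FNV-1a fallback are illustrative, not the actual benchmark code):

    #include <cstddef>

    #ifdef __SSE4_2__
    #include <nmmintrin.h>

    /// Hash based on the SSE4.2 crc32 instruction (CRC32C polynomial).
    inline unsigned hashBytes(const char * data, size_t size)
    {
        unsigned crc = 0xFFFFFFFFU;
        for (size_t i = 0; i < size; ++i)
            crc = _mm_crc32_u8(crc, static_cast<unsigned char>(data[i]));
        return crc;
    }
    #else
    /// Portable fallback (32-bit FNV-1a) for targets without SSE4.2.
    inline unsigned hashBytes(const char * data, size_t size)
    {
        unsigned hash = 0x811C9DC5U;
        for (size_t i = 0; i < size; ++i)
        {
            hash ^= static_cast<unsigned char>(data[i]);
            hash *= 0x01000193U;
        }
        return hash;
    }
    #endif
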
diff --git a/dbms/src/Common/tests/symbol_index.cpp b/dbms/src/Common/tests/symbol_index.cpp index dde5ce185ae..baab1de1fe1 100644 --- a/dbms/src/Common/tests/symbol_index.cpp +++ b/dbms/src/Common/tests/symbol_index.cpp @@ -12,11 +12,11 @@ NO_INLINE const void * getAddress() return __builtin_return_address(0); } -using namespace DB; - int main(int argc, char ** argv) { #ifdef __ELF__ + using namespace DB; + if (argc < 2) { std::cerr << "Usage: ./symbol_index address\n"; diff --git a/dbms/src/Interpreters/tests/hash_map_string_3.cpp b/dbms/src/Interpreters/tests/hash_map_string_3.cpp index 850a9268c5d..2309a29c531 100644 --- a/dbms/src/Interpreters/tests/hash_map_string_3.cpp +++ b/dbms/src/Interpreters/tests/hash_map_string_3.cpp @@ -18,7 +18,7 @@ #include #include -#ifdef __SSE4_1__ +#ifdef __SSE4_2__ #include #endif @@ -164,7 +164,7 @@ struct FNV1a }; -#ifdef __SSE4_1__ +#ifdef __SSE4_2__ struct CrapWow { @@ -254,7 +254,7 @@ struct SimpleHash if (size < 8) { -#ifdef __SSE4_1__ +#ifdef __SSE4_2__ return hashLessThan8(x.data, x.size); #endif } @@ -291,7 +291,7 @@ struct VerySimpleHash if (size < 8) { -#ifdef __SSE4_1__ +#ifdef __SSE4_2__ return hashLessThan8(x.data, x.size); #endif } @@ -342,7 +342,7 @@ struct SMetroHash64 }; -#ifdef __SSE4_1__ +#ifdef __SSE4_2__ /*struct CRC32Hash { @@ -499,7 +499,7 @@ int main(int argc, char ** argv) if (!m || m == 3) bench (data, "StringRef_SimpleHash"); if (!m || m == 4) bench (data, "StringRef_FNV1a"); -#ifdef __SSE4_1__ +#ifdef __SSE4_2__ if (!m || m == 5) bench (data, "StringRef_CrapWow"); if (!m || m == 6) bench (data, "StringRef_CRC32Hash"); if (!m || m == 7) bench (data, "StringRef_CRC32ILPHash"); diff --git a/utils/zookeeper-adjust-block-numbers-to-parts/CMakeLists.txt b/utils/zookeeper-adjust-block-numbers-to-parts/CMakeLists.txt index d2357ec755d..2fdd87a4412 100644 --- a/utils/zookeeper-adjust-block-numbers-to-parts/CMakeLists.txt +++ b/utils/zookeeper-adjust-block-numbers-to-parts/CMakeLists.txt @@ -1,2 +1,3 @@ add_executable (zookeeper-adjust-block-numbers-to-parts main.cpp ${SRCS}) +target_compile_options(zookeeper-adjust-block-numbers-to-parts PRIVATE -Wno-format) target_link_libraries (zookeeper-adjust-block-numbers-to-parts PRIVATE dbms clickhouse_common_zookeeper ${Boost_PROGRAM_OPTIONS_LIBRARY}) diff --git a/utils/zookeeper-adjust-block-numbers-to-parts/main.cpp b/utils/zookeeper-adjust-block-numbers-to-parts/main.cpp index 3e449043adc..02aa2ddbcda 100644 --- a/utils/zookeeper-adjust-block-numbers-to-parts/main.cpp +++ b/utils/zookeeper-adjust-block-numbers-to-parts/main.cpp @@ -179,7 +179,7 @@ void setCurrentBlockNumber(zkutil::ZooKeeper & zk, const std::string & path, Int if (number != current_block_number) { char suffix[11] = ""; - sprintf(suffix, "%010ld", current_block_number); + sprintf(suffix, "%010lld", current_block_number); std::string expected_path = block_prefix + suffix; std::cerr << "\t" << path_created << ": Ephemeral node has been created with an unexpected path (expected something like " << expected_path << ")." 
<< std::endl; From d0ef26e1b832e1a2450c2702e5be5afc9c1e78d1 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 20 Sep 2019 22:57:09 +0300 Subject: [PATCH 141/309] Better shared binary build --- docker/packager/binary/Dockerfile | 3 ++- docker/packager/binary/build.sh | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index 78f113d94b4..82fa93ec570 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -56,7 +56,8 @@ RUN apt-get update -y \ tzdata \ gperf \ cmake \ - gdb + gdb \ + rename COPY build.sh / CMD ["/bin/bash", "/build.sh"] diff --git a/docker/packager/binary/build.sh b/docker/packager/binary/build.sh index c5b2e260e45..9d994af0986 100755 --- a/docker/packager/binary/build.sh +++ b/docker/packager/binary/build.sh @@ -14,3 +14,6 @@ mv ./dbms/programs/clickhouse* /output mv ./dbms/unit_tests_dbms /output find . -name '*.so' -print -exec mv '{}' /output \; find . -name '*.so.*' -print -exec mv '{}' /output \; +tar -czvf shared_build.tgz /output +rm -r /output/* +mv shared_build.tgz /output From f2f9f585e42557c7b9e41fc182692bd44176d79f Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 20 Sep 2019 23:15:42 +0300 Subject: [PATCH 142/309] Better check in shared binary build --- docker/packager/binary/build.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/docker/packager/binary/build.sh b/docker/packager/binary/build.sh index 9d994af0986..ed30feb8cb7 100755 --- a/docker/packager/binary/build.sh +++ b/docker/packager/binary/build.sh @@ -14,6 +14,11 @@ mv ./dbms/programs/clickhouse* /output mv ./dbms/unit_tests_dbms /output find . -name '*.so' -print -exec mv '{}' /output \; find . -name '*.so.*' -print -exec mv '{}' /output \; -tar -czvf shared_build.tgz /output -rm -r /output/* -mv shared_build.tgz /output + +count=`ls -1 /output/*.so 2>/dev/null | wc -l` +if [ $count != 0 ] +then + tar -czvf shared_build.tgz /output + rm -r /output/* + mv shared_build.tgz /output +fi From b1f19a50d4eee10386a8db78cdca57a808555618 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 20 Sep 2019 23:16:55 +0300 Subject: [PATCH 143/309] Update CHANGELOG.md --- CHANGELOG.md | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 11ca2fd1301..c385831af85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -72,7 +72,6 @@ * Fix for skip indices with vertical merge and alter. Fix for `Bad size of marks file` exception. [#6594](https://github.com/yandex/ClickHouse/issues/6594) [#6713](https://github.com/yandex/ClickHouse/pull/6713) ([alesapin](https://github.com/alesapin)) * Fix rare crash in `ALTER MODIFY COLUMN` and vertical merge when one of merged/altered parts is empty (0 rows) [#6746](https://github.com/yandex/ClickHouse/issues/6746) [#6780](https://github.com/yandex/ClickHouse/pull/6780) ([alesapin](https://github.com/alesapin)) * Fixed bug in conversion of `LowCardinality` types in `AggregateFunctionFactory`. This fixes [#6257](https://github.com/yandex/ClickHouse/issues/6257). [#6281](https://github.com/yandex/ClickHouse/pull/6281) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fixed possible data loss after `ALTER DELETE` query on table with skipping index. 
[#6224](https://github.com/yandex/ClickHouse/issues/6224) [#6282](https://github.com/yandex/ClickHouse/pull/6282) ([Nikita Vasilev](https://github.com/nikvas0)) * Fix wrong behavior and possible segfaults in `topK` and `topKWeighted` aggregated functions. [#6404](https://github.com/yandex/ClickHouse/pull/6404) ([Anton Popov](https://github.com/CurtizJ)) * Fixed unsafe code around `getIdentifier` function. [#6401](https://github.com/yandex/ClickHouse/issues/6401) [#6409](https://github.com/yandex/ClickHouse/pull/6409) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Fixed bug in MySQL wire protocol (is used while connecting to ClickHouse form MySQL client). Caused by heap buffer overflow in `PacketPayloadWriteBuffer`. [#6212](https://github.com/yandex/ClickHouse/pull/6212) ([Yuriy Baranov](https://github.com/yurriy)) @@ -116,7 +115,6 @@ ### Security Fix * This release also contains all bug security fixes from 19.13 and 19.11. -* If the attacker has write access to ZooKeeper and is able to run custom server available from the network where ClickHouse run, it can create custom-built malicious server that will act as ClickHouse replica and register it in ZooKeeper. When another replica will fetch data part from malicious replica, it can force clickhouse-server to write to arbitrary path on filesystem. Found by Eldar Zaitov, information security team at Yandex. [#6247](https://github.com/yandex/ClickHouse/pull/6247) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Fixed the possibility of a fabricated query to cause server crash due to stack overflow in SQL parser. Fixed the possibility of stack overflow in Merge and Distributed tables, materialized views and conditions for row-level security that involve subqueries. [#6433](https://github.com/yandex/ClickHouse/pull/6433) ([alexey-milovidov](https://github.com/alexey-milovidov)) ### Improvement @@ -272,32 +270,46 @@ * Removed rarely used table function `catBoostPool` and storage `CatBoostPool`. If you have used this table function, please write email to `clickhouse-feedback@yandex-team.com`. Note that CatBoost integration remains and will be supported. [#6279](https://github.com/yandex/ClickHouse/pull/6279) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Disable `ANY RIGHT JOIN` and `ANY FULL JOIN` by default. Set `any_join_get_any_from_right_table` setting to enable them. [#5126](https://github.com/yandex/ClickHouse/issues/5126) [#6351](https://github.com/yandex/ClickHouse/pull/6351) ([Artem Zuikov](https://github.com/4ertus2)) +## ClickHouse release 19.11.11.57, 2019-09-13 +* Fix logical error causing segfaults when selecting from Kafka empty topic. [#6902](https://github.com/yandex/ClickHouse/issues/6902) [#6909](https://github.com/yandex/ClickHouse/pull/6909) ([Ivan](https://github.com/abyss7)) +* Fix for function `АrrayEnumerateUniqRanked` with empty arrays in params. [#6928](https://github.com/yandex/ClickHouse/pull/6928) ([proller](https://github.com/proller)) + ## ClickHouse release 19.13.4.32, 2019-09-10 ### Bug Fix -* Fixed an issue when long `ALTER UPDATE` or `ALTER DELETE` may prevent regular merges to run. Prevent mutations from executing if there is no enough free threads available. [#6502](https://github.com/yandex/ClickHouse/issues/6502) [#6617](https://github.com/yandex/ClickHouse/pull/6617) ([tavplubix](https://github.com/tavplubix)) +* This release also contains all bug security fixes from 19.11.9.52 and 19.11.10.54. * Fixed data race in `system.parts` table and `ALTER` query. 
[#6245](https://github.com/yandex/ClickHouse/issues/6245) [#6513](https://github.com/yandex/ClickHouse/pull/6513) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix Kafka messages duplication problem on normal server restart. [#6597](https://github.com/yandex/ClickHouse/pull/6597) ([Ivan](https://github.com/abyss7)) -* Do store offsets for Kafka messages manually to be able to commit them all at once for all partitions. Fixes potential duplication in "one consumer - many partitions" scenario. [#6872](https://github.com/yandex/ClickHouse/pull/6872) ([Ivan](https://github.com/abyss7)) * Fixed mismatched header in streams happened in case of reading from empty distributed table with sample and prewhere. [#6167](https://github.com/yandex/ClickHouse/issues/6167) ([Lixiang Qian](https://github.com/fancyqlx)) [#6823](https://github.com/yandex/ClickHouse/pull/6823) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) * Fixed crash when using `IN` clause with a subquery with a tuple. [#6125](https://github.com/yandex/ClickHouse/issues/6125) [#6550](https://github.com/yandex/ClickHouse/pull/6550) ([tavplubix](https://github.com/tavplubix)) -* Fixed bug in function `arrayEnumerateUniqRanked`. [#6779](https://github.com/yandex/ClickHouse/pull/6779) ([proller](https://github.com/proller)) -* Fix `JSONExtract` function while extracting a `Tuple` from JSON. [#6718](https://github.com/yandex/ClickHouse/pull/6718) ([Vitaly Baranov](https://github.com/vitlibar)) * Fix case with same column names in `GLOBAL JOIN ON` section. [#6181](https://github.com/yandex/ClickHouse/pull/6181) ([Artem Zuikov](https://github.com/4ertus2)) * Fix crash when casting types to `Decimal` that do not support it. Throw exception instead. [#6297](https://github.com/yandex/ClickHouse/pull/6297) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed wrong behaviour of `nullIf` function for constant arguments. [#6518](https://github.com/yandex/ClickHouse/pull/6518) ([Guillaume Tassery](https://github.com/YiuRULE)) [#6580](https://github.com/yandex/ClickHouse/pull/6580) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Fixed crash in `extractAll()` function. [#6644](https://github.com/yandex/ClickHouse/pull/6644) ([Artem Zuikov](https://github.com/4ertus2)) * Query transformation for `MySQL`, `ODBC`, `JDBC` table functions now works properly for `SELECT WHERE` queries with multiple `AND` expressions. [#6381](https://github.com/yandex/ClickHouse/issues/6381) [#6676](https://github.com/yandex/ClickHouse/pull/6676) ([dimarub2000](https://github.com/dimarub2000)) * Added previous declaration checks for MySQL 8 integration. [#6569](https://github.com/yandex/ClickHouse/pull/6569) ([Rafael David Tinoco](https://github.com/rafaeldtinoco)) -* Parquet: Fix reading boolean columns. [#6579](https://github.com/yandex/ClickHouse/pull/6579) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed error with processing "timezone" in server configuration file. [#6709](https://github.com/yandex/ClickHouse/pull/6709) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Improve error handling in cache dictionaries. [#6737](https://github.com/yandex/ClickHouse/pull/6737) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix kafka tests. [#6805](https://github.com/yandex/ClickHouse/pull/6805) ([Ivan](https://github.com/abyss7)) -* Fixed performance test. 
[#6392](https://github.com/yandex/ClickHouse/pull/6392) ([alexey-milovidov](https://github.com/alexey-milovidov)) ### Security Fix * Fix two vulnerabilities in codecs in decompression phase (malicious user can fabricate compressed data that will lead to buffer overflow in decompression). [#6670](https://github.com/yandex/ClickHouse/pull/6670) ([Artem Zuikov](https://github.com/4ertus2)) +## ClickHouse release 19.11.10.54, 2019-09-10 + +### Bug Fix +* Do store offsets for Kafka messages manually to be able to commit them all at once for all partitions. Fixes potential duplication in "one consumer - many partitions" scenario. [#6872](https://github.com/yandex/ClickHouse/pull/6872) ([Ivan](https://github.com/abyss7)) + +## ClickHouse release 19.11.9.52, 2019-09-6 +* Improve error handling in cache dictionaries. [#6737](https://github.com/yandex/ClickHouse/pull/6737) ([Vitaly Baranov](https://github.com/vitlibar)) +* Fixed bug in function `arrayEnumerateUniqRanked`. [#6779](https://github.com/yandex/ClickHouse/pull/6779) ([proller](https://github.com/proller)) +* Fix `JSONExtract` function while extracting a `Tuple` from JSON. [#6718](https://github.com/yandex/ClickHouse/pull/6718) ([Vitaly Baranov](https://github.com/vitlibar)) +* Fixed possible data loss after `ALTER DELETE` query on table with skipping index. [#6224](https://github.com/yandex/ClickHouse/issues/6224) [#6282](https://github.com/yandex/ClickHouse/pull/6282) ([Nikita Vasilev](https://github.com/nikvas0)) +* Fixed performance test. [#6392](https://github.com/yandex/ClickHouse/pull/6392) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Parquet: Fix reading boolean columns. [#6579](https://github.com/yandex/ClickHouse/pull/6579) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed wrong behaviour of `nullIf` function for constant arguments. [#6518](https://github.com/yandex/ClickHouse/pull/6518) ([Guillaume Tassery](https://github.com/YiuRULE)) [#6580](https://github.com/yandex/ClickHouse/pull/6580) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix Kafka messages duplication problem on normal server restart. [#6597](https://github.com/yandex/ClickHouse/pull/6597) ([Ivan](https://github.com/abyss7)) +* Fixed an issue when long `ALTER UPDATE` or `ALTER DELETE` may prevent regular merges to run. Prevent mutations from executing if there is no enough free threads available. [#6502](https://github.com/yandex/ClickHouse/issues/6502) [#6617](https://github.com/yandex/ClickHouse/pull/6617) ([tavplubix](https://github.com/tavplubix)) +* Fixed error with processing "timezone" in server configuration file. [#6709](https://github.com/yandex/ClickHouse/pull/6709) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix kafka tests. [#6805](https://github.com/yandex/ClickHouse/pull/6805) ([Ivan](https://github.com/abyss7)) + +### Security Fix +* If the attacker has write access to ZooKeeper and is able to run custom server available from the network where ClickHouse run, it can create custom-built malicious server that will act as ClickHouse replica and register it in ZooKeeper. When another replica will fetch data part from malicious replica, it can force clickhouse-server to write to arbitrary path on filesystem. Found by Eldar Zaitov, information security team at Yandex. 
[#6247](https://github.com/yandex/ClickHouse/pull/6247) ([alexey-milovidov](https://github.com/alexey-milovidov)) ## ClickHouse release 19.13.3.26, 2019-08-22 @@ -308,6 +320,10 @@ * Fixed issue with parsing CSV [#6426](https://github.com/yandex/ClickHouse/issues/6426) [#6559](https://github.com/yandex/ClickHouse/pull/6559) ([tavplubix](https://github.com/tavplubix)) * Fixed data race in system.parts table and ALTER query. This fixes [#6245](https://github.com/yandex/ClickHouse/issues/6245). [#6513](https://github.com/yandex/ClickHouse/pull/6513) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Fixed wrong code in mutations that may lead to memory corruption. Fixed segfault with read of address `0x14c0` that may happed due to concurrent `DROP TABLE` and `SELECT` from `system.parts` or `system.parts_columns`. Fixed race condition in preparation of mutation queries. Fixed deadlock caused by `OPTIMIZE` of Replicated tables and concurrent modification operations like ALTERs. [#6514](https://github.com/yandex/ClickHouse/pull/6514) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed possible data loss after `ALTER DELETE` query on table with skipping index. [#6224](https://github.com/yandex/ClickHouse/issues/6224) [#6282](https://github.com/yandex/ClickHouse/pull/6282) ([Nikita Vasilev](https://github.com/nikvas0)) + +### Security Fix +* If the attacker has write access to ZooKeeper and is able to run custom server available from the network where ClickHouse run, it can create custom-built malicious server that will act as ClickHouse replica and register it in ZooKeeper. When another replica will fetch data part from malicious replica, it can force clickhouse-server to write to arbitrary path on filesystem. Found by Eldar Zaitov, information security team at Yandex. [#6247](https://github.com/yandex/ClickHouse/pull/6247) ([alexey-milovidov](https://github.com/alexey-milovidov)) ## ClickHouse release 19.13.2.19, 2019-08-14 From 42b739550b51bc1e1b4439994486edee14557f56 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 20 Sep 2019 23:35:50 +0300 Subject: [PATCH 144/309] Renamed "storage_policy_name" to "storage_policy" --- dbms/src/Storages/MergeTree/MergeTreeData.cpp | 2 +- .../Storages/MergeTree/MergeTreeSettings.h | 4 +-- .../integration/test_multiple_disks/test.py | 36 +++++++++---------- docs/ru/operations/table_engines/mergetree.md | 12 +++---- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.cpp b/dbms/src/Storages/MergeTree/MergeTreeData.cpp index 3538ad4c843..d68d96675af 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeData.cpp @@ -126,7 +126,7 @@ MergeTreeData::MergeTreeData( , log_name(database_name + "." 
+ table_name) , log(&Logger::get(log_name)) , storage_settings(std::move(storage_settings_)) - , storage_policy(context_.getStoragePolicy(getSettings()->storage_policy_name)) + , storage_policy(context_.getStoragePolicy(getSettings()->storage_policy)) , data_parts_by_info(data_parts_indexes.get()) , data_parts_by_state_and_info(data_parts_indexes.get()) , parts_mover(this) diff --git a/dbms/src/Storages/MergeTree/MergeTreeSettings.h b/dbms/src/Storages/MergeTree/MergeTreeSettings.h index 994d9675941..3652718451f 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeSettings.h +++ b/dbms/src/Storages/MergeTree/MergeTreeSettings.h @@ -88,7 +88,7 @@ struct MergeTreeSettings : public SettingsCollection M(SettingMaxThreads, max_part_loading_threads, 0, "The number of theads to load data parts at startup.") \ M(SettingMaxThreads, max_part_removal_threads, 0, "The number of theads for concurrent removal of inactive data parts. One is usually enough, but in 'Google Compute Environment SSD Persistent Disks' file removal (unlink) operation is extraordinarily slow and you probably have to increase this number (recommended is up to 16).") \ M(SettingUInt64, concurrent_part_removal_threshold, 100, "Activate concurrent part removal (see 'max_part_removal_threads') only if the number of inactive data parts is at least this.") \ - M(SettingString, storage_policy_name, "default", "Name of storage disk policy") + M(SettingString, storage_policy, "default", "Name of storage disk policy") DECLARE_SETTINGS_COLLECTION(LIST_OF_MERGE_TREE_SETTINGS) @@ -104,7 +104,7 @@ struct MergeTreeSettings : public SettingsCollection /// We check settings after storage creation static bool isReadonlySetting(const String & name) { - return name == "index_granularity" || name == "index_granularity_bytes" || name == "storage_policy_name"; + return name == "index_granularity" || name == "index_granularity_bytes" || name == "storage_policy"; } }; diff --git a/dbms/tests/integration/test_multiple_disks/test.py b/dbms/tests/integration/test_multiple_disks/test.py index afad78cdc8c..4ee337229c9 100644 --- a/dbms/tests/integration/test_multiple_disks/test.py +++ b/dbms/tests/integration/test_multiple_disks/test.py @@ -146,7 +146,7 @@ def test_query_parser(start_cluster): d UInt64 ) ENGINE = MergeTree() ORDER BY d - SETTINGS storage_policy_name='very_exciting_policy' + SETTINGS storage_policy='very_exciting_policy' """) with pytest.raises(QueryRuntimeException): @@ -155,7 +155,7 @@ def test_query_parser(start_cluster): d UInt64 ) ENGINE = MergeTree() ORDER BY d - SETTINGS storage_policy_name='jbod1' + SETTINGS storage_policy='jbod1' """) @@ -164,7 +164,7 @@ def test_query_parser(start_cluster): d UInt64 ) ENGINE = MergeTree() ORDER BY d - SETTINGS storage_policy_name='default' + SETTINGS storage_policy='default' """) node1.query("INSERT INTO table_with_normal_policy VALUES (5)") @@ -182,7 +182,7 @@ def test_query_parser(start_cluster): node1.query("ALTER TABLE table_with_normal_policy MOVE PARTITION 'yyyy' TO DISK 'jbod1'") with pytest.raises(QueryRuntimeException): - node1.query("ALTER TABLE table_with_normal_policy MODIFY SETTING storage_policy_name='moving_jbod_with_external'") + node1.query("ALTER TABLE table_with_normal_policy MODIFY SETTING storage_policy='moving_jbod_with_external'") finally: node1.query("DROP TABLE IF EXISTS table_with_normal_policy") @@ -204,7 +204,7 @@ def test_round_robin(start_cluster, name, engine): d UInt64 ) ENGINE = {engine} ORDER BY d - SETTINGS storage_policy_name='jbods_with_external' + SETTINGS 
storage_policy='jbods_with_external' """.format(name=name, engine=engine)) # first should go to the jbod1 @@ -239,7 +239,7 @@ def test_max_data_part_size(start_cluster, name, engine): s1 String ) ENGINE = {engine} ORDER BY tuple() - SETTINGS storage_policy_name='jbods_with_external' + SETTINGS storage_policy='jbods_with_external' """.format(name=name, engine=engine)) data = [] # 10MB in total for i in range(10): @@ -263,7 +263,7 @@ def test_jbod_overflow(start_cluster, name, engine): s1 String ) ENGINE = {engine} ORDER BY tuple() - SETTINGS storage_policy_name='small_jbod_with_external' + SETTINGS storage_policy='small_jbod_with_external' """.format(name=name, engine=engine)) node1.query("SYSTEM STOP MERGES") @@ -313,7 +313,7 @@ def test_background_move(start_cluster, name, engine): s1 String ) ENGINE = {engine} ORDER BY tuple() - SETTINGS storage_policy_name='moving_jbod_with_external' + SETTINGS storage_policy='moving_jbod_with_external' """.format(name=name, engine=engine)) for i in range(5): @@ -357,7 +357,7 @@ def test_start_stop_moves(start_cluster, name, engine): s1 String ) ENGINE = {engine} ORDER BY tuple() - SETTINGS storage_policy_name='moving_jbod_with_external' + SETTINGS storage_policy='moving_jbod_with_external' """.format(name=name, engine=engine)) node1.query("INSERT INTO {} VALUES ('HELLO')".format(name)) @@ -452,7 +452,7 @@ def test_alter_move(start_cluster, name, engine): ) ENGINE = {engine} ORDER BY tuple() PARTITION BY toYYYYMM(EventDate) - SETTINGS storage_policy_name='jbods_with_external' + SETTINGS storage_policy='jbods_with_external' """.format(name=name, engine=engine)) node1.query("SYSTEM STOP MERGES {}".format(name)) # to avoid conflicts @@ -540,7 +540,7 @@ def test_concurrent_alter_move(start_cluster, name, engine): ) ENGINE = {engine} ORDER BY tuple() PARTITION BY toYYYYMM(EventDate) - SETTINGS storage_policy_name='jbods_with_external' + SETTINGS storage_policy='jbods_with_external' """.format(name=name, engine=engine)) def insert(num): @@ -591,7 +591,7 @@ def test_concurrent_alter_move_and_drop(start_cluster, name, engine): ) ENGINE = {engine} ORDER BY tuple() PARTITION BY toYYYYMM(EventDate) - SETTINGS storage_policy_name='jbods_with_external' + SETTINGS storage_policy='jbods_with_external' """.format(name=name, engine=engine)) def insert(num): @@ -640,7 +640,7 @@ def test_mutate_to_another_disk(start_cluster, name, engine): s1 String ) ENGINE = {engine} ORDER BY tuple() - SETTINGS storage_policy_name='moving_jbod_with_external' + SETTINGS storage_policy='moving_jbod_with_external' """.format(name=name, engine=engine)) for i in range(5): @@ -687,7 +687,7 @@ def test_concurrent_alter_modify(start_cluster, name, engine): ) ENGINE = {engine} ORDER BY tuple() PARTITION BY toYYYYMM(EventDate) - SETTINGS storage_policy_name='jbods_with_external' + SETTINGS storage_policy='jbods_with_external' """.format(name=name, engine=engine)) def insert(num): @@ -733,7 +733,7 @@ def test_simple_replication_and_moves(start_cluster): s1 String ) ENGINE = ReplicatedMergeTree('/clickhouse/replicated_table_for_moves', '{}') ORDER BY tuple() - SETTINGS storage_policy_name='moving_jbod_with_external', old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=2 + SETTINGS storage_policy='moving_jbod_with_external', old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=2 """.format(i + 1)) def insert(num): @@ -796,7 +796,7 @@ def test_download_appropriate_disk(start_cluster): s1 String ) ENGINE = 
ReplicatedMergeTree('/clickhouse/replicated_table_for_download', '{}') ORDER BY tuple() - SETTINGS storage_policy_name='moving_jbod_with_external', old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=2 + SETTINGS storage_policy='moving_jbod_with_external', old_parts_lifetime=1, cleanup_delay_period=1, cleanup_delay_period_random_add=2 """.format(i + 1)) data = [] @@ -827,7 +827,7 @@ def test_rename(start_cluster): s String ) ENGINE = MergeTree ORDER BY tuple() - SETTINGS storage_policy_name='small_jbod_with_external' + SETTINGS storage_policy='small_jbod_with_external' """) for _ in range(5): @@ -867,7 +867,7 @@ def test_freeze(start_cluster): ) ENGINE = MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(d) - SETTINGS storage_policy_name='small_jbod_with_external' + SETTINGS storage_policy='small_jbod_with_external' """) for _ in range(5): diff --git a/docs/ru/operations/table_engines/mergetree.md b/docs/ru/operations/table_engines/mergetree.md index 543c5177bd3..c1fb3dc9edd 100644 --- a/docs/ru/operations/table_engines/mergetree.md +++ b/docs/ru/operations/table_engines/mergetree.md @@ -332,7 +332,7 @@ TTL date_time + INTERVAL 15 HOUR Создание таблицы с TTL ```sql -CREATE TABLE example_table +CREATE TABLE example_table ( d DateTime, a Int TTL d + INTERVAL 1 MONTH, @@ -367,7 +367,7 @@ ALTER TABLE example_table Примеры: ```sql -CREATE TABLE example_table +CREATE TABLE example_table ( d DateTime, a Int @@ -378,7 +378,7 @@ ORDER BY d TTL d + INTERVAL 1 MONTH; ``` -Изменение TTL +Изменение TTL ```sql ALTER TABLE example_table @@ -488,10 +488,10 @@ CREATE TABLE table_with_non_default_policy ( OrderID UInt64, BannerID UInt64, SearchPhrase String -) ENGINE = MergeTree() +) ENGINE = MergeTree ORDER BY (OrderID, BannerID) PARTITION BY toYYYYMM(EventDate) -SETTINGS storage_policy_name='moving_from_ssd_to_hdd' +SETTINGS storage_policy = 'moving_from_ssd_to_hdd' ``` По умолчанию используется политика хранения `default` в которой есть один том и один диск, указанный в ``. В данный момент менять политику хранения после создания таблицы нельзя. @@ -502,7 +502,7 @@ SETTINGS storage_policy_name='moving_from_ssd_to_hdd' * В результате вставки (запрос `INSERT`). * В фоновых операциях слияний и [мутаций](../../query_language/alter.md#alter-mutations). -* При скачивании данных с другой реплики. +* При скачивании данных с другой реплики. * В результате заморозки партиций [ALTER TABLE ... FREEZE PARTITION](../../query_language/alter.md#alter_freeze-partition). Во всех случаях, кроме мутаций и заморозки партиций, при записи куска выбирается том и диск в соответствии с указанной конфигурацией хранилища: From 1dc58dcb15507e4b5427310b06b221610b4bf828 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Fri, 20 Sep 2019 23:53:14 +0300 Subject: [PATCH 145/309] Update InterpreterCreateQuery.cpp --- dbms/src/Interpreters/InterpreterCreateQuery.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dbms/src/Interpreters/InterpreterCreateQuery.cpp b/dbms/src/Interpreters/InterpreterCreateQuery.cpp index f2189a59866..9d7e052c826 100644 --- a/dbms/src/Interpreters/InterpreterCreateQuery.cpp +++ b/dbms/src/Interpreters/InterpreterCreateQuery.cpp @@ -417,8 +417,12 @@ ColumnsDescription InterpreterCreateQuery::setProperties( else if (!create.as_table.empty()) { columns = as_storage->getColumns(); + + /// Secondary indices make sense only for MergeTree family of storage engines. + /// We should not copy them for other storages. 
if (create.storage && endsWith(create.storage->engine->name, "MergeTree")) indices = as_storage->getIndices(); + constraints = as_storage->getConstraints(); } else if (create.select) From b22b65dd299f77f882e8c98f7342b03671bd678f Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 21 Sep 2019 02:22:25 +0300 Subject: [PATCH 146/309] Avoid SIGSEGV on batch send failure (file with index XX is absent) In case of the following error: Failed to send batch: file with index 23742 is absent NULL dereference will occur for the "remote". --- dbms/src/Storages/Distributed/DirectoryMonitor.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbms/src/Storages/Distributed/DirectoryMonitor.cpp b/dbms/src/Storages/Distributed/DirectoryMonitor.cpp index df19d76bbb3..183897c7574 100644 --- a/dbms/src/Storages/Distributed/DirectoryMonitor.cpp +++ b/dbms/src/Storages/Distributed/DirectoryMonitor.cpp @@ -392,7 +392,8 @@ struct StorageDistributedDirectoryMonitor::Batch remote->writePrepared(in); } - remote->writeSuffix(); + if (remote) + remote->writeSuffix(); } catch (const Exception & e) { From 420089c301abe7db167a05eefd80560ea4ad0e4b Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 21 Sep 2019 02:22:40 +0300 Subject: [PATCH 147/309] Add new dictionary layout (sparse_hashed) that is more memory efficient With this new layout, sparsehash will be used over default HashMap, sparsehash is more memory efficient but it is also slower. So in a nutshell: - HashMap uses ~2x more memory then sparse_hash_map - HashMap ~2-2.5x faster then sparse_hash_map (tested on lots of input, and the most close to production was dictionary with 600KK hashes and UInt16 as value) TODO: - fix allocated memory calculation - getBufferSizeInBytes/getBufferSizeInCells interface - benchmarks v0: replace HashMap with google::sparse_hash_map v2: use google::sparse_hash_map only when isset to true v3: replace attributes with different layout v4: use ch hash over std::hash --- dbms/src/Dictionaries/CMakeLists.txt | 2 + dbms/src/Dictionaries/HashedDictionary.cpp | 124 ++++++++++++++---- dbms/src/Dictionaries/HashedDictionary.h | 34 ++++- dbms/src/Functions/CMakeLists.txt | 2 +- dbms/tests/config/ints_dictionary.xml | 63 +++++++++ .../0_stateless/00950_dict_get.reference | 3 + .../queries/0_stateless/00950_dict_get.sql | 28 ++++ .../dicts/external_dicts_dict_layout.md | 13 ++ 8 files changed, 242 insertions(+), 27 deletions(-) diff --git a/dbms/src/Dictionaries/CMakeLists.txt b/dbms/src/Dictionaries/CMakeLists.txt index 4d066d1f59b..d6f8fc57ff6 100644 --- a/dbms/src/Dictionaries/CMakeLists.txt +++ b/dbms/src/Dictionaries/CMakeLists.txt @@ -40,3 +40,5 @@ if(USE_POCO_MONGODB) endif() add_subdirectory(Embedded) + +target_include_directories(clickhouse_dictionaries SYSTEM PRIVATE ${SPARSEHASH_INCLUDE_DIR}) diff --git a/dbms/src/Dictionaries/HashedDictionary.cpp b/dbms/src/Dictionaries/HashedDictionary.cpp index 9b853ac2df5..4d195b5139a 100644 --- a/dbms/src/Dictionaries/HashedDictionary.cpp +++ b/dbms/src/Dictionaries/HashedDictionary.cpp @@ -3,6 +3,23 @@ #include "DictionaryBlockInputStream.h" #include "DictionaryFactory.h" +namespace +{ + +/// google::sparse_hash_map +template auto first(const T &lhs) -> decltype(lhs.first) +{ return lhs.first; } +template auto second(const T &lhs) -> decltype(lhs.second) +{ return lhs.second; } + +/// HashMap +template auto first(const T &lhs) -> decltype(lhs.getFirst()) +{ return lhs.getFirst(); } +template auto second(const T &lhs) -> decltype(lhs.getSecond()) +{ return lhs.getSecond(); 
} + +} + namespace DB { namespace ErrorCodes @@ -21,12 +38,14 @@ HashedDictionary::HashedDictionary( DictionarySourcePtr source_ptr_, const DictionaryLifetime dict_lifetime_, bool require_nonempty_, + bool sparse_, BlockPtr saved_block_) : name{name_} , dict_struct(dict_struct_) , source_ptr{std::move(source_ptr_)} , dict_lifetime(dict_lifetime_) , require_nonempty(require_nonempty_) + , sparse(sparse_) , saved_block{std::move(saved_block_)} { createAttributes(); @@ -57,11 +76,10 @@ static inline HashedDictionary::Key getAt(const HashedDictionary::Key & value, c return value; } -template -void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const +template +void HashedDictionary::isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const { const auto null_value = std::get(hierarchical_attribute->null_values); - const auto & attr = *std::get>(hierarchical_attribute->maps); const auto rows = out.size(); for (const auto row : ext::range(0, rows)) @@ -73,7 +91,7 @@ void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType { auto it = attr.find(id); if (it != std::end(attr)) - id = it->getSecond(); + id = second(*it); else break; } @@ -83,6 +101,13 @@ void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType query_count.fetch_add(rows, std::memory_order_relaxed); } +template +void HashedDictionary::isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const +{ + if (!sparse) + return isInAttrImpl(*std::get>(hierarchical_attribute->maps), child_ids, ancestor_ids, out); + return isInAttrImpl(*std::get>(hierarchical_attribute->sparse_maps), child_ids, ancestor_ids, out); +} void HashedDictionary::isInVectorVector( const PaddedPODArray & child_ids, const PaddedPODArray & ancestor_ids, PaddedPODArray & out) const @@ -407,9 +432,22 @@ void HashedDictionary::loadData() template void HashedDictionary::addAttributeSize(const Attribute & attribute) { - const auto & map_ref = std::get>(attribute.maps); - bytes_allocated += sizeof(CollectionType) + map_ref->getBufferSizeInBytes(); - bucket_count = map_ref->getBufferSizeInCells(); + if (!sparse) + { + const auto & map_ref = std::get>(attribute.maps); + bytes_allocated += sizeof(CollectionType) + map_ref->getBufferSizeInBytes(); + bucket_count = map_ref->getBufferSizeInCells(); + } + else + { + const auto & map_ref = std::get>(attribute.sparse_maps); + bucket_count = map_ref->bucket_count(); + + /** TODO: more accurate calculation */ + bytes_allocated += sizeof(CollectionType); + bytes_allocated += bucket_count; + bytes_allocated += map_ref->size() * sizeof(Key) * sizeof(T); + } } void HashedDictionary::calculateBytesAllocated() @@ -479,12 +517,15 @@ template void HashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) { attribute.null_values = T(null_value.get>()); - attribute.maps = std::make_unique>(); + if (!sparse) + attribute.maps = std::make_unique>(); + else + attribute.sparse_maps = std::make_unique>(); } HashedDictionary::Attribute HashedDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value) { - Attribute attr{type, {}, {}, {}}; + Attribute attr{type, {}, {}, {}, {}}; switch (type) { @@ -535,7 +576,10 @@ HashedDictionary::Attribute HashedDictionary::createAttributeWithType(const Attr case AttributeUnderlyingType::utString: { attr.null_values = null_value.get(); 
- attr.maps = std::make_unique>(); + if (!sparse) + attr.maps = std::make_unique>(); + else + attr.sparse_maps = std::make_unique>(); attr.string_arena = std::make_unique(); break; } @@ -545,28 +589,43 @@ HashedDictionary::Attribute HashedDictionary::createAttributeWithType(const Attr } -template -void HashedDictionary::getItemsImpl( - const Attribute & attribute, const PaddedPODArray & ids, ValueSetter && set_value, DefaultGetter && get_default) const +template +void HashedDictionary::getItemsAttrImpl( + const AttrType & attr, const PaddedPODArray & ids, ValueSetter && set_value, DefaultGetter && get_default) const { - const auto & attr = *std::get>(attribute.maps); const auto rows = ext::size(ids); for (const auto i : ext::range(0, rows)) { const auto it = attr.find(ids[i]); - set_value(i, it != attr.end() ? static_cast(it->getSecond()) : get_default(i)); + set_value(i, it != attr.end() ? static_cast(second(*it)) : get_default(i)); } query_count.fetch_add(rows, std::memory_order_relaxed); } +template +void HashedDictionary::getItemsImpl( + const Attribute & attribute, const PaddedPODArray & ids, ValueSetter && set_value, DefaultGetter && get_default) const +{ + if (!sparse) + return getItemsAttrImpl(*std::get>(attribute.maps), ids, set_value, get_default); + return getItemsAttrImpl(*std::get>(attribute.sparse_maps), ids, set_value, get_default); +} template bool HashedDictionary::setAttributeValueImpl(Attribute & attribute, const Key id, const T value) { - auto & map = *std::get>(attribute.maps); - return map.insert({id, value}).second; + if (!sparse) + { + auto & map = *std::get>(attribute.maps); + return map.insert({id, value}).second; + } + else + { + auto & map = *std::get>(attribute.sparse_maps); + return map.insert({id, value}).second; + } } bool HashedDictionary::setAttributeValue(Attribute & attribute, const Key id, const Field & value) @@ -605,10 +664,18 @@ bool HashedDictionary::setAttributeValue(Attribute & attribute, const Key id, co case AttributeUnderlyingType::utString: { - auto & map = *std::get>(attribute.maps); const auto & string = value.get(); const auto string_in_arena = attribute.string_arena->insert(string.data(), string.size()); - return map.insert({id, StringRef{string_in_arena, string.size()}}).second; + if (!sparse) + { + auto & map = *std::get>(attribute.maps); + return map.insert({id, StringRef{string_in_arena, string.size()}}).second; + } + else + { + auto & map = *std::get>(attribute.sparse_maps); + return map.insert({id, StringRef{string_in_arena, string.size()}}).second; + } } } @@ -636,18 +703,23 @@ void HashedDictionary::has(const Attribute & attribute, const PaddedPODArray -PaddedPODArray HashedDictionary::getIds(const Attribute & attribute) const +template +PaddedPODArray HashedDictionary::getIdsAttrImpl(const AttrType & attr) const { - const HashMap & attr = *std::get>(attribute.maps); - PaddedPODArray ids; ids.reserve(attr.size()); for (const auto & value : attr) - ids.push_back(value.getFirst()); + ids.push_back(first(value)); return ids; } +template +PaddedPODArray HashedDictionary::getIds(const Attribute & attribute) const +{ + if (!sparse) + return getIdsAttrImpl(*std::get>(attribute.maps)); + return getIdsAttrImpl(*std::get>(attribute.sparse_maps)); +} PaddedPODArray HashedDictionary::getIds() const { @@ -714,9 +786,11 @@ void registerDictionaryHashed(DictionaryFactory & factory) ErrorCodes::BAD_ARGUMENTS}; const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"}; const bool require_nonempty = config.getBool(config_prefix + 
".require_nonempty", false); - return std::make_unique(name, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty); + const bool sparse = name == "sparse_hashed"; + return std::make_unique(name, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty, sparse); }; factory.registerLayout("hashed", create_layout); + factory.registerLayout("sparse_hashed", create_layout); } } diff --git a/dbms/src/Dictionaries/HashedDictionary.h b/dbms/src/Dictionaries/HashedDictionary.h index d1aa5a38d97..5cd31ba3e80 100644 --- a/dbms/src/Dictionaries/HashedDictionary.h +++ b/dbms/src/Dictionaries/HashedDictionary.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "DictionaryStructure.h" #include "IDictionary.h" @@ -26,6 +27,7 @@ public: DictionarySourcePtr source_ptr_, const DictionaryLifetime dict_lifetime_, bool require_nonempty_, + bool sparse_, BlockPtr saved_block_ = nullptr); std::string getName() const override { return name; } @@ -46,7 +48,7 @@ public: std::shared_ptr clone() const override { - return std::make_shared(name, dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, saved_block); + return std::make_shared(name, dict_struct, source_ptr->clone(), dict_lifetime, require_nonempty, sparse, saved_block); } const IDictionarySource * getSource() const override { return source_ptr.get(); } @@ -149,6 +151,11 @@ private: template using CollectionPtrType = std::unique_ptr>; + template + using SparseCollectionType = google::sparse_hash_map>; + template + using SparseCollectionPtrType = std::unique_ptr>; + struct Attribute final { AttributeUnderlyingType type; @@ -186,6 +193,23 @@ private: CollectionPtrType, CollectionPtrType> maps; + std::variant< + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType, + SparseCollectionPtrType> + sparse_maps; std::unique_ptr string_arena; }; @@ -207,6 +231,9 @@ private: Attribute createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value); + template + void getItemsAttrImpl( + const AttrType & attr, const PaddedPODArray & ids, ValueSetter && set_value, DefaultGetter && get_default) const; template void getItemsImpl( const Attribute & attribute, const PaddedPODArray & ids, ValueSetter && set_value, DefaultGetter && get_default) const; @@ -221,11 +248,15 @@ private: template void has(const Attribute & attribute, const PaddedPODArray & ids, PaddedPODArray & out) const; + template + PaddedPODArray getIdsAttrImpl(const AttrType & attr) const; template PaddedPODArray getIds(const Attribute & attribute) const; PaddedPODArray getIds() const; + template + void isInAttrImpl(const AttrType & attr, const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; template void isInImpl(const ChildType & child_ids, const AncestorType & ancestor_ids, PaddedPODArray & out) const; @@ -234,6 +265,7 @@ private: const DictionarySourcePtr source_ptr; const DictionaryLifetime dict_lifetime; const bool require_nonempty; + const bool sparse; std::map attribute_index_by_name; std::vector attributes; diff --git a/dbms/src/Functions/CMakeLists.txt b/dbms/src/Functions/CMakeLists.txt index f495d6d8665..d3d8eb34d36 100644 --- a/dbms/src/Functions/CMakeLists.txt +++ 
b/dbms/src/Functions/CMakeLists.txt @@ -33,7 +33,7 @@ if (OPENSSL_CRYPTO_LIBRARY) endif() target_include_directories(clickhouse_functions PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/include) -target_include_directories(clickhouse_functions SYSTEM PRIVATE ${DIVIDE_INCLUDE_DIR} ${METROHASH_INCLUDE_DIR}) +target_include_directories(clickhouse_functions SYSTEM PRIVATE ${DIVIDE_INCLUDE_DIR} ${METROHASH_INCLUDE_DIR} ${SPARSEHASH_INCLUDE_DIR}) if (CONSISTENT_HASHING_INCLUDE_DIR) target_include_directories (clickhouse_functions PRIVATE ${CONSISTENT_HASHING_INCLUDE_DIR}) diff --git a/dbms/tests/config/ints_dictionary.xml b/dbms/tests/config/ints_dictionary.xml index c70f820ce36..a406c8553c0 100644 --- a/dbms/tests/config/ints_dictionary.xml +++ b/dbms/tests/config/ints_dictionary.xml @@ -125,6 +125,69 @@ + + hashed_sparse_ints + + + localhost + 9000 + default + + test_00950 + ints
+
+ + 0 + + + + + + key + + + i8 + Int8 + 0 + + + i16 + Int16 + 0 + + + i32 + Int32 + 0 + + + i64 + Int64 + 0 + + + u8 + UInt8 + 0 + + + u16 + UInt16 + 0 + + + u32 + UInt32 + 0 + + + u64 + UInt64 + 0 + + +
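A side note on the first()/second() helpers added at the top of HashedDictionary.cpp in this patch: they let the same template code walk both cell layouts, because ClickHouse's HashMap exposes getFirst()/getSecond() while google::sparse_hash_map yields std::pair-style elements with .first/.second. Below is a self-contained sketch of that overload trick; HashMapCell and the containers used here are simplified stand-ins, not the real ClickHouse types.

```cpp
// Standalone sketch of the SFINAE-based first()/second() accessors: the
// overload whose return-type expression does not compile for a given cell
// type is silently discarded, so one template serves both container kinds.
#include <iostream>
#include <map>
#include <string>

// Stand-in for a HashMap-style cell exposing getFirst()/getSecond().
struct HashMapCell
{
    int key;
    std::string value;
    int getFirst() const { return key; }
    const std::string & getSecond() const { return value; }
};

// Pair-style cells (e.g. std::map / sparse_hash_map elements).
template <typename T> auto first(const T & lhs) -> decltype(lhs.first) { return lhs.first; }
template <typename T> auto second(const T & lhs) -> decltype(lhs.second) { return lhs.second; }

// HashMap-style cells.
template <typename T> auto first(const T & lhs) -> decltype(lhs.getFirst()) { return lhs.getFirst(); }
template <typename T> auto second(const T & lhs) -> decltype(lhs.getSecond()) { return lhs.getSecond(); }

template <typename Container>
void dump(const Container & container)
{
    for (const auto & cell : container)
        std::cout << first(cell) << " -> " << second(cell) << '\n';
}

int main()
{
    std::map<int, std::string> pair_based{{1, "one"}, {2, "two"}};  // .first/.second cells
    HashMapCell cell_based[] = {{3, "three"}, {4, "four"}};         // getFirst()/getSecond() cells
    dump(pair_based);
    dump(cell_based);
}
```

The benefit is that no wrapper class is needed: the attribute-reading templates in the dictionary can stay oblivious to whether the sparse or the dense map was chosen at load time.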
+ cache_ints diff --git a/dbms/tests/queries/0_stateless/00950_dict_get.reference b/dbms/tests/queries/0_stateless/00950_dict_get.reference index fabfda8425f..3010a2989c0 100644 --- a/dbms/tests/queries/0_stateless/00950_dict_get.reference +++ b/dbms/tests/queries/0_stateless/00950_dict_get.reference @@ -4,6 +4,9 @@ dictGetOrDefault flat_ints 0 42 42 42 42 42 42 42 42 dictGet hashed_ints 1 1 1 1 1 1 1 1 1 dictGetOrDefault hashed_ints 1 1 1 1 1 1 1 1 1 dictGetOrDefault hashed_ints 0 42 42 42 42 42 42 42 42 +dictGet hashed_sparse_ints 1 1 1 1 1 1 1 1 1 +dictGetOrDefault hashed_sparse_ints 1 1 1 1 1 1 1 1 1 +dictGetOrDefault hashed_sparse_ints 0 42 42 42 42 42 42 42 42 dictGet cache_ints 1 1 1 1 1 1 1 1 1 dictGetOrDefault cache_ints 1 1 1 1 1 1 1 1 1 dictGetOrDefault cache_ints 0 42 42 42 42 42 42 42 42 diff --git a/dbms/tests/queries/0_stateless/00950_dict_get.sql b/dbms/tests/queries/0_stateless/00950_dict_get.sql index 20e5e376855..2483a21c0d3 100644 --- a/dbms/tests/queries/0_stateless/00950_dict_get.sql +++ b/dbms/tests/queries/0_stateless/00950_dict_get.sql @@ -69,6 +69,34 @@ select 'dictGetOrDefault', 'hashed_ints' as dict_name, toUInt64(0) as k, dictGetOrDefault(dict_name, 'u32', k, toUInt32(42)), dictGetOrDefault(dict_name, 'u64', k, toUInt64(42)); +select 'dictGet', 'hashed_sparse_ints' as dict_name, toUInt64(1) as k, + dictGet(dict_name, 'i8', k), + dictGet(dict_name, 'i16', k), + dictGet(dict_name, 'i32', k), + dictGet(dict_name, 'i64', k), + dictGet(dict_name, 'u8', k), + dictGet(dict_name, 'u16', k), + dictGet(dict_name, 'u32', k), + dictGet(dict_name, 'u64', k); +select 'dictGetOrDefault', 'hashed_sparse_ints' as dict_name, toUInt64(1) as k, + dictGetOrDefault(dict_name, 'i8', k, toInt8(42)), + dictGetOrDefault(dict_name, 'i16', k, toInt16(42)), + dictGetOrDefault(dict_name, 'i32', k, toInt32(42)), + dictGetOrDefault(dict_name, 'i64', k, toInt64(42)), + dictGetOrDefault(dict_name, 'u8', k, toUInt8(42)), + dictGetOrDefault(dict_name, 'u16', k, toUInt16(42)), + dictGetOrDefault(dict_name, 'u32', k, toUInt32(42)), + dictGetOrDefault(dict_name, 'u64', k, toUInt64(42)); +select 'dictGetOrDefault', 'hashed_sparse_ints' as dict_name, toUInt64(0) as k, + dictGetOrDefault(dict_name, 'i8', k, toInt8(42)), + dictGetOrDefault(dict_name, 'i16', k, toInt16(42)), + dictGetOrDefault(dict_name, 'i32', k, toInt32(42)), + dictGetOrDefault(dict_name, 'i64', k, toInt64(42)), + dictGetOrDefault(dict_name, 'u8', k, toUInt8(42)), + dictGetOrDefault(dict_name, 'u16', k, toUInt16(42)), + dictGetOrDefault(dict_name, 'u32', k, toUInt32(42)), + dictGetOrDefault(dict_name, 'u64', k, toUInt64(42)); + select 'dictGet', 'cache_ints' as dict_name, toUInt64(1) as k, dictGet(dict_name, 'i8', k), dictGet(dict_name, 'i16', k), diff --git a/docs/en/query_language/dicts/external_dicts_dict_layout.md b/docs/en/query_language/dicts/external_dicts_dict_layout.md index c3096544d25..a9a80dbe761 100644 --- a/docs/en/query_language/dicts/external_dicts_dict_layout.md +++ b/docs/en/query_language/dicts/external_dicts_dict_layout.md @@ -39,6 +39,7 @@ The configuration looks like this: - [flat](#flat) - [hashed](#dicts-external_dicts_dict_layout-hashed) +- [sparse_hashed](#dicts-external_dicts_dict_layout-sparse_hashed) - [cache](#cache) - [range_hashed](#range-hashed) - [complex_key_hashed](#complex-key-hashed) @@ -77,6 +78,18 @@ Configuration example: ``` +### sparse_hashed {#dicts-external_dicts_dict_layout-sparse_hashed} + +Similar to `hashed`, but uses less memory in favor more CPU usage. 
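As a rough illustration of that trade-off, here is a standalone sketch that fills both container kinds with the same keys. It assumes the google sparsehash headers are installed, and std::unordered_map merely stands in for ClickHouse's internal HashMap; peak memory and timing have to be measured externally, for example with /usr/bin/time -v.

```cpp
// Sketch only: sparse_hash_map keeps a compact sparse table, so it typically
// needs noticeably less memory than an open-addressing map, at the cost of
// slower inserts and lookups (the commit message above reports ~2x memory
// savings and ~2-2.5x slower access).
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <sparsehash/sparse_hash_map>

int main()
{
    std::unordered_map<uint64_t, uint16_t> dense;         // "hashed"-like layout
    google::sparse_hash_map<uint64_t, uint16_t> sparse;   // "sparse_hashed"-like layout

    for (uint64_t key = 0; key < 1000000; ++key)
    {
        dense[key] = static_cast<uint16_t>(key % 65536);
        sparse[key] = static_cast<uint16_t>(key % 65536);
    }

    std::cout << "dense: " << dense.size() << " keys, sparse: " << sparse.size() << " keys\n";
}
```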
+ +Configuration example: + +```xml + + + +``` + ### complex_key_hashed From 59763cbb3af1bb28db49d27c8bf011570cb66489 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 18 Sep 2019 02:16:49 +0300 Subject: [PATCH 148/309] Add ability to send events with absolute values to graphite sends only deltas for events, while sometimes it is more convenient to get absolute values (for example to calculate rate). This patch introduces another directive -- that will handle this. v0: use as bool/string v2: use true over absolute --- dbms/programs/server/MetricsTransmitter.cpp | 11 +++++++++++ dbms/programs/server/MetricsTransmitter.h | 5 ++++- dbms/programs/server/config.xml | 2 ++ docs/en/operations/server_settings/settings.md | 4 +++- docs/ru/operations/server_settings/settings.md | 4 +++- 5 files changed, 23 insertions(+), 3 deletions(-) diff --git a/dbms/programs/server/MetricsTransmitter.cpp b/dbms/programs/server/MetricsTransmitter.cpp index 8419d3e1b8c..73413cad1c0 100644 --- a/dbms/programs/server/MetricsTransmitter.cpp +++ b/dbms/programs/server/MetricsTransmitter.cpp @@ -21,6 +21,7 @@ MetricsTransmitter::MetricsTransmitter( { interval_seconds = config.getInt(config_name + ".interval", 60); send_events = config.getBool(config_name + ".events", true); + send_events_absolute = config.getBool(config_name + ".events_absolute", false); send_metrics = config.getBool(config_name + ".metrics", true); send_asynchronous_metrics = config.getBool(config_name + ".asynchronous_metrics", true); } @@ -95,6 +96,16 @@ void MetricsTransmitter::transmit(std::vector & prev_count } } + if (send_events_absolute) + { + for (size_t i = 0, end = ProfileEvents::end(); i < end; ++i) + { + const auto counter = ProfileEvents::global_counters[i].load(std::memory_order_relaxed); + std::string key{ProfileEvents::getName(static_cast(i))}; + key_vals.emplace_back(profile_events_absolute_path_prefix + key, counter); + } + } + if (send_metrics) { for (size_t i = 0, end = CurrentMetrics::end(); i < end; ++i) diff --git a/dbms/programs/server/MetricsTransmitter.h b/dbms/programs/server/MetricsTransmitter.h index b9c7fd7f179..1d5795c24db 100644 --- a/dbms/programs/server/MetricsTransmitter.h +++ b/dbms/programs/server/MetricsTransmitter.h @@ -24,7 +24,8 @@ class AsynchronousMetrics; /** Automatically sends - * - difference of ProfileEvents; + * - values deltas of ProfileEvents; + * - absolute values of ProfileEvents; * - values of CurrentMetrics; * - values of AsynchronousMetrics; * to Graphite at beginning of every minute. 
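The new events_absolute flag complements the existing delta reporting: deltas can be graphed directly, while absolute counters let the monitoring backend derive rates itself. The sketch below shows the two reporting modes side by side; the function signature and the "eventN" names are illustrative, not the real MetricsTransmitter interface, though the two key prefixes match the ones added in this patch.

```cpp
// Simplified sketch of delta vs. absolute reporting for monotonically
// increasing event counters.
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct KeyValue { std::string key; uint64_t value; };

void collectEventMetrics(const std::vector<uint64_t> & counters,  // current global counters
                         std::vector<uint64_t> & prev_counters,   // state kept between transmissions
                         bool send_events, bool send_events_absolute,
                         std::vector<KeyValue> & out)
{
    for (size_t i = 0; i < counters.size(); ++i)
    {
        const std::string name = "event" + std::to_string(i);     // real code uses ProfileEvents names
        if (send_events)                                           // delta since the last transmission
        {
            out.push_back({"ClickHouse.ProfileEvents." + name, counters[i] - prev_counters[i]});
            prev_counters[i] = counters[i];
        }
        if (send_events_absolute)                                  // raw counter, backend computes the rate
            out.push_back({"ClickHouse.ProfileEventsAbsolute." + name, counters[i]});
    }
}

int main()
{
    std::vector<uint64_t> counters{100, 250};
    std::vector<uint64_t> prev(counters.size(), 0);
    std::vector<KeyValue> out;

    collectEventMetrics(counters, prev, /*send_events=*/true, /*send_events_absolute=*/true, out);
    for (const auto & kv : out)
        std::cout << kv.key << " = " << kv.value << '\n';
}
```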
@@ -44,6 +45,7 @@ private: std::string config_name; UInt32 interval_seconds; bool send_events; + bool send_events_absolute; bool send_metrics; bool send_asynchronous_metrics; @@ -53,6 +55,7 @@ private: ThreadFromGlobalPool thread{&MetricsTransmitter::run, this}; static inline constexpr auto profile_events_path_prefix = "ClickHouse.ProfileEvents."; + static inline constexpr auto profile_events_absolute_path_prefix = "ClickHouse.ProfileEventsAbsolute."; static inline constexpr auto current_metrics_path_prefix = "ClickHouse.Metrics."; static inline constexpr auto asynchronous_metrics_path_prefix = "ClickHouse.AsynchronousMetrics."; }; diff --git a/dbms/programs/server/config.xml b/dbms/programs/server/config.xml index 34fe98b0e31..7263992b3d3 100644 --- a/dbms/programs/server/config.xml +++ b/dbms/programs/server/config.xml @@ -258,6 +258,7 @@ true true + false true @@ -269,6 +270,7 @@ true true + false false --> diff --git a/docs/en/operations/server_settings/settings.md b/docs/en/operations/server_settings/settings.md index 49c8bf6fbc1..baf9013fbbf 100644 --- a/docs/en/operations/server_settings/settings.md +++ b/docs/en/operations/server_settings/settings.md @@ -141,7 +141,8 @@ Settings: - timeout – The timeout for sending data, in seconds. - root_path – Prefix for keys. - metrics – Sending data from a :ref:`system_tables-system.metrics` table. -- events – Sending data from a :ref:`system_tables-system.events` table. +- events – Sending deltas data from a :ref:`system_tables-system.events` table +- events_absolute – Sending absolute data from a :ref:`system_tables-system.events` table - asynchronous_metrics – Sending data from a :ref:`system_tables-system.asynchronous_metrics` table. You can configure multiple `` clauses. For instance, you can use this for sending different data at different intervals. @@ -157,6 +158,7 @@ You can configure multiple `` clauses. For instance, you can use this one_min true true + false true ``` diff --git a/docs/ru/operations/server_settings/settings.md b/docs/ru/operations/server_settings/settings.md index 1aacd525d24..2f6362b7635 100644 --- a/docs/ru/operations/server_settings/settings.md +++ b/docs/ru/operations/server_settings/settings.md @@ -140,7 +140,8 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat - timeout - Таймаут отправки данных в секундах. - root_path - Префикс для ключей. - metrics - Отправка данных из таблицы :ref:`system_tables-system.metrics`. -- events - Отправка данных из таблицы :ref:`system_tables-system.events`. +- events - Отправка дельты данных из таблицы :ref:`system_tables-system.events` +- events_absolute - Отправка абсолютных данных из таблицы :ref:`system_tables-system.events` - asynchronous_metrics - Отправка данных из таблицы :ref:`system_tables-system.asynchronous_metrics`. Можно определить несколько секций ``, например, для передачи различных данных с различной частотой. 
@@ -156,6 +157,7 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat one_min true true + false true ``` From ab7ecd84a1738170657c1fcebd6379d60ede7a5a Mon Sep 17 00:00:00 2001 From: Zhichang Yu Date: Sat, 21 Sep 2019 23:34:44 +0800 Subject: [PATCH 149/309] fix bitmapMin and bitmapMax doc --- dbms/src/Functions/FunctionsBitmap.h | 2 +- docs/en/query_language/functions/bitmap_functions.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dbms/src/Functions/FunctionsBitmap.h b/dbms/src/Functions/FunctionsBitmap.h index ad4f16b16ef..06d45cdd54f 100644 --- a/dbms/src/Functions/FunctionsBitmap.h +++ b/dbms/src/Functions/FunctionsBitmap.h @@ -49,7 +49,7 @@ namespace ErrorCodes * Retrun bitmap cardinality: * bitmapCardinality: bitmap -> integer * - * Retrun smallest value in the set: + * Retrun the smallest value in the set: * bitmapMin: bitmap -> integer * * Retrun the greatest value in the set: diff --git a/docs/en/query_language/functions/bitmap_functions.md b/docs/en/query_language/functions/bitmap_functions.md index fdc2e8a7a0d..29cf67d70f0 100644 --- a/docs/en/query_language/functions/bitmap_functions.md +++ b/docs/en/query_language/functions/bitmap_functions.md @@ -294,7 +294,7 @@ SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res ## bitmapMin -Retrun smallest value of type UInt64 in the set, UINT32_MAX if the set is empty. +Retrun the smallest value of type UInt64 in the set, UINT32_MAX if the set is empty. ``` @@ -319,7 +319,7 @@ SELECT bitmapMin(bitmapBuild([1, 2, 3, 4, 5])) AS res ## bitmapMax -Retrun smallest value of type UInt64 in the set, 0 if the set is empty. +Retrun the greatest value of type UInt64 in the set, 0 if the set is empty. ``` From 1fbd2fb6a81a61319b35725df56e74881f65c08e Mon Sep 17 00:00:00 2001 From: Silviu Caragea Date: Sat, 21 Sep 2019 21:30:01 +0300 Subject: [PATCH 150/309] Fix osx build --- dbms/src/Common/PoolWithFailoverBase.h | 2 +- dbms/src/Common/QueryProfiler.cpp | 6 ++++++ dbms/src/Common/StackTrace.cpp | 1 + dbms/src/Common/TraceCollector.cpp | 2 +- dbms/src/Common/checkStackSize.cpp | 18 ++++++++++++++++++ dbms/src/Common/new_delete.cpp | 4 ++++ dbms/src/Interpreters/MetricLog.cpp | 2 +- libs/libcommon/src/sleep.cpp | 18 ++++++++++++++++++ 8 files changed, 50 insertions(+), 3 deletions(-) diff --git a/dbms/src/Common/PoolWithFailoverBase.h b/dbms/src/Common/PoolWithFailoverBase.h index 989831ce2b0..6f1c88b53da 100644 --- a/dbms/src/Common/PoolWithFailoverBase.h +++ b/dbms/src/Common/PoolWithFailoverBase.h @@ -199,7 +199,7 @@ PoolWithFailoverBase::getMany( for (const ShuffledPool & pool: shuffled_pools) { auto & pool_state = shared_pool_states[pool.index]; - pool_state.error_count = std::min(max_error_cap, pool_state.error_count + pool.error_count); + pool_state.error_count = std::min(max_error_cap, static_cast(pool_state.error_count + pool.error_count)); } }); diff --git a/dbms/src/Common/QueryProfiler.cpp b/dbms/src/Common/QueryProfiler.cpp index a0b75c567a9..d4e0af90ceb 100644 --- a/dbms/src/Common/QueryProfiler.cpp +++ b/dbms/src/Common/QueryProfiler.cpp @@ -30,10 +30,13 @@ namespace /// Thus upper bound on query_id length should be introduced to avoid buffer overflow in signal handler. constexpr size_t QUERY_ID_MAX_LEN = 1024; +# if !defined(__APPLE__) thread_local size_t write_trace_iteration = 0; +#endif void writeTraceInfo(TimerType timer_type, int /* sig */, siginfo_t * info, void * context) { +# if !defined(__APPLE__) /// Quickly drop if signal handler is called too frequently. 
/// Otherwise we may end up infinitelly processing signals instead of doing any useful work. ++write_trace_iteration; @@ -50,6 +53,9 @@ namespace return; } } +#else + UNUSED(info); +#endif constexpr size_t buf_size = sizeof(char) + // TraceCollector stop flag 8 * sizeof(char) + // maximum VarUInt length for string size diff --git a/dbms/src/Common/StackTrace.cpp b/dbms/src/Common/StackTrace.cpp index 9981d0941aa..9694e33a2dd 100644 --- a/dbms/src/Common/StackTrace.cpp +++ b/dbms/src/Common/StackTrace.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/dbms/src/Common/TraceCollector.cpp b/dbms/src/Common/TraceCollector.cpp index 9451c3f88e5..6a160326a18 100644 --- a/dbms/src/Common/TraceCollector.cpp +++ b/dbms/src/Common/TraceCollector.cpp @@ -46,7 +46,7 @@ TraceCollector::TraceCollector(std::shared_ptr & trace_log_) if (-1 == fcntl(trace_pipe.fds_rw[1], F_SETFL, flags | O_NONBLOCK)) throwFromErrno("Cannot set non-blocking mode of pipe", ErrorCodes::CANNOT_FCNTL); -#if !defined(__FreeBSD__) +#if !defined(__FreeBSD__) && !defined(__APPLE__) /** Increase pipe size to avoid slowdown during fine-grained trace collection. */ int pipe_size = fcntl(trace_pipe.fds_rw[1], F_GETPIPE_SZ); diff --git a/dbms/src/Common/checkStackSize.cpp b/dbms/src/Common/checkStackSize.cpp index e7f91bc3330..7459277b563 100644 --- a/dbms/src/Common/checkStackSize.cpp +++ b/dbms/src/Common/checkStackSize.cpp @@ -27,14 +27,32 @@ void checkStackSize() if (!stack_address) { +#if defined(__APPLE__) + // pthread_get_stacksize_np() returns a value too low for the main thread on + // OSX 10.9, http://mail.openjdk.java.net/pipermail/hotspot-dev/2013-October/011369.html + // + // Multiple workarounds possible, adopt the one made by https://github.com/robovm/robovm/issues/274 + // https://developer.apple.com/library/mac/documentation/Cocoa/Conceptual/Multithreading/CreatingThreads/CreatingThreads.html + // Stack size for the main thread is 8MB on OSX excluding the guard page size. + pthread_t thread = pthread_self(); + max_stack_size = pthread_main_np() ? 
(8 * 1024 * 1024) : pthread_get_stacksize_np(thread); + stack_address = pthread_get_stackaddr_np(thread); +#else pthread_attr_t attr; +#if defined(__FreeBSD__) + pthread_attr_init(&attr); + if (0 != pthread_attr_get_np(pthread_self(), &attr)) + throwFromErrno("Cannot pthread_attr_get_np", ErrorCodes::CANNOT_PTHREAD_ATTR); +#else if (0 != pthread_getattr_np(pthread_self(), &attr)) throwFromErrno("Cannot pthread_getattr_np", ErrorCodes::CANNOT_PTHREAD_ATTR); +#endif SCOPE_EXIT({ pthread_attr_destroy(&attr); }); if (0 != pthread_attr_getstack(&attr, &stack_address, &max_stack_size)) throwFromErrno("Cannot pthread_getattr_np", ErrorCodes::CANNOT_PTHREAD_ATTR); +#endif } const void * frame_address = __builtin_frame_address(0); diff --git a/dbms/src/Common/new_delete.cpp b/dbms/src/Common/new_delete.cpp index f2a85163035..5e32b910b19 100644 --- a/dbms/src/Common/new_delete.cpp +++ b/dbms/src/Common/new_delete.cpp @@ -1,4 +1,8 @@ +#if defined(__MACH__) +#include +#else #include +#endif #include #include diff --git a/dbms/src/Interpreters/MetricLog.cpp b/dbms/src/Interpreters/MetricLog.cpp index 59a500010dc..5622e0c65b0 100644 --- a/dbms/src/Interpreters/MetricLog.cpp +++ b/dbms/src/Interpreters/MetricLog.cpp @@ -103,7 +103,7 @@ void MetricLog::metricThreadFunction() for (size_t i = 0, end = ProfileEvents::end(); i < end; ++i) { const ProfileEvents::Count new_value = ProfileEvents::global_counters[i].load(std::memory_order_relaxed); - UInt64 & old_value = prev_profile_events[i]; + auto & old_value = prev_profile_events[i]; elem.profile_events[i] = new_value - old_value; old_value = new_value; } diff --git a/libs/libcommon/src/sleep.cpp b/libs/libcommon/src/sleep.cpp index 710b387d62e..a11140d0cb5 100644 --- a/libs/libcommon/src/sleep.cpp +++ b/libs/libcommon/src/sleep.cpp @@ -3,6 +3,11 @@ #include #include +#if defined(__APPLE__) +#include +#include +#endif + /** * Sleep with nanoseconds precision. 
Tolerant to signal interruptions * @@ -14,6 +19,18 @@ */ void sleepForNanoseconds(uint64_t nanoseconds) { +# if defined(__APPLE__) + //https://developer.apple.com/library/archive/technotes/tn2169/_index.html + //https://dshil.github.io/blog/missed-os-x-clock-guide/ + static mach_timebase_info_data_t timebase_info = {0}; + if(timebase_info.denom == 0) + mach_timebase_info(&timebase_info); + + uint64_t time_to_wait = nanoseconds * timebase_info.denom / timebase_info.numer; + uint64_t now = mach_absolute_time(); + + while(mach_wait_until(now + time_to_wait) != KERN_SUCCESS); +#else constexpr auto clock_type = CLOCK_MONOTONIC; struct timespec current_time; @@ -29,6 +46,7 @@ void sleepForNanoseconds(uint64_t nanoseconds) finish_time.tv_sec += (nanoseconds / resolution) + extra_second; while (clock_nanosleep(clock_type, TIMER_ABSTIME, &finish_time, nullptr) == EINTR); +#endif } void sleepForMicroseconds(uint64_t microseconds) From ab02aad198a1d4cbf163d6fe5d98e83e914b288a Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sat, 21 Sep 2019 23:44:56 +0300 Subject: [PATCH 151/309] Update sleep.cpp --- libs/libcommon/src/sleep.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/libcommon/src/sleep.cpp b/libs/libcommon/src/sleep.cpp index a11140d0cb5..cfe57151bb1 100644 --- a/libs/libcommon/src/sleep.cpp +++ b/libs/libcommon/src/sleep.cpp @@ -23,13 +23,13 @@ void sleepForNanoseconds(uint64_t nanoseconds) //https://developer.apple.com/library/archive/technotes/tn2169/_index.html //https://dshil.github.io/blog/missed-os-x-clock-guide/ static mach_timebase_info_data_t timebase_info = {0}; - if(timebase_info.denom == 0) + if (timebase_info.denom == 0) mach_timebase_info(&timebase_info); uint64_t time_to_wait = nanoseconds * timebase_info.denom / timebase_info.numer; uint64_t now = mach_absolute_time(); - while(mach_wait_until(now + time_to_wait) != KERN_SUCCESS); + while (mach_wait_until(now + time_to_wait) != KERN_SUCCESS); #else constexpr auto clock_type = CLOCK_MONOTONIC; From 5a14d97587aa8d16bca3e7aa25c16d8afac75f91 Mon Sep 17 00:00:00 2001 From: Silviu Caragea Date: Sun, 22 Sep 2019 01:04:17 +0300 Subject: [PATCH 152/309] Fix linking when __ELF__ is not defined --- dbms/src/Functions/registerFunctionsIntrospection.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dbms/src/Functions/registerFunctionsIntrospection.cpp b/dbms/src/Functions/registerFunctionsIntrospection.cpp index 700a568d822..706175c777f 100644 --- a/dbms/src/Functions/registerFunctionsIntrospection.cpp +++ b/dbms/src/Functions/registerFunctionsIntrospection.cpp @@ -3,16 +3,20 @@ namespace DB class FunctionFactory; +#ifdef __ELF__ void registerFunctionAddressToSymbol(FunctionFactory & factory); -void registerFunctionDemangle(FunctionFactory & factory); void registerFunctionAddressToLine(FunctionFactory & factory); +#endif +void registerFunctionDemangle(FunctionFactory & factory); void registerFunctionTrap(FunctionFactory & factory); void registerFunctionsIntrospection(FunctionFactory & factory) { +#ifdef __ELF__ registerFunctionAddressToSymbol(factory); - registerFunctionDemangle(factory); registerFunctionAddressToLine(factory); +#endif + registerFunctionDemangle(factory); registerFunctionTrap(factory); } From d446205eece30eaf840729d001bfcd05aa775fd4 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sun, 22 Sep 2019 04:53:19 +0300 Subject: [PATCH 153/309] Update QueryProfiler.cpp --- dbms/src/Common/QueryProfiler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/dbms/src/Common/QueryProfiler.cpp b/dbms/src/Common/QueryProfiler.cpp index d4e0af90ceb..3060a282cd3 100644 --- a/dbms/src/Common/QueryProfiler.cpp +++ b/dbms/src/Common/QueryProfiler.cpp @@ -54,7 +54,7 @@ namespace } } #else - UNUSED(info); + UNUSED(info); #endif constexpr size_t buf_size = sizeof(char) + // TraceCollector stop flag From e42e26a453201739130b86605c89a7025fbacacf Mon Sep 17 00:00:00 2001 From: Yuriy Date: Mon, 16 Sep 2019 05:33:03 +0300 Subject: [PATCH 154/309] updated contrib/mariadb-connector-c --- contrib/mariadb-connector-c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/mariadb-connector-c b/contrib/mariadb-connector-c index 9bbf08c2a0f..4d473f89bb8 160000 --- a/contrib/mariadb-connector-c +++ b/contrib/mariadb-connector-c @@ -1 +1 @@ -Subproject commit 9bbf08c2a0fb7b34671291fce13e6af62c5343a2 +Subproject commit 4d473f89bb86ae485a116f6271201b214d0ac4cc From 6020ba187e78d6a66de8432f9fe549571e5d8a9b Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sun, 22 Sep 2019 04:58:21 +0300 Subject: [PATCH 155/309] Update AggregateFunctionGroupBitmapData.h --- .../AggregateFunctionGroupBitmapData.h | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h index e1fbd092490..9a74af530d3 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupBitmapData.h @@ -63,7 +63,12 @@ public: roaring_bitmap_add(rb, value); } - UInt64 size() const { return isSmall() ? small.size() : roaring_bitmap_get_cardinality(rb); } + UInt64 size() const + { + return isSmall() + ? small.size() + : roaring_bitmap_get_cardinality(rb); + } void merge(const RoaringBitmapWithSmallSet & r1) { @@ -91,7 +96,7 @@ public: std::string s; readStringBinary(s,in); rb = roaring_bitmap_portable_deserialize(s.c_str()); - for (const auto & x : small) //merge from small + for (const auto & x : small) // merge from small roaring_bitmap_add(rb, x.getValue()); } else @@ -245,13 +250,13 @@ public: { for (const auto & x : small) if (r1.small.find(x.getValue()) != r1.small.end()) - retSize++; + ++retSize; } else if (isSmall() && r1.isLarge()) { for (const auto & x : small) if (roaring_bitmap_contains(r1.rb, x.getValue())) - retSize++; + ++retSize; } else { @@ -391,8 +396,7 @@ public: */ UInt8 rb_contains(const UInt32 x) const { - return isSmall() ? small.find(x) != small.end() : - roaring_bitmap_contains(rb, x); + return isSmall() ? small.find(x) != small.end() : roaring_bitmap_contains(rb, x); } /** @@ -460,7 +464,7 @@ public: /** * Return new set with specified range (not include the range_end) */ - UInt64 rb_range(UInt32 range_start, UInt32 range_end, RoaringBitmapWithSmallSet& r1) const + UInt64 rb_range(UInt32 range_start, UInt32 range_end, RoaringBitmapWithSmallSet & r1) const { UInt64 count = 0; if (range_start >= range_end) @@ -473,7 +477,7 @@ public: if (UInt32(val) >= range_start && UInt32(val) < range_end) { r1.add(val); - count++; + ++count; } } } @@ -486,7 +490,7 @@ public: { r1.add(iterator.current_value); roaring_advance_uint32_iterator(&iterator); - count++; + ++count; } } return count; @@ -495,7 +499,7 @@ public: /** * Return new set of the smallest `limit` values in set which is no less than `range_start`. 
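 * (Values are returned in ascending order; at most `limit` of them are added to `r1`, and the
 * return value is how many were added.)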
*/ - UInt64 rb_limit(UInt32 range_start, UInt32 limit, RoaringBitmapWithSmallSet& r1) const + UInt64 rb_limit(UInt32 range_start, UInt32 limit, RoaringBitmapWithSmallSet & r1) const { UInt64 count = 0; if (isSmall()) @@ -512,7 +516,7 @@ public: sort(ans.begin(), ans.end()); if (limit > ans.size()) limit = ans.size(); - for (size_t i=0; i> 16) == 0) From fa8a35934ecab58cf5344f937b08a00c1d540aa6 Mon Sep 17 00:00:00 2001 From: Yuriy Date: Sun, 22 Sep 2019 05:03:40 +0300 Subject: [PATCH 156/309] empty From 9afea6f0b388652f44c2572738a0c1f332358088 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sun, 22 Sep 2019 05:04:49 +0300 Subject: [PATCH 157/309] Update HashedDictionary.cpp --- dbms/src/Dictionaries/HashedDictionary.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/dbms/src/Dictionaries/HashedDictionary.cpp b/dbms/src/Dictionaries/HashedDictionary.cpp index 4d195b5139a..9aa111946ee 100644 --- a/dbms/src/Dictionaries/HashedDictionary.cpp +++ b/dbms/src/Dictionaries/HashedDictionary.cpp @@ -6,17 +6,15 @@ namespace { +/// NOTE: Trailing return type is explicitly specified for SFINAE. + /// google::sparse_hash_map -template auto first(const T &lhs) -> decltype(lhs.first) -{ return lhs.first; } -template auto second(const T &lhs) -> decltype(lhs.second) -{ return lhs.second; } +template auto first(const T & value) -> decltype(value.first) { return lhs.first; } +template auto second(const T & value) -> decltype(value.second) { return lhs.second; } /// HashMap -template auto first(const T &lhs) -> decltype(lhs.getFirst()) -{ return lhs.getFirst(); } -template auto second(const T &lhs) -> decltype(lhs.getSecond()) -{ return lhs.getSecond(); } +template auto first(const T & value) -> decltype(value.getFirst()) { return lhs.getFirst(); } +template auto second(const T & value) -> decltype(value.getSecond()) { return lhs.getSecond(); } } From ab9a7be45ad647e7be832a61f31f03ae90ff9a40 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sun, 22 Sep 2019 05:09:40 +0300 Subject: [PATCH 158/309] Comments are the must. --- dbms/src/Dictionaries/HashedDictionary.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dbms/src/Dictionaries/HashedDictionary.h b/dbms/src/Dictionaries/HashedDictionary.h index 5cd31ba3e80..fac7bddbb6e 100644 --- a/dbms/src/Dictionaries/HashedDictionary.h +++ b/dbms/src/Dictionaries/HashedDictionary.h @@ -13,6 +13,10 @@ #include "IDictionary.h" #include "IDictionarySource.h" +/** This dictionary stores all content in a hash table in memory + * (a separate Key -> Value map for each attribute) + * Two variants of hash table is supported: a fast HashMap and memory efficient sparse_hash_map. + */ namespace DB { From a12cffee91e3c8a423846769be519fce210a29ef Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sun, 22 Sep 2019 05:11:08 +0300 Subject: [PATCH 159/309] Update HashedDictionary.h --- dbms/src/Dictionaries/HashedDictionary.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Dictionaries/HashedDictionary.h b/dbms/src/Dictionaries/HashedDictionary.h index fac7bddbb6e..4f9cded40fc 100644 --- a/dbms/src/Dictionaries/HashedDictionary.h +++ b/dbms/src/Dictionaries/HashedDictionary.h @@ -15,7 +15,7 @@ /** This dictionary stores all content in a hash table in memory * (a separate Key -> Value map for each attribute) - * Two variants of hash table is supported: a fast HashMap and memory efficient sparse_hash_map. + * Two variants of hash table are supported: a fast HashMap and memory efficient sparse_hash_map. 
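+ * (Here sparse_hash_map means google::sparse_hash_map, which typically trades some lookup speed
+ * for a much smaller per-entry memory footprint.)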
*/ namespace DB From 4465fdb8bfbda9ce0cc7e70e087c5ea341c4177a Mon Sep 17 00:00:00 2001 From: fenglv Date: Fri, 20 Sep 2019 00:50:21 +0800 Subject: [PATCH 160/309] Add test fix fix fix --- dbms/src/Functions/repeat.cpp | 419 ++++++++---------- .../01013_repeat_function.reference | 37 ++ .../0_stateless/01013_repeat_function.sql | 24 + 3 files changed, 257 insertions(+), 223 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/01013_repeat_function.reference create mode 100644 dbms/tests/queries/0_stateless/01013_repeat_function.sql diff --git a/dbms/src/Functions/repeat.cpp b/dbms/src/Functions/repeat.cpp index 5c821e498ca..81e780fb53b 100644 --- a/dbms/src/Functions/repeat.cpp +++ b/dbms/src/Functions/repeat.cpp @@ -10,227 +10,200 @@ namespace DB { - namespace ErrorCodes - { - extern const int ILLEGAL_COLUMN; - extern const int ILLEGAL_TYPE_OF_ARGUMENT; - } - - struct RepeatImpl - { - static void vectorNonConstStr( - const ColumnString::Chars & data, - const ColumnString::Offsets & offsets, - ColumnString::Chars & res_data, - ColumnString::Offsets & res_offsets, - const UInt64 repeatTime) - { - UInt64 data_size = 0; - res_offsets.assign(offsets); - for (UInt64 i = 0; i < offsets.size(); ++i) - { - data_size += (offsets[i] - offsets[i - 1] - 1) * repeatTime + 1; - res_offsets[i] = data_size; - } - res_data.resize(data_size); - for (UInt64 i = 0; i < res_offsets.size(); ++i) - { - array(data.data() + offsets[i - 1], res_data.data() + res_offsets[i - 1], offsets[i] - offsets[i - 1], repeatTime); - } - } - - static void - vectorConst(const String & copy_str, const UInt64 repeatTime, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) - { - UInt64 data_size = copy_str.size() * repeatTime + 1; - res_data.resize(data_size); - res_offsets.resize_fill(1, data_size); - array((UInt8 *)copy_str.data(), res_data.data(), copy_str.size() + 1, repeatTime); - } - - template - static void vectorNonConst( - const ColumnString::Chars & data, - const ColumnString::Offsets & offsets, - ColumnString::Chars & res_data, - ColumnString::Offsets & res_offsets, - const PaddedPODArray & col_num) - { - UInt64 data_size = 0; - res_offsets.assign(offsets); - for (UInt64 i = 0; i < col_num.size(); ++i) - { - data_size += (offsets[i] - offsets[i - 1] - 1) * col_num[i] + 1; - res_offsets[i] = data_size; - } - res_data.resize(data_size); - for (UInt64 i = 0; i < col_num.size(); ++i) - { - array(data.data() + offsets[i - 1], res_data.data() + res_offsets[i - 1], offsets[i] - offsets[i - 1], col_num[i]); - } - } - - - template - static void vectorNonConstInteger( - const String & copy_str, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets, const PaddedPODArray & col_num) - { - UInt64 data_size = 0; - res_offsets.resize(col_num.size()); - UInt64 str_size = copy_str.size(); - for (UInt64 i = 0; i < col_num.size(); ++i) - { - data_size += str_size * col_num[i] + 1; - res_offsets[i] = data_size; - } - res_data.resize(data_size); - for (UInt64 i = 0; i < col_num.size(); ++i) - { - array((UInt8 *)copy_str.data(), res_data.data() + res_offsets[i - 1], str_size + 1, col_num[i]); - } - } - - private: - template - static void array(const UInt8 * src, UInt8 * dst, const UInt64 size, T repeatTime) - { - UInt64 i = 0; - do - { - memcpy(dst, src, size - 1); - dst += size - 1; - ++i; - } while (i < repeatTime); - *dst = 0; - } - }; - - - template - class FunctionRepeatImpl : public IFunction - { - template - static bool castType(const IDataType * type, F && f) - { - return castTypeToEither< - 
DataTypeUInt8, - DataTypeUInt16, - DataTypeUInt32, - DataTypeUInt64>(type, std::forward(f)); - } - - public: - static constexpr auto name = "repeat"; - static FunctionPtr create(const Context &) { return std::make_shared(); } - - String getName() const override { return name; } - - size_t getNumberOfArguments() const override { return 2; } - - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override - { - if (!isString(arguments[0])) - throw Exception( - "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - if (!isUnsignedInteger(arguments[1])) - throw Exception( - "Illegal type " + arguments[1]->getName() + " of argument of function 1" + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - return arguments[0]; - } - - bool useDefaultImplementationForConstants() const override { return true; } - - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override - { - const ColumnPtr strcolumn = block.getByPosition(arguments[0]).column; - const ColumnPtr numcolumn = block.getByPosition(arguments[1]).column; - - if (const ColumnString * col = checkAndGetColumn(strcolumn.get())) - { - if (const ColumnConst * scale_column_num = checkAndGetColumn(numcolumn.get())) - { - Field scale_field_num = scale_column_num->getField(); - UInt64 repeat_time = scale_field_num.get(); - auto col_res = ColumnString::create(); - Impl::vectorNonConstStr( - col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), repeat_time); - block.getByPosition(result).column = std::move(col_res); - } - else if (!castType( - block.getByPosition(arguments[1]).type.get(), [&](const auto & type) { - using DataType = std::decay_t; - using T0 = typename DataType::FieldType; - const ColumnVector * colnum = checkAndGetColumn>(numcolumn.get()); - if (col->size() > 1 && colnum->size() > 1 && col->size() != colnum->size()) - throw Exception( - "Column size doesn't match of argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN); - auto col_res = ColumnString::create(); - if (colnum->size() == 1 && col->size() >= 1) - { - UInt64 repeat_time = colnum->get64(0); - Impl::vectorNonConstStr( - col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), repeat_time); - } - else - { - Impl::vectorNonConst( - col->getChars(), - col->getOffsets(), - col_res->getChars(), - col_res->getOffsets(), - colnum->getData()); - } - block.getByPosition(result).column = std::move(col_res); - return 0; - })) - ; - else - throw Exception( - "Illegal column " + block.getByPosition(arguments[1]).column->getName() + " of argument of function2 " - + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - else if (const ColumnConst * scale_column_str = checkAndGetColumn(strcolumn.get())) - { - Field scale_field_str = scale_column_str->getField(); - String copy_str = scale_field_str.get(); - if (const ColumnConst * scale_column_num = checkAndGetColumn(numcolumn.get())) - { - Field scale_field_num = scale_column_num->getField(); - UInt64 repeat_time = scale_field_num.get(); - auto col_res = ColumnString::create(); - Impl::vectorConst(copy_str, repeat_time, col_res->getChars(), col_res->getOffsets()); - block.getByPosition(result).column = std::move(col_res); - } - else if (!castType(block.getByPosition(arguments[1]).type.get(), [&](const auto & type) { - using DataType = std::decay_t; - using T0 = typename DataType::FieldType; - const ColumnVector * colnum = checkAndGetColumn>(numcolumn.get()); - auto 
col_res = ColumnString::create(); - Impl::vectorNonConstInteger(copy_str, col_res->getChars(), col_res->getOffsets(), colnum->getData()); - block.getByPosition(result).column = std::move(col_res); - return 0; - })) - ; - else - throw Exception( - "Illegal column " + block.getByPosition(arguments[1]).column->getName() + " of argument of function2 " - + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - else - throw Exception( - "Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - }; - - using FunctionRepeat = FunctionRepeatImpl; - - void registerFunctionRepeat(FunctionFactory & factory) - { - factory.registerFunction(); - } +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +struct RepeatImpl +{ + static void vectorNonConstStr( + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets, + const UInt64 & repeat_time) + { + UInt64 data_size = 0; + res_offsets.assign(offsets); + for (UInt64 i = 0; i < offsets.size(); ++i) + { + data_size += (offsets[i] - offsets[i - 1] - 1) * repeat_time + 1; + res_offsets[i] = data_size; + } + res_data.resize(data_size); + for (UInt64 i = 0; i < res_offsets.size(); ++i) + { + array(data.data() + offsets[i - 1], res_data.data() + res_offsets[i - 1], offsets[i] - offsets[i - 1], repeat_time); + } + } + + static void + vectorConst(const String & copy_str, const UInt64 & repeat_time, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) + { + UInt64 data_size = copy_str.size() * repeat_time + 1; + res_data.resize(data_size); + res_offsets.resize_fill(1, data_size); + array(reinterpret_cast(const_cast(copy_str.data())), res_data.data(), copy_str.size() + 1, repeat_time); + } + + template + static void vectorNonConst( + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets, + const PaddedPODArray & col_num) + { + UInt64 data_size = 0; + res_offsets.assign(offsets); + for (UInt64 i = 0; i < col_num.size(); ++i) + { + data_size += (offsets[i] - offsets[i - 1] - 1) * col_num[i] + 1; + res_offsets[i] = data_size; + } + res_data.resize(data_size); + for (UInt64 i = 0; i < col_num.size(); ++i) + { + array(data.data() + offsets[i - 1], res_data.data() + res_offsets[i - 1], offsets[i] - offsets[i - 1], col_num[i]); + } + } + + template + static void vectorNonConstInteger( + const String & copy_str, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets, const PaddedPODArray & col_num) + { + UInt64 data_size = 0; + res_offsets.resize(col_num.size()); + UInt64 str_size = copy_str.size(); + UInt64 col_size = col_num.size(); + for (UInt64 i = 0; i < col_size; ++i) + { + data_size += str_size * col_num[i] + 1; + res_offsets[i] = data_size; + } + res_data.resize(data_size); + for (UInt64 i = 0; i < col_size; ++i) + { + array( + reinterpret_cast(const_cast(copy_str.data())), + res_data.data() + res_offsets[i - 1], + str_size + 1, + col_num[i]); + } + } + +private: + static void array(const UInt8 * src, UInt8 * dst, const UInt64 & size, const UInt64 & repeat_time) + { + for (UInt64 i = 0; i < repeat_time; ++i) + { + memcpy(dst, src, size - 1); + dst += size - 1; + } + *dst = 0; + } +}; + +template +class FunctionRepeatImpl : public IFunction +{ + template + static bool castType(const IDataType * type, F && f) + { + return 
castTypeToEither(type, std::forward(f)); + } + +public: + static constexpr auto name = "repeat"; + static FunctionPtr create(const Context &) { return std::make_shared(); } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 2; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isString(arguments[0])) + throw Exception( + "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + if (!isUnsignedInteger(arguments[1])) + throw Exception( + "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + return arguments[0]; + } + + bool useDefaultImplementationForConstants() const override { return true; } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override + { + auto & strcolumn = block.getByPosition(arguments[0]).column; + auto & numcolumn = block.getByPosition(arguments[1]).column; + + if (const ColumnString * col = checkAndGetColumn(strcolumn.get())) + { + if (const ColumnConst * scale_column_num = checkAndGetColumn(numcolumn.get())) + { + Field scale_field_num = scale_column_num->getField(); + UInt64 repeat_time = scale_field_num.get(); + auto col_res = ColumnString::create(); + Impl::vectorNonConstStr(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), repeat_time); + block.getByPosition(result).column = std::move(col_res); + } + else if (!castType(block.getByPosition(arguments[1]).type.get(), [&](const auto & type) + { + using DataType = std::decay_t; + using T0 = typename DataType::FieldType; + const ColumnVector * colnum = checkAndGetColumn>(numcolumn.get()); + auto col_res = ColumnString::create(); + Impl::vectorNonConst(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), colnum->getData()); + block.getByPosition(result).column = std::move(col_res); + return 0; + })); + else + throw Exception( + "Illegal column " + block.getByPosition(arguments[1]).column->getName() + " of argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else if (const ColumnConst * scale_column_str = checkAndGetColumn(strcolumn.get())) + { + Field scale_field_str = scale_column_str->getField(); + String copy_str = scale_field_str.get(); + if (const ColumnConst * scale_column_num = checkAndGetColumn(numcolumn.get())) + { + Field scale_field_num = scale_column_num->getField(); + UInt64 repeat_time = scale_field_num.get(); + auto col_res = ColumnString::create(); + Impl::vectorConst(copy_str, repeat_time, col_res->getChars(), col_res->getOffsets()); + block.getByPosition(result).column = std::move(col_res); + } + else if (!castType(block.getByPosition(arguments[1]).type.get(), [&](const auto & type) + { + using DataType = std::decay_t; + using T0 = typename DataType::FieldType; + const ColumnVector * colnum = checkAndGetColumn>(numcolumn.get()); + auto col_res = ColumnString::create(); + Impl::vectorNonConstInteger(copy_str, col_res->getChars(), col_res->getOffsets(), colnum->getData()); + block.getByPosition(result).column = std::move(col_res); + return 0; + })); + else + throw Exception( + "Illegal column " + block.getByPosition(arguments[1]).column->getName() + " of argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else + throw Exception( + "Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of argument of function " + 
getName(), + ErrorCodes::ILLEGAL_COLUMN); + } +}; + +using FunctionRepeat = FunctionRepeatImpl; + +void registerFunctionRepeat(FunctionFactory & factory) +{ + factory.registerFunction(); +} } diff --git a/dbms/tests/queries/0_stateless/01013_repeat_function.reference b/dbms/tests/queries/0_stateless/01013_repeat_function.reference new file mode 100644 index 00000000000..7841bbd52f9 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01013_repeat_function.reference @@ -0,0 +1,37 @@ +abcabcabcabcabcabcabcabcabcabc +abcabcabc +sdfggsdfgg +xywq + +abcabcabcabcabcabcabcabcabcabcabcabc +sdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfgg +xywqxywqxywqxywq +plkfplkfplkfplkfplkf +abcabcabcabc +sdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfgg +xywqxywqxywqxywqxywqxywqxywqxywqxywq +plkfplkfplkfplkfplkfplkfplkf +abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc +sdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfgg +xywqxywqxywqxywqxywq +plkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkfplkf +abcabcabcabcabcabcabcabcabcabc +sdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfggsdfgg +xywqxywqxywqxywqxywqxywqxywqxywqxywqxywq +plkfplkfplkfplkfplkfplkfplkfplkfplkfplkf +abcabcabc +abcabc +abc + +abcabcabcabcabcabcabcabcabcabcabcabc +abcabcabcabcabcabcabcabcabcabc +abcabcabcabc +abcabcabcabcabc +abcabcabcabc +abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc +abcabcabcabcabcabcabcabcabc +abcabcabcabcabcabcabc +abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc 
+abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc +abcabcabcabcabc +abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc diff --git a/dbms/tests/queries/0_stateless/01013_repeat_function.sql b/dbms/tests/queries/0_stateless/01013_repeat_function.sql new file mode 100644 index 00000000000..5de0e7a64e5 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01013_repeat_function.sql @@ -0,0 +1,24 @@ +SELECT repeat('abc', 10); +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + strings String, + u8 UInt8, + u16 UInt16, + u32 UInt32, + u64 UInt64 +)ENGINE = Memory(); + +INSERT INTO defaults values ('abc', 3, 12, 4, 56) ('sdfgg', 2, 10, 21, 200) ('xywq', 1, 4, 9, 5) ('plkf', 0, 5, 7,77); + +SELECT repeat(strings, u8) FROM defaults; +SELECT repeat(strings, u16) FROM defaults; +SELECT repeat(strings, u32) from defaults; +SELECT repeat(strings, u64) FROM defaults; +SELECT repeat(strings, 10) FROM defaults; +SELECT repeat('abc', u8) FROM defaults; +SELECT repeat('abc', u16) FROM defaults; +SELECT repeat('abc', u32) FROM defaults; +SELECT repeat('abc', u64) FROM defaults; + +DROP TABLE defaults; From 52f14c1ead5d220411291d3ba146b513572a6a0b Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 22 Sep 2019 10:39:50 +0300 Subject: [PATCH 161/309] Update HashedDictionary.cpp (fixed rename in return type for SFINAE) --- dbms/src/Dictionaries/HashedDictionary.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dbms/src/Dictionaries/HashedDictionary.cpp b/dbms/src/Dictionaries/HashedDictionary.cpp index 9aa111946ee..7946c87dff8 100644 --- a/dbms/src/Dictionaries/HashedDictionary.cpp +++ b/dbms/src/Dictionaries/HashedDictionary.cpp @@ -9,12 +9,12 @@ namespace /// NOTE: Trailing return type is explicitly specified for SFINAE. 
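/// The two overload pairs below are told apart purely by SFINAE: the decltype in the trailing
/// return type discards the overload whose accessor (.first/.second vs getFirst()/getSecond())
/// does not exist for the given value type.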
/// google::sparse_hash_map -template auto first(const T & value) -> decltype(value.first) { return lhs.first; } -template auto second(const T & value) -> decltype(value.second) { return lhs.second; } +template auto first(const T & value) -> decltype(value.first) { return value.first; } +template auto second(const T & value) -> decltype(value.second) { return value.second; } /// HashMap -template auto first(const T & value) -> decltype(value.getFirst()) { return lhs.getFirst(); } -template auto second(const T & value) -> decltype(value.getSecond()) { return lhs.getSecond(); } +template auto first(const T & value) -> decltype(value.getFirst()) { return value.getFirst(); } +template auto second(const T & value) -> decltype(value.getSecond()) { return value.getSecond(); } } From b174833bdf27e6480bd82e72da453323f3a46a4c Mon Sep 17 00:00:00 2001 From: Eldar Zaitov Date: Sun, 22 Sep 2019 13:34:09 +0300 Subject: [PATCH 162/309] Initial fuzzing support with libfuzzer --- CMakeLists.txt | 8 +++++ cmake/sanitize.cmake | 11 +++++++ dbms/src/Compression/tests/CMakeLists.txt | 6 ++++ .../tests/compressed_buffer_fuzz.cpp | 32 +++++++++++++++++++ release | 3 ++ 5 files changed, 60 insertions(+) create mode 100644 dbms/src/Compression/tests/compressed_buffer_fuzz.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d82da6838a..448639c17d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,6 +91,14 @@ if (USE_STATIC_LIBRARIES) list(REVERSE CMAKE_FIND_LIBRARY_SUFFIXES) endif () +option (ENABLE_FUZZING "Enables fuzzing instrumentation" OFF) + +if (ENABLE_FUZZING) + message (STATUS "Fuzzing instrumentation enabled") + set (WITH_COVERAGE ON) + set (SANITIZE "libfuzzer") +endif() + include (cmake/sanitize.cmake) diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index 196a66e6845..381c186212b 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -42,6 +42,17 @@ if (SANITIZE) if (MAKE_STATIC_LIBRARIES AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libubsan") endif () + + elseif (SANITIZE STREQUAL "libfuzzer") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} -fsanitize=fuzzer-no-link,address,signed-integer-overflow -fsanitize-address-use-after-scope") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} -fsanitize=fuzzer-no-link,address,signed-integer-overflow -fsanitize-address-use-after-scope") + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=fuzzer-no-link,address,signed-integer-overflow -fsanitize-address-use-after-scope") + endif() + if (MAKE_STATIC_LIBRARIES AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libasan") + endif () + set (LIBFUZZER_CMAKE_CXX_FLAGS "-fsanitize=fuzzer,address,signed-integer-overflow -fsanitize-address-use-after-scope") else () message (FATAL_ERROR "Unknown sanitizer type: ${SANITIZE}") endif () diff --git a/dbms/src/Compression/tests/CMakeLists.txt b/dbms/src/Compression/tests/CMakeLists.txt index 3cfc0ccb7dc..6b13d4eb5cd 100644 --- a/dbms/src/Compression/tests/CMakeLists.txt +++ b/dbms/src/Compression/tests/CMakeLists.txt @@ -3,3 +3,9 @@ target_link_libraries (compressed_buffer PRIVATE dbms) add_executable (cached_compressed_read_buffer cached_compressed_read_buffer.cpp) target_link_libraries (cached_compressed_read_buffer PRIVATE dbms) + +if (ENABLE_FUZZING) + add_executable (compressed_buffer_fuzz compressed_buffer_fuzz.cpp) + target_link_libraries (compressed_buffer_fuzz 
PRIVATE dbms) + set_target_properties(compressed_buffer_fuzz PROPERTIES LINK_FLAGS ${LIBFUZZER_CMAKE_CXX_FLAGS}) +endif () diff --git a/dbms/src/Compression/tests/compressed_buffer_fuzz.cpp b/dbms/src/Compression/tests/compressed_buffer_fuzz.cpp new file mode 100644 index 00000000000..ff4907dcb2d --- /dev/null +++ b/dbms/src/Compression/tests/compressed_buffer_fuzz.cpp @@ -0,0 +1,32 @@ +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + try { + std::string Str(reinterpret_cast(data), size); + + DB::ReadBufferFromString from(Str); + DB::CompressedReadBuffer in{from}; + } + catch (const DB::Exception & e) + { + std::cerr << e.what() << ", " << e.displayText() << std::endl; + return 1; + } + + return 0; +} diff --git a/release b/release index 270c16f4c36..b5e3f05cd95 100755 --- a/release +++ b/release @@ -87,6 +87,9 @@ then elif [[ "$SANITIZER" == "thread" ]]; then VERSION_POSTFIX+="+tsan" elif [[ "$SANITIZER" == "memory" ]]; then VERSION_POSTFIX+="+msan" elif [[ "$SANITIZER" == "undefined" ]]; then VERSION_POSTFIX+="+ubsan" + elif [[ "$SANITIZER" == "libfuzzer" ]]; then + VERSION_POSTFIX+="+libfuzzer" + MALLOC_OPTS="-DENABLE_TCMALLOC=0 -DENABLE_JEMALLOC=0" else echo "Unknown value of SANITIZER variable: $SANITIZER" exit 3 From 64d49e4565d7beb57c61eb295a5eb6cb81f5b4cd Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Sun, 22 Sep 2019 10:42:47 +0000 Subject: [PATCH 163/309] Minor review fixes. --- dbms/src/IO/ReadBufferFromS3.cpp | 4 +- dbms/src/IO/WriteBufferFromS3.cpp | 61 +++--- .../tests/integration/test_storage_s3/test.py | 125 ++++++------ .../test_storage_s3/test_server.py | 188 +++++++++--------- 4 files changed, 192 insertions(+), 186 deletions(-) diff --git a/dbms/src/IO/ReadBufferFromS3.cpp b/dbms/src/IO/ReadBufferFromS3.cpp index 7fcb7a0ca41..ae09f0fb189 100644 --- a/dbms/src/IO/ReadBufferFromS3.cpp +++ b/dbms/src/IO/ReadBufferFromS3.cpp @@ -5,11 +5,11 @@ #include -#define DEFAULT_S3_MAX_FOLLOW_GET_REDIRECT 2 - namespace DB { +const int DEFAULT_S3_MAX_FOLLOW_GET_REDIRECT = 2; + ReadBufferFromS3::ReadBufferFromS3(Poco::URI uri_, const ConnectionTimeouts & timeouts, const Poco::Net::HTTPBasicCredentials & credentials, diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 5b6f9fdff4c..1ef6f3b19a0 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -11,12 +11,13 @@ #include -#define DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT 2 -#define S3_SOFT_MAX_PARTS 10000 - namespace DB { +const int DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT = 2; +const int S3_WARN_MAX_PARTS = 10000; + + namespace ErrorCodes { extern const int INCORRECT_DATA; @@ -92,34 +93,33 @@ void WriteBufferFromS3::initiate() { // See https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadInitiate.html Poco::Net::HTTPResponse response; - std::unique_ptr request; + std::unique_ptr request_ptr; HTTPSessionPtr session; std::istream * istr = nullptr; /// owned by session Poco::URI initiate_uri = uri; initiate_uri.setRawQuery("uploads"); - auto params = uri.getQueryParameters(); - for (auto it = params.begin(); it != params.end(); ++it) + for (auto & param: uri.getQueryParameters()) { - initiate_uri.addQueryParameter(it->first, it->second); + initiate_uri.addQueryParameter(param.first, param.second); } for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT; ++i) { session = 
makeHTTPSession(initiate_uri, timeouts); - request = std::make_unique(Poco::Net::HTTPRequest::HTTP_POST, initiate_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); - request->setHost(initiate_uri.getHost()); // use original, not resolved host name in header + request_ptr = std::make_unique(Poco::Net::HTTPRequest::HTTP_POST, initiate_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); + request_ptr->setHost(initiate_uri.getHost()); // use original, not resolved host name in header if (auth_request.hasCredentials()) { Poco::Net::HTTPBasicCredentials credentials(auth_request); - credentials.authenticate(*request); + credentials.authenticate(*request_ptr); } - request->setContentLength(0); + request_ptr->setContentLength(0); LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << initiate_uri.toString()); - session->sendRequest(*request); + session->sendRequest(*request_ptr); istr = &session->receiveResponse(response); @@ -134,7 +134,7 @@ void WriteBufferFromS3::initiate() initiate_uri = location_iterator->second; } - assertResponseIsOk(*request, response, *istr); + assertResponseIsOk(*request_ptr, response, *istr); Poco::XML::InputSource src(*istr); Poco::XML::DOMParser parser; @@ -156,37 +156,38 @@ void WriteBufferFromS3::writePart(const String & data) { // See https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadUploadPart.html Poco::Net::HTTPResponse response; - std::unique_ptr request; + std::unique_ptr request_ptr; HTTPSessionPtr session; std::istream * istr = nullptr; /// owned by session Poco::URI part_uri = uri; part_uri.addQueryParameter("partNumber", std::to_string(part_tags.size() + 1)); part_uri.addQueryParameter("uploadId", upload_id); - if (part_tags.size() == S3_SOFT_MAX_PARTS) + if (part_tags.size() == S3_WARN_MAX_PARTS) { + // Don't throw exception here by ourselves but leave the decision to take by S3 server. LOG_WARNING(&Logger::get("WriteBufferFromS3"), "Maximum part number in S3 protocol has reached (too much parts). Server may not accept this whole upload."); } for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT; ++i) { session = makeHTTPSession(part_uri, timeouts); - request = std::make_unique(Poco::Net::HTTPRequest::HTTP_PUT, part_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); - request->setHost(part_uri.getHost()); // use original, not resolved host name in header + request_ptr = std::make_unique(Poco::Net::HTTPRequest::HTTP_PUT, part_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); + request_ptr->setHost(part_uri.getHost()); // use original, not resolved host name in header if (auth_request.hasCredentials()) { Poco::Net::HTTPBasicCredentials credentials(auth_request); - credentials.authenticate(*request); + credentials.authenticate(*request_ptr); } - request->setExpectContinue(true); + request_ptr->setExpectContinue(true); - request->setContentLength(data.size()); + request_ptr->setContentLength(data.size()); LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << part_uri.toString()); - std::ostream & ostr = session->sendRequest(*request); + std::ostream & ostr = session->sendRequest(*request_ptr); if (session->peekResponse(response)) { // Received 100-continue. 
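// A rough sketch of the S3 multipart-upload exchange that initiate()/writePart()/complete() drive
// (the bucket/key path and the "..." are illustrative, not literal values from this patch):
//
//   initiate():  POST /bucket/key?uploads                    -> response body carries an <UploadId>
//   writePart(): PUT  /bucket/key?partNumber=N&uploadId=...  -> each part is identified by its ETag header
//   complete():  POST /bucket/key?uploadId=...               -> body lists every <PartNumber>/<ETag> pair
//
// S3 caps a multipart upload at 10000 parts, which is why writePart() only logs a warning when
// part_tags.size() reaches S3_WARN_MAX_PARTS instead of failing the upload itself.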
@@ -206,7 +207,7 @@ void WriteBufferFromS3::writePart(const String & data) part_uri = location_iterator->second; } - assertResponseIsOk(*request, response, *istr); + assertResponseIsOk(*request_ptr, response, *istr); auto etag_iterator = response.find("ETag"); if (etag_iterator == response.end()) @@ -221,7 +222,7 @@ void WriteBufferFromS3::complete() { // See https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadComplete.html Poco::Net::HTTPResponse response; - std::unique_ptr request; + std::unique_ptr request_ptr; HTTPSessionPtr session; std::istream * istr = nullptr; /// owned by session Poco::URI complete_uri = uri; @@ -244,22 +245,22 @@ void WriteBufferFromS3::complete() for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT; ++i) { session = makeHTTPSession(complete_uri, timeouts); - request = std::make_unique(Poco::Net::HTTPRequest::HTTP_POST, complete_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); - request->setHost(complete_uri.getHost()); // use original, not resolved host name in header + request_ptr = std::make_unique(Poco::Net::HTTPRequest::HTTP_POST, complete_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); + request_ptr->setHost(complete_uri.getHost()); // use original, not resolved host name in header if (auth_request.hasCredentials()) { Poco::Net::HTTPBasicCredentials credentials(auth_request); - credentials.authenticate(*request); + credentials.authenticate(*request_ptr); } - request->setExpectContinue(true); + request_ptr->setExpectContinue(true); - request->setContentLength(data.size()); + request_ptr->setContentLength(data.size()); LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << complete_uri.toString()); - std::ostream & ostr = session->sendRequest(*request); + std::ostream & ostr = session->sendRequest(*request_ptr); if (session->peekResponse(response)) { // Received 100-continue. 
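// Note on the Expect: 100-continue handshake above: the payload is only written once the server
// has accepted the headers, so if S3 answers with a 307 TemporaryRedirect instead, the request can
// be re-sent to the redirected endpoint without the whole body having been transmitted first.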
@@ -279,7 +280,7 @@ void WriteBufferFromS3::complete() complete_uri = location_iterator->second; } - assertResponseIsOk(*request, response, *istr); + assertResponseIsOk(*request_ptr, response, *istr); } } diff --git a/dbms/tests/integration/test_storage_s3/test.py b/dbms/tests/integration/test_storage_s3/test.py index 2013daa6ae6..88be4640388 100644 --- a/dbms/tests/integration/test_storage_s3/test.py +++ b/dbms/tests/integration/test_storage_s3/test.py @@ -15,7 +15,7 @@ logging.getLogger().addHandler(logging.StreamHandler()) def get_communication_data(started_cluster): - conn = httplib.HTTPConnection(started_cluster.instances['dummy'].ip_address, started_cluster.communication_port) + conn = httplib.HTTPConnection(started_cluster.instances["dummy"].ip_address, started_cluster.communication_port) conn.request("GET", "/") r = conn.getresponse() raw_data = r.read() @@ -24,7 +24,7 @@ def get_communication_data(started_cluster): def put_communication_data(started_cluster, body): - conn = httplib.HTTPConnection(started_cluster.instances['dummy'].ip_address, started_cluster.communication_port) + conn = httplib.HTTPConnection(started_cluster.instances["dummy"].ip_address, started_cluster.communication_port) conn.request("PUT", "/", body) r = conn.getresponse() conn.close() @@ -34,29 +34,29 @@ def put_communication_data(started_cluster, body): def started_cluster(): try: cluster = ClickHouseCluster(__file__) - instance = cluster.add_instance('dummy', config_dir="configs", main_configs=['configs/min_chunk_size.xml']) + instance = cluster.add_instance("dummy", config_dir="configs", main_configs=["configs/min_chunk_size.xml"]) cluster.start() cluster.communication_port = 10000 - instance.copy_file_to_container(os.path.join(os.path.dirname(__file__), 'test_server.py'), 'test_server.py') - cluster.bucket = 'abc' - instance.exec_in_container(['python', 'test_server.py', str(cluster.communication_port), cluster.bucket], detach=True) + instance.copy_file_to_container(os.path.join(os.path.dirname(__file__), "test_server.py"), "test_server.py") + cluster.bucket = "abc" + instance.exec_in_container(["python", "test_server.py", str(cluster.communication_port), cluster.bucket], detach=True) cluster.mock_host = instance.ip_address for i in range(10): try: data = get_communication_data(cluster) - cluster.redirecting_to_http_port = data['redirecting_to_http_port'] - cluster.preserving_data_port = data['preserving_data_port'] - cluster.multipart_preserving_data_port = data['multipart_preserving_data_port'] - cluster.redirecting_preserving_data_port = data['redirecting_preserving_data_port'] + cluster.redirecting_to_http_port = data["redirecting_to_http_port"] + cluster.preserving_data_port = data["preserving_data_port"] + cluster.multipart_preserving_data_port = data["multipart_preserving_data_port"] + cluster.redirecting_preserving_data_port = data["redirecting_preserving_data_port"] except: logging.error(traceback.format_exc()) time.sleep(0.5) else: break else: - assert False, 'Could not initialize mock server' + assert False, "Could not initialize mock server" yield cluster @@ -65,92 +65,97 @@ def started_cluster(): def run_query(instance, query, stdin=None): - logging.info('Running query "{}"...'.format(query)) + logging.info("Running query '{}'...".format(query)) result = instance.query(query, stdin=stdin) - logging.info('Query finished') + logging.info("Query finished") return result -def test_get_with_redirect(started_cluster): - instance = started_cluster.instances['dummy'] - format = 'column1 UInt32, 
column2 UInt32, column3 UInt32' - put_communication_data(started_cluster, '=== Get with redirect test ===') +def test_get_with_redirect(started_cluster): + instance = started_cluster.instances["dummy"] + format = "column1 UInt32, column2 UInt32, column3 UInt32" + + put_communication_data(started_cluster, "=== Get with redirect test ===") query = "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(started_cluster.mock_host, started_cluster.redirecting_to_http_port, format) stdout = run_query(instance, query) assert list(map(str.split, stdout.splitlines())) == [ - ['42', '87', '44', '160776'], - ['55', '33', '81', '147015'], - ['1', '0', '9', '0'], + ["42", "87", "44", "160776"], + ["55", "33", "81", "147015"], + ["1", "0", "9", "0"], ] -def test_put(started_cluster): - instance = started_cluster.instances['dummy'] - format = 'column1 UInt32, column2 UInt32, column3 UInt32' - logging.info('Phase 3') - put_communication_data(started_cluster, '=== Put test ===') - values = '(1, 2, 3), (3, 2, 1), (78, 43, 45)' +def test_put(started_cluster): + instance = started_cluster.instances["dummy"] + format = "column1 UInt32, column2 UInt32, column3 UInt32" + + logging.info("Phase 3") + put_communication_data(started_cluster, "=== Put test ===") + values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)" put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(started_cluster.mock_host, started_cluster.preserving_data_port, started_cluster.bucket, format, values) run_query(instance, put_query) data = get_communication_data(started_cluster) - received_data_completed = data['received_data_completed'] - received_data = data['received_data'] - finalize_data = data['finalize_data'] - finalize_data_query = data['finalize_data_query'] - assert received_data[-1].decode() == '1,2,3\n3,2,1\n78,43,45\n' + received_data_completed = data["received_data_completed"] + received_data = data["received_data"] + finalize_data = data["finalize_data"] + finalize_data_query = data["finalize_data_query"] + assert received_data[-1].decode() == "1,2,3\n3,2,1\n78,43,45\n" assert received_data_completed - assert finalize_data == '1hello-etag' - assert finalize_data_query == 'uploadId=TEST' + assert finalize_data == "1hello-etag" + assert finalize_data_query == "uploadId=TEST" + def test_put_csv(started_cluster): - instance = started_cluster.instances['dummy'] - format = 'column1 UInt32, column2 UInt32, column3 UInt32' + instance = started_cluster.instances["dummy"] + format = "column1 UInt32, column2 UInt32, column3 UInt32" - put_communication_data(started_cluster, '=== Put test CSV ===') + put_communication_data(started_cluster, "=== Put test CSV ===") put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') format CSV".format(started_cluster.mock_host, started_cluster.preserving_data_port, started_cluster.bucket, format) - csv_data = '8,9,16\n11,18,13\n22,14,2\n' + csv_data = "8,9,16\n11,18,13\n22,14,2\n" run_query(instance, put_query, stdin=csv_data) data = get_communication_data(started_cluster) - received_data_completed = data['received_data_completed'] - received_data = data['received_data'] - finalize_data = data['finalize_data'] - finalize_data_query = data['finalize_data_query'] + received_data_completed = data["received_data_completed"] + received_data = data["received_data"] + finalize_data = data["finalize_data"] + finalize_data_query = data["finalize_data_query"] assert received_data[-1].decode() == csv_data assert 
received_data_completed - assert finalize_data == '1hello-etag' - assert finalize_data_query == 'uploadId=TEST' + assert finalize_data == "1hello-etag" + assert finalize_data_query == "uploadId=TEST" + def test_put_with_redirect(started_cluster): - instance = started_cluster.instances['dummy'] - format = 'column1 UInt32, column2 UInt32, column3 UInt32' + instance = started_cluster.instances["dummy"] + format = "column1 UInt32, column2 UInt32, column3 UInt32" - put_communication_data(started_cluster, '=== Put with redirect test ===') - other_values = '(1, 1, 1), (1, 1, 1), (11, 11, 11)' + put_communication_data(started_cluster, "=== Put with redirect test ===") + other_values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)" query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') values {}".format(started_cluster.mock_host, started_cluster.redirecting_preserving_data_port, started_cluster.bucket, format, other_values) run_query(instance, query) query = "select *, column1*column2*column3 from s3('http://{}:{}/{}/test.csv', 'CSV', '{}')".format(started_cluster.mock_host, started_cluster.preserving_data_port, started_cluster.bucket, format) stdout = run_query(instance, query) assert list(map(str.split, stdout.splitlines())) == [ - ['1', '1', '1', '1'], - ['1', '1', '1', '1'], - ['11', '11', '11', '1331'], + ["1", "1", "1", "1"], + ["1", "1", "1", "1"], + ["11", "11", "11", "1331"], ] data = get_communication_data(started_cluster) - received_data = data['received_data'] - assert received_data[-1].decode() == '1,1,1\n1,1,1\n11,11,11\n' + received_data = data["received_data"] + assert received_data[-1].decode() == "1,1,1\n1,1,1\n11,11,11\n" + def test_multipart_put(started_cluster): - instance = started_cluster.instances['dummy'] - format = 'column1 UInt32, column2 UInt32, column3 UInt32' + instance = started_cluster.instances["dummy"] + format = "column1 UInt32, column2 UInt32, column3 UInt32" - put_communication_data(started_cluster, '=== Multipart test ===') + put_communication_data(started_cluster, "=== Multipart test ===") long_data = [[i, i+1, i+2] for i in range(100000)] - long_values = ''.join([ '{},{},{}\n'.format(x,y,z) for x, y, z in long_data ]) + long_values = "".join([ "{},{},{}\n".format(x,y,z) for x, y, z in long_data ]) put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') format CSV".format(started_cluster.mock_host, started_cluster.multipart_preserving_data_port, started_cluster.bucket, format) run_query(instance, put_query, stdin=long_values) data = get_communication_data(started_cluster) - assert 'multipart_received_data' in data - received_data = data['multipart_received_data'] - assert received_data[-1].decode() == ''.join([ '{},{},{}\n'.format(x, y, z) for x, y, z in long_data ]) - assert 1 < data['multipart_parts'] < 10000 + assert "multipart_received_data" in data + received_data = data["multipart_received_data"] + assert received_data[-1].decode() == "".join([ "{},{},{}\n".format(x, y, z) for x, y, z in long_data ]) + assert 1 < data["multipart_parts"] < 10000 diff --git a/dbms/tests/integration/test_storage_s3/test_server.py b/dbms/tests/integration/test_storage_s3/test_server.py index 3c10445566a..09dfa1ca958 100644 --- a/dbms/tests/integration/test_storage_s3/test_server.py +++ b/dbms/tests/integration/test_storage_s3/test_server.py @@ -25,8 +25,8 @@ import xml.etree.ElementTree logging.getLogger().setLevel(logging.INFO) -file_handler = logging.FileHandler('/var/log/clickhouse-server/test-server.log', 'a', encoding='utf-8') 
-file_handler.setFormatter(logging.Formatter('%(asctime)s %(message)s')) +file_handler = logging.FileHandler("/var/log/clickhouse-server/test-server.log", "a", encoding="utf-8") +file_handler.setFormatter(logging.Formatter("%(asctime)s %(message)s")) logging.getLogger().addHandler(file_handler) logging.getLogger().addHandler(logging.StreamHandler()) @@ -54,21 +54,21 @@ def GetFreeTCPPortsAndIP(n): ), localhost = GetFreeTCPPortsAndIP(5) data = { - 'redirecting_to_http_port': redirecting_to_http_port, - 'preserving_data_port': preserving_data_port, - 'multipart_preserving_data_port': multipart_preserving_data_port, - 'redirecting_preserving_data_port': redirecting_preserving_data_port, + "redirecting_to_http_port": redirecting_to_http_port, + "preserving_data_port": preserving_data_port, + "multipart_preserving_data_port": multipart_preserving_data_port, + "redirecting_preserving_data_port": redirecting_preserving_data_port, } class SimpleHTTPServerHandler(BaseHTTPRequestHandler): def do_GET(self): - logging.info('GET {}'.format(self.path)) - if self.path == '/milovidov/test.csv': + logging.info("GET {}".format(self.path)) + if self.path == "/milovidov/test.csv": self.send_response(200) - self.send_header('Content-type', 'text/plain') + self.send_header("Content-type", "text/plain") self.end_headers() - self.wfile.write('42,87,44\n55,33,81\n1,0,9\n') + self.wfile.write("42,87,44\n55,33,81\n1,0,9\n") else: self.send_response(404) self.end_headers() @@ -78,27 +78,27 @@ class SimpleHTTPServerHandler(BaseHTTPRequestHandler): class RedirectingToHTTPHandler(BaseHTTPRequestHandler): def do_GET(self): self.send_response(307) - self.send_header('Content-type', 'text/xml') - self.send_header('Location', 'http://{}:{}/milovidov/test.csv'.format(localhost, simple_server_port)) + self.send_header("Content-type", "text/xml") + self.send_header("Location", "http://{}:{}/milovidov/test.csv".format(localhost, simple_server_port)) self.end_headers() - self.wfile.write(r''' + self.wfile.write(r""" TemporaryRedirect Please re-send this request to the specified temporary endpoint. Continue to use the original request endpoint for future requests. storage.yandexcloud.net -'''.encode()) +
""".encode()) self.finish() class PreservingDataHandler(BaseHTTPRequestHandler): - protocol_version = 'HTTP/1.1' + protocol_version = "HTTP/1.1" def parse_request(self): result = BaseHTTPRequestHandler.parse_request(self) # Adaptation to Python 3. if sys.version_info.major == 2 and result == True: - expect = self.headers.get('Expect', "") + expect = self.headers.get("Expect", "") if (expect.lower() == "100-continue" and self.protocol_version >= "HTTP/1.1" and self.request_version >= "HTTP/1.1"): if not self.handle_expect_100(): return False @@ -109,12 +109,12 @@ class PreservingDataHandler(BaseHTTPRequestHandler): if code in self.responses: message = self.responses[code][0] else: - message = '' - if self.request_version != 'HTTP/0.9': + message = "" + if self.request_version != "HTTP/0.9": self.wfile.write("%s %d %s\r\n" % (self.protocol_version, code, message)) def handle_expect_100(self): - logging.info('Received Expect-100') + logging.info("Received Expect-100") self.send_response_only(100) self.end_headers() return True @@ -122,37 +122,37 @@ class PreservingDataHandler(BaseHTTPRequestHandler): def do_POST(self): self.send_response(200) query = urlparse.urlparse(self.path).query - logging.info('PreservingDataHandler POST ?' + query) - if query == 'uploads': - post_data = r''' -TEST'''.encode() - self.send_header('Content-length', str(len(post_data))) - self.send_header('Content-type', 'text/plain') + logging.info("PreservingDataHandler POST ?" + query) + if query == "uploads": + post_data = r""" +TEST""".encode() + self.send_header("Content-length", str(len(post_data))) + self.send_header("Content-type", "text/plain") self.end_headers() self.wfile.write(post_data) else: - post_data = self.rfile.read(int(self.headers.get('Content-Length'))) - self.send_header('Content-type', 'text/plain') + post_data = self.rfile.read(int(self.headers.get("Content-Length"))) + self.send_header("Content-type", "text/plain") self.end_headers() - data['received_data_completed'] = True - data['finalize_data'] = post_data - data['finalize_data_query'] = query + data["received_data_completed"] = True + data["finalize_data"] = post_data + data["finalize_data_query"] = query self.finish() def do_PUT(self): self.send_response(200) - self.send_header('Content-type', 'text/plain') - self.send_header('ETag', 'hello-etag') + self.send_header("Content-type", "text/plain") + self.send_header("ETag", "hello-etag") self.end_headers() query = urlparse.urlparse(self.path).query path = urlparse.urlparse(self.path).path - logging.info('Content-Length = ' + self.headers.get('Content-Length')) - logging.info('PUT ' + query) - assert self.headers.get('Content-Length') - assert self.headers['Expect'] == '100-continue' + logging.info("Content-Length = " + self.headers.get("Content-Length")) + logging.info("PUT " + query) + assert self.headers.get("Content-Length") + assert self.headers["Expect"] == "100-continue" put_data = self.rfile.read() - data.setdefault('received_data', []).append(put_data) - logging.info('PUT to {}'.format(path)) + data.setdefault("received_data", []).append(put_data) + logging.info("PUT to {}".format(path)) self.server.storage[path] = put_data self.finish() @@ -160,8 +160,8 @@ class PreservingDataHandler(BaseHTTPRequestHandler): path = urlparse.urlparse(self.path).path if path in self.server.storage: self.send_response(200) - self.send_header('Content-type', 'text/plain') - self.send_header('Content-length', str(len(self.server.storage[path]))) + self.send_header("Content-type", "text/plain") + 
self.send_header("Content-length", str(len(self.server.storage[path]))) self.end_headers() self.wfile.write(self.server.storage[path]) else: @@ -171,13 +171,13 @@ class PreservingDataHandler(BaseHTTPRequestHandler): class MultipartPreservingDataHandler(BaseHTTPRequestHandler): - protocol_version = 'HTTP/1.1' + protocol_version = "HTTP/1.1" def parse_request(self): result = BaseHTTPRequestHandler.parse_request(self) # Adaptation to Python 3. if sys.version_info.major == 2 and result == True: - expect = self.headers.get('Expect', "") + expect = self.headers.get("Expect", "") if (expect.lower() == "100-continue" and self.protocol_version >= "HTTP/1.1" and self.request_version >= "HTTP/1.1"): if not self.handle_expect_100(): return False @@ -188,78 +188,78 @@ class MultipartPreservingDataHandler(BaseHTTPRequestHandler): if code in self.responses: message = self.responses[code][0] else: - message = '' - if self.request_version != 'HTTP/0.9': + message = "" + if self.request_version != "HTTP/0.9": self.wfile.write("%s %d %s\r\n" % (self.protocol_version, code, message)) def handle_expect_100(self): - logging.info('Received Expect-100') + logging.info("Received Expect-100") self.send_response_only(100) self.end_headers() return True def do_POST(self): query = urlparse.urlparse(self.path).query - logging.info('MultipartPreservingDataHandler POST ?' + query) - if query == 'uploads': + logging.info("MultipartPreservingDataHandler POST ?" + query) + if query == "uploads": self.send_response(200) - post_data = r''' -TEST'''.encode() - self.send_header('Content-length', str(len(post_data))) - self.send_header('Content-type', 'text/plain') + post_data = r""" +TEST""".encode() + self.send_header("Content-length", str(len(post_data))) + self.send_header("Content-type", "text/plain") self.end_headers() self.wfile.write(post_data) else: try: - assert query == 'uploadId=TEST' - logging.info('Content-Length = ' + self.headers.get('Content-Length')) - post_data = self.rfile.read(int(self.headers.get('Content-Length'))) + assert query == "uploadId=TEST" + logging.info("Content-Length = " + self.headers.get("Content-Length")) + post_data = self.rfile.read(int(self.headers.get("Content-Length"))) root = xml.etree.ElementTree.fromstring(post_data) - assert root.tag == 'CompleteMultipartUpload' + assert root.tag == "CompleteMultipartUpload" assert len(root) > 1 - content = '' + content = "" for i, part in enumerate(root): - assert part.tag == 'Part' + assert part.tag == "Part" assert len(part) == 2 - assert part[0].tag == 'PartNumber' - assert part[1].tag == 'ETag' + assert part[0].tag == "PartNumber" + assert part[1].tag == "ETag" assert int(part[0].text) == i + 1 - content += self.server.storage['@'+part[1].text] - data.setdefault('multipart_received_data', []).append(content) - data['multipart_parts'] = len(root) + content += self.server.storage["@"+part[1].text] + data.setdefault("multipart_received_data", []).append(content) + data["multipart_parts"] = len(root) self.send_response(200) - self.send_header('Content-type', 'text/plain') + self.send_header("Content-type", "text/plain") self.end_headers() - logging.info('Sending 200') + logging.info("Sending 200") except: - logging.error('Sending 500') + logging.error("Sending 500") self.send_response(500) self.finish() def do_PUT(self): uid = uuid.uuid4() self.send_response(200) - self.send_header('Content-type', 'text/plain') - self.send_header('ETag', str(uid)) + self.send_header("Content-type", "text/plain") + self.send_header("ETag", str(uid)) 
self.end_headers() query = urlparse.urlparse(self.path).query path = urlparse.urlparse(self.path).path - logging.info('Content-Length = ' + self.headers.get('Content-Length')) - logging.info('PUT ' + query) - assert self.headers.get('Content-Length') - assert self.headers['Expect'] == '100-continue' + logging.info("Content-Length = " + self.headers.get("Content-Length")) + logging.info("PUT " + query) + assert self.headers.get("Content-Length") + assert self.headers["Expect"] == "100-continue" put_data = self.rfile.read() - data.setdefault('received_data', []).append(put_data) - logging.info('PUT to {}'.format(path)) - self.server.storage['@'+str(uid)] = put_data + data.setdefault("received_data", []).append(put_data) + logging.info("PUT to {}".format(path)) + self.server.storage["@"+str(uid)] = put_data self.finish() def do_GET(self): path = urlparse.urlparse(self.path).path if path in self.server.storage: self.send_response(200) - self.send_header('Content-type', 'text/plain') - self.send_header('Content-length', str(len(self.server.storage[path]))) + self.send_header("Content-type", "text/plain") + self.send_header("Content-length", str(len(self.server.storage[path]))) self.end_headers() self.wfile.write(self.server.storage[path]) else: @@ -269,13 +269,13 @@ class MultipartPreservingDataHandler(BaseHTTPRequestHandler): class RedirectingPreservingDataHandler(BaseHTTPRequestHandler): - protocol_version = 'HTTP/1.1' + protocol_version = "HTTP/1.1" def parse_request(self): result = BaseHTTPRequestHandler.parse_request(self) # Adaptation to Python 3. if sys.version_info.major == 2 and result == True: - expect = self.headers.get('Expect', "") + expect = self.headers.get("Expect", "") if (expect.lower() == "100-continue" and self.protocol_version >= "HTTP/1.1" and self.request_version >= "HTTP/1.1"): if not self.handle_expect_100(): return False @@ -286,46 +286,46 @@ class RedirectingPreservingDataHandler(BaseHTTPRequestHandler): if code in self.responses: message = self.responses[code][0] else: - message = '' - if self.request_version != 'HTTP/0.9': + message = "" + if self.request_version != "HTTP/0.9": self.wfile.write("%s %d %s\r\n" % (self.protocol_version, code, message)) def handle_expect_100(self): - logging.info('Received Expect-100') + logging.info("Received Expect-100") return True def do_POST(self): query = urlparse.urlparse(self.path).query if query: - query = '?{}'.format(query) + query = "?{}".format(query) self.send_response(307) - self.send_header('Content-type', 'text/xml') - self.send_header('Location', 'http://{host}:{port}/{bucket}/test.csv{query}'.format(host=localhost, port=preserving_data_port, bucket=bucket, query=query)) + self.send_header("Content-type", "text/xml") + self.send_header("Location", "http://{host}:{port}/{bucket}/test.csv{query}".format(host=localhost, port=preserving_data_port, bucket=bucket, query=query)) self.end_headers() - self.wfile.write(r''' + self.wfile.write(r""" TemporaryRedirect Please re-send this request to the specified temporary endpoint. Continue to use the original request endpoint for future requests. 
{host}:{port} -'''.format(host=localhost, port=preserving_data_port).encode()) +""".format(host=localhost, port=preserving_data_port).encode()) self.finish() def do_PUT(self): query = urlparse.urlparse(self.path).query if query: - query = '?{}'.format(query) + query = "?{}".format(query) self.send_response(307) - self.send_header('Content-type', 'text/xml') - self.send_header('Location', 'http://{host}:{port}/{bucket}/test.csv{query}'.format(host=localhost, port=preserving_data_port, bucket=bucket, query=query)) + self.send_header("Content-type", "text/xml") + self.send_header("Location", "http://{host}:{port}/{bucket}/test.csv{query}".format(host=localhost, port=preserving_data_port, bucket=bucket, query=query)) self.end_headers() - self.wfile.write(r''' + self.wfile.write(r""" TemporaryRedirect Please re-send this request to the specified temporary endpoint. Continue to use the original request endpoint for future requests. {host}:{port} -'''.format(host=localhost, port=preserving_data_port).encode()) +""".format(host=localhost, port=preserving_data_port).encode()) self.finish() @@ -357,8 +357,8 @@ jobs = [ threading.Thread(target=server.serve_forever) for server in servers ] time.sleep(60) # Timeout -logging.info('Shutting down') +logging.info("Shutting down") [ server.shutdown() for server in servers ] -logging.info('Joining threads') +logging.info("Joining threads") [ job.join() for job in jobs ] -logging.info('Done') +logging.info("Done") From 4608da13449dacbeabda77ec9d7d10bb8db1358b Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Sun, 22 Sep 2019 11:03:02 +0000 Subject: [PATCH 164/309] Minor tests improvement. --- dbms/tests/integration/test_storage_s3/test.py | 8 +++----- dbms/tests/integration/test_storage_s3/test_server.py | 3 ++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/dbms/tests/integration/test_storage_s3/test.py b/dbms/tests/integration/test_storage_s3/test.py index 88be4640388..84f6bf72f60 100644 --- a/dbms/tests/integration/test_storage_s3/test.py +++ b/dbms/tests/integration/test_storage_s3/test.py @@ -78,11 +78,9 @@ def test_get_with_redirect(started_cluster): put_communication_data(started_cluster, "=== Get with redirect test ===") query = "select *, column1*column2*column3 from s3('http://{}:{}/', 'CSV', '{}')".format(started_cluster.mock_host, started_cluster.redirecting_to_http_port, format) stdout = run_query(instance, query) - assert list(map(str.split, stdout.splitlines())) == [ - ["42", "87", "44", "160776"], - ["55", "33", "81", "147015"], - ["1", "0", "9", "0"], - ] + data = get_communication_data(started_cluster) + expected = [ [str(row[0]), str(row[1]), str(row[2]), str(row[0]*row[1]*row[2])] for row in data["redirect_csv_data"] ] + assert list(map(str.split, stdout.splitlines())) == expected def test_put(started_cluster): diff --git a/dbms/tests/integration/test_storage_s3/test_server.py b/dbms/tests/integration/test_storage_s3/test_server.py index 09dfa1ca958..8896af9c23e 100644 --- a/dbms/tests/integration/test_storage_s3/test_server.py +++ b/dbms/tests/integration/test_storage_s3/test_server.py @@ -68,7 +68,8 @@ class SimpleHTTPServerHandler(BaseHTTPRequestHandler): self.send_response(200) self.send_header("Content-type", "text/plain") self.end_headers() - self.wfile.write("42,87,44\n55,33,81\n1,0,9\n") + data["redirect_csv_data"] = [[42, 87, 44], [55, 33, 81], [1, 0, 9]] + self.wfile.write("".join([ "{},{},{}\n".format(*row) for row in data["redirect_csv_data"]])) else: self.send_response(404) self.end_headers() From 
8fd66ac4fd6a9c992bec170bf79f3f16a382ea65 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sun, 22 Sep 2019 15:52:33 +0300 Subject: [PATCH 165/309] Update compressed_buffer_fuzz.cpp --- .../tests/compressed_buffer_fuzz.cpp | 42 +++++++------------ 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/dbms/src/Compression/tests/compressed_buffer_fuzz.cpp b/dbms/src/Compression/tests/compressed_buffer_fuzz.cpp index ff4907dcb2d..8e0e529ff8e 100644 --- a/dbms/src/Compression/tests/compressed_buffer_fuzz.cpp +++ b/dbms/src/Compression/tests/compressed_buffer_fuzz.cpp @@ -1,32 +1,22 @@ -#include - #include -#include -#include -#include - -#include -#include -#include -#include -#include -#include +#include #include -#include -#include +#include -extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { - try { - std::string Str(reinterpret_cast(data), size); - - DB::ReadBufferFromString from(Str); - DB::CompressedReadBuffer in{from}; - } - catch (const DB::Exception & e) - { - std::cerr << e.what() << ", " << e.displayText() << std::endl; - return 1; - } +extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) +try +{ + DB::ReadBufferFromMemory from(data, size); + DB::CompressedReadBuffer in{from}; + + while (!in.eof()) + in.next(); + return 0; } +catch (...) +{ + std::cerr << DB::getCurrentExceptionMessage(true) << std::endl; + return 1; +} From 3270ad439a82392fcc3bb05c1173bce7f53d148f Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Mon, 23 Sep 2019 00:18:14 +0300 Subject: [PATCH 166/309] Update compressed_buffer_fuzz.cpp --- dbms/src/Compression/tests/compressed_buffer_fuzz.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Compression/tests/compressed_buffer_fuzz.cpp b/dbms/src/Compression/tests/compressed_buffer_fuzz.cpp index 8e0e529ff8e..a87046eff5c 100644 --- a/dbms/src/Compression/tests/compressed_buffer_fuzz.cpp +++ b/dbms/src/Compression/tests/compressed_buffer_fuzz.cpp @@ -9,10 +9,10 @@ try { DB::ReadBufferFromMemory from(data, size); DB::CompressedReadBuffer in{from}; - + while (!in.eof()) in.next(); - + return 0; } catch (...) From c5f9b4ad0a6c210339cb966e782b863387eea5c8 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Mon, 23 Sep 2019 00:21:41 +0300 Subject: [PATCH 167/309] Update sanitize.cmake --- cmake/sanitize.cmake | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index 381c186212b..6bd97b925f2 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -44,8 +44,10 @@ if (SANITIZE) endif () elseif (SANITIZE STREQUAL "libfuzzer") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} -fsanitize=fuzzer-no-link,address,signed-integer-overflow -fsanitize-address-use-after-scope") - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} -fsanitize=fuzzer-no-link,address,signed-integer-overflow -fsanitize-address-use-after-scope") + # NOTE: Eldar Zaitov decided to name it "libfuzzer" instead of "fuzzer" to keep in mind another possible fuzzer backends. + # NOTE: no-link means that all the targets are built with instrumentation for fuzzer, but only some of them (tests) have entry point for fuzzer and it's not checked. 
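# Editorial sketch, not part of the patch: "fuzzer-no-link" above only instruments the code, while the
# one binary that actually defines LLVMFuzzerTestOneInput (e.g. compressed_buffer_fuzz from the
# patch above) must be built with the full "fuzzer" sanitizer so that the libFuzzer driver is
# linked in. That is what LIBFUZZER_CMAKE_CXX_FLAGS, set a few lines below in this file, is for.
# The target wiring here is an assumption for illustration and is not taken from this patch series:
add_executable (compressed_buffer_fuzz compressed_buffer_fuzz.cpp)
set_target_properties (compressed_buffer_fuzz PROPERTIES
    COMPILE_FLAGS "${LIBFUZZER_CMAKE_CXX_FLAGS}"
    LINK_FLAGS "${LIBFUZZER_CMAKE_CXX_FLAGS}")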
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} -fsanitize=fuzzer-no-link,address,undefined -fsanitize-address-use-after-scope") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} -fsanitize=fuzzer-no-link,address,undefined -fsanitize-address-use-after-scope") if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=fuzzer-no-link,address,signed-integer-overflow -fsanitize-address-use-after-scope") endif() From b0dd36db7c3241251dcf13dea59759cd80e432d8 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Mon, 23 Sep 2019 00:22:22 +0300 Subject: [PATCH 168/309] Update sanitize.cmake --- cmake/sanitize.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index 6bd97b925f2..9e8ef3e857a 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -49,12 +49,12 @@ if (SANITIZE) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} -fsanitize=fuzzer-no-link,address,undefined -fsanitize-address-use-after-scope") set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} -fsanitize=fuzzer-no-link,address,undefined -fsanitize-address-use-after-scope") if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=fuzzer-no-link,address,signed-integer-overflow -fsanitize-address-use-after-scope") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=fuzzer-no-link,address,undefined -fsanitize-address-use-after-scope") endif() if (MAKE_STATIC_LIBRARIES AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libasan") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libasan -static-libubsan") endif () - set (LIBFUZZER_CMAKE_CXX_FLAGS "-fsanitize=fuzzer,address,signed-integer-overflow -fsanitize-address-use-after-scope") + set (LIBFUZZER_CMAKE_CXX_FLAGS "-fsanitize=fuzzer,address,undefined -fsanitize-address-use-after-scope") else () message (FATAL_ERROR "Unknown sanitizer type: ${SANITIZE}") endif () From 6fb9565091264bd0abc5aa43b85eac0db309c682 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 23 Sep 2019 01:06:22 +0300 Subject: [PATCH 169/309] Merging S3: removed useless headers --- dbms/src/IO/WriteBufferFromS3.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/dbms/src/IO/WriteBufferFromS3.h b/dbms/src/IO/WriteBufferFromS3.h index 0eb689e468f..9afda1d14e2 100644 --- a/dbms/src/IO/WriteBufferFromS3.h +++ b/dbms/src/IO/WriteBufferFromS3.h @@ -1,25 +1,16 @@ #pragma once -#include #include #include #include #include #include #include -#include -#include #include #include #include -#include #include -#include #include -#include -#include -#include -#include namespace DB @@ -36,6 +27,9 @@ private: String buffer_string; std::unique_ptr temporary_buffer; size_t last_part_size; + + /// Upload in S3 is made in parts. + /// We initiate upload, then upload each part and get ETag as a response, and then finish upload with listing all our parts. 
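    // Editorial sketch, not part of the patch: combined with the mock S3 server from the
    // integration test earlier in this series, the comment above implies roughly this exchange
    // (the partNumber query parameter is an assumption from the S3 API; the test server only
    // checks the "uploads" and "uploadId" parameters):
    //   1. POST <uri>?uploads                     -> response body carries <UploadId>...</UploadId>, remembered in upload_id
    //   2. PUT  <uri>?partNumber=N&uploadId=...   -> each response carries an ETag header, collected into part_tags
    //   3. POST <uri>?uploadId=...                -> request body lists all uploaded parts:
    //        <CompleteMultipartUpload><Part><PartNumber>N</PartNumber><ETag>...</ETag></Part>...</CompleteMultipartUpload>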
String upload_id; std::vector part_tags; From c03857b2aee08fee4616cecae6caf4dbd94555bf Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 23 Sep 2019 01:13:42 +0300 Subject: [PATCH 170/309] Merging S3: part 2 --- dbms/src/Storages/StorageFile.cpp | 8 +++--- dbms/src/Storages/StorageS3.cpp | 30 ++++++++++++++++++--- dbms/src/Storages/StorageS3.h | 23 ++++++---------- dbms/src/TableFunctions/TableFunctionS3.cpp | 4 ++- dbms/src/TableFunctions/TableFunctionS3.h | 12 ++++++--- 5 files changed, 51 insertions(+), 26 deletions(-) diff --git a/dbms/src/Storages/StorageFile.cpp b/dbms/src/Storages/StorageFile.cpp index d5d8cd0856e..78fb8b53060 100644 --- a/dbms/src/Storages/StorageFile.cpp +++ b/dbms/src/Storages/StorageFile.cpp @@ -48,10 +48,11 @@ namespace ErrorCodes namespace { + /* Recursive directory listing with matched paths as a result. * Have the same method in StorageHDFS. */ -std::vector LSWithRegexpMatching(const std::string & path_for_ls, const std::string & for_match) +std::vector listFilesWithRegexpMatching(const std::string & path_for_ls, const std::string & for_match) { const size_t first_glob = for_match.find_first_of("*?{"); @@ -86,7 +87,8 @@ std::vector LSWithRegexpMatching(const std::string & path_for_ls, c { if (re2::RE2::FullMatch(file_name, matcher)) { - Strings result_part = LSWithRegexpMatching(full_path + "/", suffix_with_globs.substr(next_slash)); + /// TODO: No recursion depth check. No protection for cyclic symlinks. It is a bug. + Strings result_part = listFilesWithRegexpMatching(full_path + "/", suffix_with_globs.substr(next_slash)); std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); } } @@ -143,7 +145,7 @@ StorageFile::StorageFile( poco_path = Poco::Path(db_dir_path, poco_path); const std::string path = poco_path.absolute().toString(); - paths = LSWithRegexpMatching("/", path); + paths = listFilesWithRegexpMatching("/", path); for (const auto & cur_path : paths) checkCreationIsAllowed(context_global, db_dir_path, cur_path); is_db_table = false; diff --git a/dbms/src/Storages/StorageS3.cpp b/dbms/src/Storages/StorageS3.cpp index 59b2ef589a9..b680c95c00a 100644 --- a/dbms/src/Storages/StorageS3.cpp +++ b/dbms/src/Storages/StorageS3.cpp @@ -84,7 +84,7 @@ namespace const ConnectionTimeouts & timeouts) : sample_block(sample_block_) { - auto minimum_upload_part_size = context.getConfigRef().getUInt64("s3_minimum_upload_part_size", 512'000'000); + auto minimum_upload_part_size = context.getConfigRef().getUInt64("s3_minimum_upload_part_size", 512 * 1024 * 1024); write_buf = std::make_unique(uri, minimum_upload_part_size, timeouts); writer = FormatFactory::instance().getOutput(format, *write_buf, sample_block, context); } @@ -119,14 +119,36 @@ namespace } -BlockInputStreams StorageS3::read(const Names & column_names, +StorageS3::StorageS3( + const Poco::URI & uri_, + const std::string & database_name_, + const std::string & table_name_, + const String & format_name_, + const ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + Context & context_) + : IStorage(columns_) + , uri(uri_) + , context_global(context_) + , format_name(format_name_) + , database_name(database_name_) + , table_name(table_name_) +{ + setColumns(columns_); + setConstraints(constraints_); +} + + +BlockInputStreams StorageS3::read( + const Names & column_names, const SelectQueryInfo & /*query_info*/, const Context & context, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, unsigned /*num_streams*/) { - BlockInputStreamPtr 
block_input = std::make_shared(uri, + BlockInputStreamPtr block_input = std::make_shared( + uri, format_name, getName(), getHeaderBlock(column_names), @@ -171,7 +193,7 @@ void registerStorageS3(StorageFactory & factory) String format_name = engine_args[1]->as().value.safeGet(); - return StorageS3::create(uri, args.database_name, args.table_name, format_name, args.columns, args.context); + return StorageS3::create(uri, args.database_name, args.table_name, format_name, args.columns, args.constraints, args.context); }); } } diff --git a/dbms/src/Storages/StorageS3.h b/dbms/src/Storages/StorageS3.h index ad073aaa14c..05a69f439d0 100644 --- a/dbms/src/Storages/StorageS3.h +++ b/dbms/src/Storages/StorageS3.h @@ -5,6 +5,7 @@ #include #include + namespace DB { /** @@ -15,22 +16,14 @@ namespace DB class StorageS3 : public ext::shared_ptr_helper, public IStorage { public: - StorageS3(const Poco::URI & uri_, + StorageS3( + const Poco::URI & uri_, const std::string & database_name_, const std::string & table_name_, const String & format_name_, const ColumnsDescription & columns_, - Context & context_ - ) - : IStorage(columns_) - , uri(uri_) - , context_global(context_) - , format_name(format_name_) - , database_name(database_name_) - , table_name(table_name_) - { - setColumns(columns_); - } + const ConstraintsDescription & constraints_, + Context & context_); String getName() const override { @@ -47,7 +40,8 @@ public: return table_name; } - BlockInputStreams read(const Names & column_names, + BlockInputStreams read( + const Names & column_names, const SelectQueryInfo & query_info, const Context & context, QueryProcessingStage::Enum processed_stage, @@ -58,11 +52,10 @@ public: void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name, TableStructureWriteLockHolder &) override; -protected: +private: Poco::URI uri; const Context & context_global; -private: String format_name; String database_name; String table_name; diff --git a/dbms/src/TableFunctions/TableFunctionS3.cpp b/dbms/src/TableFunctions/TableFunctionS3.cpp index 38ca0830e5b..31a66a91af2 100644 --- a/dbms/src/TableFunctions/TableFunctionS3.cpp +++ b/dbms/src/TableFunctions/TableFunctionS3.cpp @@ -5,15 +5,17 @@ namespace DB { + StoragePtr TableFunctionS3::getStorage( const String & source, const String & format, const ColumnsDescription & columns, Context & global_context, const std::string & table_name) const { Poco::URI uri(source); - return StorageS3::create(uri, getDatabaseName(), table_name, format, columns, global_context); + return StorageS3::create(uri, getDatabaseName(), table_name, format, columns, ConstraintsDescription{}, global_context); } void registerTableFunctionS3(TableFunctionFactory & factory) { factory.registerFunction(); } + } diff --git a/dbms/src/TableFunctions/TableFunctionS3.h b/dbms/src/TableFunctions/TableFunctionS3.h index a4966be13c7..ecb9ea03197 100644 --- a/dbms/src/TableFunctions/TableFunctionS3.h +++ b/dbms/src/TableFunctions/TableFunctionS3.h @@ -1,12 +1,13 @@ #pragma once #include -#include -#include namespace DB { + +class Context; + /* s3(source, format, structure) - creates a temporary storage for a file in S3 */ class TableFunctionS3 : public ITableFunctionFileLike @@ -20,6 +21,11 @@ public: private: StoragePtr getStorage( - const String & source, const String & format, const ColumnsDescription & columns, Context & global_context, const std::string & table_name) const override; + const String & source, + const String & format, + const ColumnsDescription & 
columns, + Context & global_context, + const std::string & table_name) const override; }; + } From bdea16e3083ee8de5bfe0d748d6a16be3106dbd2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 23 Sep 2019 01:44:10 +0300 Subject: [PATCH 171/309] Merging S3: Moved S3 settings to in config to correspond to and --- dbms/src/Storages/StorageS3.cpp | 2 +- .../integration/test_storage_s3/configs/min_chunk_size.xml | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dbms/src/Storages/StorageS3.cpp b/dbms/src/Storages/StorageS3.cpp index b680c95c00a..c135a16775a 100644 --- a/dbms/src/Storages/StorageS3.cpp +++ b/dbms/src/Storages/StorageS3.cpp @@ -84,7 +84,7 @@ namespace const ConnectionTimeouts & timeouts) : sample_block(sample_block_) { - auto minimum_upload_part_size = context.getConfigRef().getUInt64("s3_minimum_upload_part_size", 512 * 1024 * 1024); + auto minimum_upload_part_size = context.getConfigRef().getUInt64("s3.minimum_upload_part_size", 512 * 1024 * 1024); write_buf = std::make_unique(uri, minimum_upload_part_size, timeouts); writer = FormatFactory::instance().getOutput(format, *write_buf, sample_block, context); } diff --git a/dbms/tests/integration/test_storage_s3/configs/min_chunk_size.xml b/dbms/tests/integration/test_storage_s3/configs/min_chunk_size.xml index f61fcd2c5c9..2a9c465a7b8 100644 --- a/dbms/tests/integration/test_storage_s3/configs/min_chunk_size.xml +++ b/dbms/tests/integration/test_storage_s3/configs/min_chunk_size.xml @@ -1,3 +1,5 @@ - 1000000 + + 1000000 + From f3ba89ccef01eb6eb2fa66e280a14cd64b68fe71 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Mon, 23 Sep 2019 01:48:21 +0300 Subject: [PATCH 172/309] Empty commit to trigger CI --- contrib/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 0574dc33a1d..4e026a1835f 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -65,7 +65,7 @@ if (USE_INTERNAL_ZLIB_LIBRARY) endif () add_subdirectory (${INTERNAL_ZLIB_NAME}) - # todo: make pull to Dead2/zlib-ng and remove: + # TODO: make pull to Dead2/zlib-ng and remove: # We should use same defines when including zlib.h as used when zlib compiled target_compile_definitions (zlib PUBLIC ZLIB_COMPAT WITH_GZFILEOP) target_compile_definitions (zlibstatic PUBLIC ZLIB_COMPAT WITH_GZFILEOP) From 70bc89557fd5b7bda192045a5d63a6c1a933e1a2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 23 Sep 2019 01:57:06 +0300 Subject: [PATCH 173/309] Updates for #6969 --- dbms/programs/server/MetricsTransmitter.cpp | 6 +++--- dbms/programs/server/MetricsTransmitter.h | 8 ++++---- dbms/programs/server/config.xml | 4 ++-- docs/en/operations/server_settings/settings.md | 6 +++--- docs/ru/operations/server_settings/settings.md | 6 +++--- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/dbms/programs/server/MetricsTransmitter.cpp b/dbms/programs/server/MetricsTransmitter.cpp index 73413cad1c0..17c759ce98c 100644 --- a/dbms/programs/server/MetricsTransmitter.cpp +++ b/dbms/programs/server/MetricsTransmitter.cpp @@ -21,7 +21,7 @@ MetricsTransmitter::MetricsTransmitter( { interval_seconds = config.getInt(config_name + ".interval", 60); send_events = config.getBool(config_name + ".events", true); - send_events_absolute = config.getBool(config_name + ".events_absolute", false); + send_events_cumulative = config.getBool(config_name + ".events_cumulative", false); send_metrics = config.getBool(config_name + ".metrics", true); send_asynchronous_metrics = 
config.getBool(config_name + ".asynchronous_metrics", true); } @@ -96,13 +96,13 @@ void MetricsTransmitter::transmit(std::vector & prev_count } } - if (send_events_absolute) + if (send_events_cumulative) { for (size_t i = 0, end = ProfileEvents::end(); i < end; ++i) { const auto counter = ProfileEvents::global_counters[i].load(std::memory_order_relaxed); std::string key{ProfileEvents::getName(static_cast(i))}; - key_vals.emplace_back(profile_events_absolute_path_prefix + key, counter); + key_vals.emplace_back(profile_events_cumulative_path_prefix + key, counter); } } diff --git a/dbms/programs/server/MetricsTransmitter.h b/dbms/programs/server/MetricsTransmitter.h index 1d5795c24db..79840616dd3 100644 --- a/dbms/programs/server/MetricsTransmitter.h +++ b/dbms/programs/server/MetricsTransmitter.h @@ -24,8 +24,8 @@ class AsynchronousMetrics; /** Automatically sends - * - values deltas of ProfileEvents; - * - absolute values of ProfileEvents; + * - delta values of ProfileEvents; + * - cumulative values of ProfileEvents; * - values of CurrentMetrics; * - values of AsynchronousMetrics; * to Graphite at beginning of every minute. @@ -45,7 +45,7 @@ private: std::string config_name; UInt32 interval_seconds; bool send_events; - bool send_events_absolute; + bool send_events_cumulative; bool send_metrics; bool send_asynchronous_metrics; @@ -55,7 +55,7 @@ private: ThreadFromGlobalPool thread{&MetricsTransmitter::run, this}; static inline constexpr auto profile_events_path_prefix = "ClickHouse.ProfileEvents."; - static inline constexpr auto profile_events_absolute_path_prefix = "ClickHouse.ProfileEventsAbsolute."; + static inline constexpr auto profile_events_cumulative_path_prefix = "ClickHouse.ProfileEventsCumulative."; static inline constexpr auto current_metrics_path_prefix = "ClickHouse.Metrics."; static inline constexpr auto asynchronous_metrics_path_prefix = "ClickHouse.AsynchronousMetrics."; }; diff --git a/dbms/programs/server/config.xml b/dbms/programs/server/config.xml index 7263992b3d3..c8d33922167 100644 --- a/dbms/programs/server/config.xml +++ b/dbms/programs/server/config.xml @@ -258,7 +258,7 @@ true true - false + false true @@ -270,7 +270,7 @@ true true - false + false false --> diff --git a/docs/en/operations/server_settings/settings.md b/docs/en/operations/server_settings/settings.md index baf9013fbbf..f884a7b2963 100644 --- a/docs/en/operations/server_settings/settings.md +++ b/docs/en/operations/server_settings/settings.md @@ -141,8 +141,8 @@ Settings: - timeout – The timeout for sending data, in seconds. - root_path – Prefix for keys. - metrics – Sending data from a :ref:`system_tables-system.metrics` table. -- events – Sending deltas data from a :ref:`system_tables-system.events` table -- events_absolute – Sending absolute data from a :ref:`system_tables-system.events` table +- events – Sending deltas data accumulated for the time period from a :ref:`system_tables-system.events` table +- events_cumulative – Sending cumulative data from a :ref:`system_tables-system.events` table - asynchronous_metrics – Sending data from a :ref:`system_tables-system.asynchronous_metrics` table. You can configure multiple `` clauses. For instance, you can use this for sending different data at different intervals. @@ -158,7 +158,7 @@ You can configure multiple `` clauses. 
For instance, you can use this one_min true true - false + false true ``` diff --git a/docs/ru/operations/server_settings/settings.md b/docs/ru/operations/server_settings/settings.md index 2f6362b7635..39523db7d36 100644 --- a/docs/ru/operations/server_settings/settings.md +++ b/docs/ru/operations/server_settings/settings.md @@ -140,8 +140,8 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat - timeout - Таймаут отправки данных в секундах. - root_path - Префикс для ключей. - metrics - Отправка данных из таблицы :ref:`system_tables-system.metrics`. -- events - Отправка дельты данных из таблицы :ref:`system_tables-system.events` -- events_absolute - Отправка абсолютных данных из таблицы :ref:`system_tables-system.events` +- events - Отправка дельты данных, накопленной за промежуток времени из таблицы :ref:`system_tables-system.events` +- events_cumulative - Отправка суммарных данных из таблицы :ref:`system_tables-system.events` - asynchronous_metrics - Отправка данных из таблицы :ref:`system_tables-system.asynchronous_metrics`. Можно определить несколько секций ``, например, для передачи различных данных с различной частотой. @@ -157,7 +157,7 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat one_min true true - false + false true ``` From 61b02598d7ce1cc787d6d872411ba237c1c06347 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Mon, 23 Sep 2019 02:02:03 +0300 Subject: [PATCH 174/309] Update repeat.cpp --- dbms/src/Functions/repeat.cpp | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/dbms/src/Functions/repeat.cpp b/dbms/src/Functions/repeat.cpp index 81e780fb53b..741af698452 100644 --- a/dbms/src/Functions/repeat.cpp +++ b/dbms/src/Functions/repeat.cpp @@ -178,16 +178,18 @@ public: Impl::vectorConst(copy_str, repeat_time, col_res->getChars(), col_res->getOffsets()); block.getByPosition(result).column = std::move(col_res); } - else if (!castType(block.getByPosition(arguments[1]).type.get(), [&](const auto & type) + else if (!castType(block.getByPosition(arguments[1]).type.get(), [&](const auto & type) + { + using DataType = std::decay_t; + using T0 = typename DataType::FieldType; + const ColumnVector * colnum = checkAndGetColumn>(numcolumn.get()); + auto col_res = ColumnString::create(); + Impl::vectorNonConstInteger(copy_str, col_res->getChars(), col_res->getOffsets(), colnum->getData()); + block.getByPosition(result).column = std::move(col_res); + return 0; + })) { - using DataType = std::decay_t; - using T0 = typename DataType::FieldType; - const ColumnVector * colnum = checkAndGetColumn>(numcolumn.get()); - auto col_res = ColumnString::create(); - Impl::vectorNonConstInteger(copy_str, col_res->getChars(), col_res->getOffsets(), colnum->getData()); - block.getByPosition(result).column = std::move(col_res); - return 0; - })); + } else throw Exception( "Illegal column " + block.getByPosition(arguments[1]).column->getName() + " of argument of function " + getName(), From 454b83fcd30e09bfb790e353383c8d97d48b941e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 23 Sep 2019 02:30:13 +0300 Subject: [PATCH 175/309] Merging function 'repeat' --- dbms/src/Functions/repeat.cpp | 125 ++++++++---------- .../0_stateless/01013_repeat_function.sql | 10 +- 2 files changed, 59 insertions(+), 76 deletions(-) diff --git a/dbms/src/Functions/repeat.cpp b/dbms/src/Functions/repeat.cpp index 741af698452..ad1d3954393 100644 --- a/dbms/src/Functions/repeat.cpp +++ b/dbms/src/Functions/repeat.cpp @@ -1,4 +1,3 @@ -#include #include 
#include #include @@ -18,38 +17,29 @@ namespace ErrorCodes struct RepeatImpl { - static void vectorNonConstStr( + static void vectorStrConstRepeat( const ColumnString::Chars & data, const ColumnString::Offsets & offsets, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets, - const UInt64 & repeat_time) + UInt64 repeat_time) { UInt64 data_size = 0; res_offsets.assign(offsets); for (UInt64 i = 0; i < offsets.size(); ++i) { - data_size += (offsets[i] - offsets[i - 1] - 1) * repeat_time + 1; + data_size += (offsets[i] - offsets[i - 1] - 1) * repeat_time + 1; /// Note that accessing -1th element is valid for PaddedPODArray. res_offsets[i] = data_size; } res_data.resize(data_size); for (UInt64 i = 0; i < res_offsets.size(); ++i) { - array(data.data() + offsets[i - 1], res_data.data() + res_offsets[i - 1], offsets[i] - offsets[i - 1], repeat_time); + process(data.data() + offsets[i - 1], res_data.data() + res_offsets[i - 1], offsets[i] - offsets[i - 1], repeat_time); } } - static void - vectorConst(const String & copy_str, const UInt64 & repeat_time, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) - { - UInt64 data_size = copy_str.size() * repeat_time + 1; - res_data.resize(data_size); - res_offsets.resize_fill(1, data_size); - array(reinterpret_cast(const_cast(copy_str.data())), res_data.data(), copy_str.size() + 1, repeat_time); - } - template - static void vectorNonConst( + static void vectorStrVectorRepeat( const ColumnString::Chars & data, const ColumnString::Offsets & offsets, ColumnString::Chars & res_data, @@ -66,17 +56,20 @@ struct RepeatImpl res_data.resize(data_size); for (UInt64 i = 0; i < col_num.size(); ++i) { - array(data.data() + offsets[i - 1], res_data.data() + res_offsets[i - 1], offsets[i] - offsets[i - 1], col_num[i]); + process(data.data() + offsets[i - 1], res_data.data() + res_offsets[i - 1], offsets[i] - offsets[i - 1], col_num[i]); } } template - static void vectorNonConstInteger( - const String & copy_str, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets, const PaddedPODArray & col_num) + static void constStrVectorRepeat( + const StringRef & copy_str, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets, + const PaddedPODArray & col_num) { UInt64 data_size = 0; res_offsets.resize(col_num.size()); - UInt64 str_size = copy_str.size(); + UInt64 str_size = copy_str.size; UInt64 col_size = col_num.size(); for (UInt64 i = 0; i < col_size; ++i) { @@ -86,8 +79,8 @@ struct RepeatImpl res_data.resize(data_size); for (UInt64 i = 0; i < col_size; ++i) { - array( - reinterpret_cast(const_cast(copy_str.data())), + process( + reinterpret_cast(const_cast(copy_str.data)), res_data.data() + res_offsets[i - 1], str_size + 1, col_num[i]); @@ -95,7 +88,7 @@ struct RepeatImpl } private: - static void array(const UInt8 * src, UInt8 * dst, const UInt64 & size, const UInt64 & repeat_time) + static void process(const UInt8 * src, UInt8 * dst, UInt64 size, UInt64 repeat_time) { for (UInt64 i = 0; i < repeat_time; ++i) { @@ -106,8 +99,8 @@ private: } }; -template -class FunctionRepeatImpl : public IFunction + +class FunctionRepeat : public IFunction { template static bool castType(const IDataType * type, F && f) @@ -117,7 +110,7 @@ class FunctionRepeatImpl : public IFunction public: static constexpr auto name = "repeat"; - static FunctionPtr create(const Context &) { return std::make_shared(); } + static FunctionPtr create(const Context &) { return std::make_shared(); } String getName() const override { return name; } @@ 
-138,74 +131,64 @@ public: void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override { - auto & strcolumn = block.getByPosition(arguments[0]).column; - auto & numcolumn = block.getByPosition(arguments[1]).column; + const auto & strcolumn = block.getByPosition(arguments[0]).column; + const auto & numcolumn = block.getByPosition(arguments[1]).column; if (const ColumnString * col = checkAndGetColumn(strcolumn.get())) { if (const ColumnConst * scale_column_num = checkAndGetColumn(numcolumn.get())) { - Field scale_field_num = scale_column_num->getField(); - UInt64 repeat_time = scale_field_num.get(); + UInt64 repeat_time = scale_column_num->getValue(); auto col_res = ColumnString::create(); - Impl::vectorNonConstStr(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), repeat_time); + RepeatImpl::vectorStrConstRepeat(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), repeat_time); block.getByPosition(result).column = std::move(col_res); + return; } - else if (!castType(block.getByPosition(arguments[1]).type.get(), [&](const auto & type) - { - using DataType = std::decay_t; - using T0 = typename DataType::FieldType; - const ColumnVector * colnum = checkAndGetColumn>(numcolumn.get()); - auto col_res = ColumnString::create(); - Impl::vectorNonConst(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), colnum->getData()); - block.getByPosition(result).column = std::move(col_res); - return 0; - })); - else - throw Exception( - "Illegal column " + block.getByPosition(arguments[1]).column->getName() + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - else if (const ColumnConst * scale_column_str = checkAndGetColumn(strcolumn.get())) - { - Field scale_field_str = scale_column_str->getField(); - String copy_str = scale_field_str.get(); - if (const ColumnConst * scale_column_num = checkAndGetColumn(numcolumn.get())) - { - Field scale_field_num = scale_column_num->getField(); - UInt64 repeat_time = scale_field_num.get(); - auto col_res = ColumnString::create(); - Impl::vectorConst(copy_str, repeat_time, col_res->getChars(), col_res->getOffsets()); - block.getByPosition(result).column = std::move(col_res); - } - else if (!castType(block.getByPosition(arguments[1]).type.get(), [&](const auto & type) + else if (castType(block.getByPosition(arguments[1]).type.get(), [&](const auto & type) { using DataType = std::decay_t; - using T0 = typename DataType::FieldType; - const ColumnVector * colnum = checkAndGetColumn>(numcolumn.get()); + using T = typename DataType::FieldType; + const ColumnVector * colnum = checkAndGetColumn>(numcolumn.get()); auto col_res = ColumnString::create(); - Impl::vectorNonConstInteger(copy_str, col_res->getChars(), col_res->getOffsets(), colnum->getData()); + RepeatImpl::vectorStrVectorRepeat(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), colnum->getData()); block.getByPosition(result).column = std::move(col_res); - return 0; + return true; })) { + return; } - else - throw Exception( - "Illegal column " + block.getByPosition(arguments[1]).column->getName() + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); } - else - throw Exception( - "Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); + else if (const ColumnConst * col_const = checkAndGetColumn(strcolumn.get())) + { + /// Note that 
const-const case is handled by useDefaultImplementationForConstants. + + StringRef copy_str = col_const->getDataColumn().getDataAt(0); + + if (castType(block.getByPosition(arguments[1]).type.get(), [&](const auto & type) + { + using DataType = std::decay_t; + using T = typename DataType::FieldType; + const ColumnVector * colnum = checkAndGetColumn>(numcolumn.get()); + auto col_res = ColumnString::create(); + RepeatImpl::constStrVectorRepeat(copy_str, col_res->getChars(), col_res->getOffsets(), colnum->getData()); + block.getByPosition(result).column = std::move(col_res); + return true; + })) + { + return; + } + } + + throw Exception( + "Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); } }; -using FunctionRepeat = FunctionRepeatImpl; void registerFunctionRepeat(FunctionFactory & factory) { factory.registerFunction(); } + } diff --git a/dbms/tests/queries/0_stateless/01013_repeat_function.sql b/dbms/tests/queries/0_stateless/01013_repeat_function.sql index 5de0e7a64e5..7d34307a21f 100644 --- a/dbms/tests/queries/0_stateless/01013_repeat_function.sql +++ b/dbms/tests/queries/0_stateless/01013_repeat_function.sql @@ -2,11 +2,11 @@ SELECT repeat('abc', 10); DROP TABLE IF EXISTS defaults; CREATE TABLE defaults ( - strings String, - u8 UInt8, - u16 UInt16, - u32 UInt32, - u64 UInt64 + strings String, + u8 UInt8, + u16 UInt16, + u32 UInt32, + u64 UInt64 )ENGINE = Memory(); INSERT INTO defaults values ('abc', 3, 12, 4, 56) ('sdfgg', 2, 10, 21, 200) ('xywq', 1, 4, 9, 5) ('plkf', 0, 5, 7,77); From 69c79c31bb91ddd00abc243523f8cd2c9a474783 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 23 Sep 2019 02:31:33 +0300 Subject: [PATCH 176/309] Merging function 'repeat': improved test --- dbms/tests/queries/0_stateless/01013_repeat_function.reference | 1 + dbms/tests/queries/0_stateless/01013_repeat_function.sql | 2 ++ 2 files changed, 3 insertions(+) diff --git a/dbms/tests/queries/0_stateless/01013_repeat_function.reference b/dbms/tests/queries/0_stateless/01013_repeat_function.reference index 7841bbd52f9..46bb248a99a 100644 --- a/dbms/tests/queries/0_stateless/01013_repeat_function.reference +++ b/dbms/tests/queries/0_stateless/01013_repeat_function.reference @@ -35,3 +35,4 @@ abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcab abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc abcabcabcabcabc abcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabcabc +Hello, world! Hello, world! Hello, world! 
diff --git a/dbms/tests/queries/0_stateless/01013_repeat_function.sql b/dbms/tests/queries/0_stateless/01013_repeat_function.sql index 7d34307a21f..85b0c16b4ab 100644 --- a/dbms/tests/queries/0_stateless/01013_repeat_function.sql +++ b/dbms/tests/queries/0_stateless/01013_repeat_function.sql @@ -21,4 +21,6 @@ SELECT repeat('abc', u16) FROM defaults; SELECT repeat('abc', u32) FROM defaults; SELECT repeat('abc', u64) FROM defaults; +SELECT repeat('Hello, world! ', 3); + DROP TABLE defaults; From 48f28bce6a4bffa1ffa15122798b2267eb9e3ae3 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 23 Sep 2019 02:37:13 +0300 Subject: [PATCH 177/309] Fixed bad error message --- dbms/src/Common/DiskSpaceMonitor.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dbms/src/Common/DiskSpaceMonitor.cpp b/dbms/src/Common/DiskSpaceMonitor.cpp index 5b07e11f31b..392712e19fb 100644 --- a/dbms/src/Common/DiskSpaceMonitor.cpp +++ b/dbms/src/Common/DiskSpaceMonitor.cpp @@ -1,10 +1,12 @@ #include +#include +#include #include -#include #include + namespace DB { @@ -82,7 +84,7 @@ bool Disk::tryReserve(UInt64 bytes) const std::lock_guard lock(mutex); if (bytes == 0) { - LOG_DEBUG(&Logger::get("DiskSpaceMonitor"), "Reserving 0 bytes on disk " << name); + LOG_DEBUG(&Logger::get("DiskSpaceMonitor"), "Reserving 0 bytes on disk " << backQuote(name)); ++reservation_count; return true; } @@ -93,7 +95,7 @@ bool Disk::tryReserve(UInt64 bytes) const { LOG_DEBUG( &Logger::get("DiskSpaceMonitor"), - "Reserving " << bytes << " bytes on disk " << name << " having unreserved " << unreserved_space << " bytes."); + "Reserving " << bytes << " bytes on disk " << backQuote(name) << " having unreserved " << unreserved_space << " bytes."); ++reservation_count; reserved_bytes += bytes; return true; From 2585cde3d0a5dcc1e9b91f1acec95c3c2c2ec84f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 23 Sep 2019 02:49:15 +0300 Subject: [PATCH 178/309] Added safety threshold --- dbms/src/Functions/repeat.cpp | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/dbms/src/Functions/repeat.cpp b/dbms/src/Functions/repeat.cpp index ad1d3954393..112ae452e86 100644 --- a/dbms/src/Functions/repeat.cpp +++ b/dbms/src/Functions/repeat.cpp @@ -7,16 +7,27 @@ #include #include + namespace DB { namespace ErrorCodes { extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int TOO_LARGE_STRING_SIZE; } struct RepeatImpl { + /// Safety threshold against DoS. 
+ static inline void checkRepeatTime(UInt64 repeat_time) + { + static constexpr UInt64 max_repeat_times = 1000000; + if (repeat_time > max_repeat_times) + throw Exception("Too many times to repeat (" + std::to_string(repeat_time) + "), maximum is: " + std::to_string(max_repeat_times), + ErrorCodes::TOO_LARGE_STRING_SIZE); + } + static void vectorStrConstRepeat( const ColumnString::Chars & data, const ColumnString::Offsets & offsets, @@ -24,6 +35,8 @@ struct RepeatImpl ColumnString::Offsets & res_offsets, UInt64 repeat_time) { + checkRepeatTime(repeat_time); + UInt64 data_size = 0; res_offsets.assign(offsets); for (UInt64 i = 0; i < offsets.size(); ++i) @@ -54,9 +67,12 @@ struct RepeatImpl res_offsets[i] = data_size; } res_data.resize(data_size); + for (UInt64 i = 0; i < col_num.size(); ++i) { - process(data.data() + offsets[i - 1], res_data.data() + res_offsets[i - 1], offsets[i] - offsets[i - 1], col_num[i]); + T repeat_time = col_num[i]; + checkRepeatTime(repeat_time); + process(data.data() + offsets[i - 1], res_data.data() + res_offsets[i - 1], offsets[i] - offsets[i - 1], repeat_time); } } @@ -79,11 +95,13 @@ struct RepeatImpl res_data.resize(data_size); for (UInt64 i = 0; i < col_size; ++i) { + T repeat_time = col_num[i]; + checkRepeatTime(repeat_time); process( reinterpret_cast(const_cast(copy_str.data)), res_data.data() + res_offsets[i - 1], str_size + 1, - col_num[i]); + repeat_time); } } From 80849e0fd7c5f6338cd7be7352c0d6e5e3dec94a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 23 Sep 2019 02:49:26 +0300 Subject: [PATCH 179/309] Fixed bad log messages --- dbms/src/Common/DiskSpaceMonitor.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/dbms/src/Common/DiskSpaceMonitor.cpp b/dbms/src/Common/DiskSpaceMonitor.cpp index 392712e19fb..653692ea603 100644 --- a/dbms/src/Common/DiskSpaceMonitor.cpp +++ b/dbms/src/Common/DiskSpaceMonitor.cpp @@ -95,7 +95,8 @@ bool Disk::tryReserve(UInt64 bytes) const { LOG_DEBUG( &Logger::get("DiskSpaceMonitor"), - "Reserving " << bytes << " bytes on disk " << backQuote(name) << " having unreserved " << unreserved_space << " bytes."); + "Reserving " << formatReadableSizeWithBinarySuffix(bytes) << " on disk " << backQuote(name) + << ", having unreserved " << formatReadableSizeWithBinarySuffix(unreserved_space) << "."); ++reservation_count; reserved_bytes += bytes; return true; @@ -285,14 +286,14 @@ Volume::Volume( max_data_part_size = static_cast(sum_size * ratio / disks.size()); for (size_t i = 0; i < disks.size(); ++i) if (sizes[i] < max_data_part_size) - LOG_WARNING(logger, "Disk " << disks[i]->getName() << " on volume " << config_prefix << - " have not enough space (" << sizes[i] << + LOG_WARNING(logger, "Disk " << backQuote(disks[i]->getName()) << " on volume " << backQuote(config_prefix) << + " have not enough space (" << formatReadableSizeWithBinarySuffix(sizes[i]) << ") for containing part the size of max_data_part_size (" << - max_data_part_size << ")"); + formatReadableSizeWithBinarySuffix(max_data_part_size) << ")"); } constexpr UInt64 MIN_PART_SIZE = 8u * 1024u * 1024u; if (max_data_part_size < MIN_PART_SIZE) - LOG_WARNING(logger, "Volume '" << name << "' max_data_part_size is too low (" + LOG_WARNING(logger, "Volume " << backQuote(name) << " max_data_part_size is too low (" << formatReadableSizeWithBinarySuffix(max_data_part_size) << " < " << formatReadableSizeWithBinarySuffix(MIN_PART_SIZE) << ")"); } @@ -507,7 +508,7 @@ StoragePolicySelector::StoragePolicySelector( 
ErrorCodes::EXCESSIVE_ELEMENT_IN_CONFIG); policies.emplace(name, std::make_shared(name, config, config_prefix + "." + name, disks)); - LOG_INFO(&Logger::get("StoragePolicySelector"), "Storage policy " << name << " loaded"); + LOG_INFO(&Logger::get("StoragePolicySelector"), "Storage policy " << backQuote(name) << " loaded"); } constexpr auto default_storage_policy_name = "default"; From e6f1fdc01134456558c882538f80aae861624eaf Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 23 Sep 2019 02:56:27 +0300 Subject: [PATCH 180/309] Added another test --- .../01014_function_repeat_corner_cases.reference | 13 +++++++++++++ .../01014_function_repeat_corner_cases.sql | 6 ++++++ 2 files changed, 19 insertions(+) create mode 100644 dbms/tests/queries/0_stateless/01014_function_repeat_corner_cases.reference create mode 100644 dbms/tests/queries/0_stateless/01014_function_repeat_corner_cases.sql diff --git a/dbms/tests/queries/0_stateless/01014_function_repeat_corner_cases.reference b/dbms/tests/queries/0_stateless/01014_function_repeat_corner_cases.reference new file mode 100644 index 00000000000..50cc34ce29c --- /dev/null +++ b/dbms/tests/queries/0_stateless/01014_function_repeat_corner_cases.reference @@ -0,0 +1,13 @@ +1000000 +0 + +1 +22 +333 +4444 +55555 +666666 +7777777 +88888888 +999999999 +10101010101010101010 diff --git a/dbms/tests/queries/0_stateless/01014_function_repeat_corner_cases.sql b/dbms/tests/queries/0_stateless/01014_function_repeat_corner_cases.sql new file mode 100644 index 00000000000..53e55a63702 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01014_function_repeat_corner_cases.sql @@ -0,0 +1,6 @@ +SELECT length(repeat('x', 1000000)); +SELECT length(repeat('', 1000000)); +SELECT length(repeat('x', 1000001)); -- { serverError 131 } +SET max_memory_usage = 100000000; +SELECT length(repeat(repeat('Hello, world!', 1000000), 10)); -- { serverError 241 } +SELECT repeat(toString(number), number) FROM system.numbers LIMIT 11; From 7948b0bb7e3bfd6beb9a360c9851535fd4991db5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 23 Sep 2019 02:56:44 +0300 Subject: [PATCH 181/309] Minor modifications --- dbms/src/Common/DiskSpaceMonitor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Common/DiskSpaceMonitor.cpp b/dbms/src/Common/DiskSpaceMonitor.cpp index 653692ea603..c8347b5d106 100644 --- a/dbms/src/Common/DiskSpaceMonitor.cpp +++ b/dbms/src/Common/DiskSpaceMonitor.cpp @@ -47,7 +47,7 @@ std::filesystem::path getMountPoint(std::filesystem::path absolute_path) return absolute_path; } - /// Returns name of filesystem mounted to mount_point +/// Returns name of filesystem mounted to mount_point #if !defined(__linux__) [[noreturn]] #endif @@ -67,7 +67,7 @@ std::string getFilesystemName([[maybe_unused]] const std::string & mount_point) throw DB::Exception("Cannot find name of filesystem by mount point " + mount_point, ErrorCodes::SYSTEM_ERROR); return fs_info.mnt_fsname; #else - throw DB::Exception("Supported on linux only", ErrorCodes::NOT_IMPLEMENTED); + throw DB::Exception("The function getFilesystemName is supported on Linux only", ErrorCodes::NOT_IMPLEMENTED); #endif } From e1613d0704a04e48167d6972119989d94bbe3b94 Mon Sep 17 00:00:00 2001 From: maqroll Date: Mon, 23 Sep 2019 07:27:49 +0000 Subject: [PATCH 182/309] + integration test --- .../test_redirect_url_storage/__init__.py | 0 .../test_redirect_url_storage/test.py | 45 +++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 
dbms/tests/integration/test_redirect_url_storage/__init__.py create mode 100644 dbms/tests/integration/test_redirect_url_storage/test.py diff --git a/dbms/tests/integration/test_redirect_url_storage/__init__.py b/dbms/tests/integration/test_redirect_url_storage/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/integration/test_redirect_url_storage/test.py b/dbms/tests/integration/test_redirect_url_storage/test.py new file mode 100644 index 00000000000..cf64e84b96b --- /dev/null +++ b/dbms/tests/integration/test_redirect_url_storage/test.py @@ -0,0 +1,45 @@ +import pytest +from helpers.hdfs_api import HDFSApi + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node1 = cluster.add_instance('node1', with_zookeeper=False, with_hdfs=True) + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + finally: + cluster.shutdown() + +def test_url_without_redirect(started_cluster): + hdfs_api = HDFSApi("root") + hdfs_api.write_data("/simple_storage", "1\tMark\t72.53\n") + assert hdfs_api.read_data("/simple_storage") == "1\tMark\t72.53\n" + + # access datanode port directly + node1.query("create table WebHDFSStorage (id UInt32, name String, weight Float64) ENGINE = URL('http://hdfs1:50075/webhdfs/v1/simple_storage?op=OPEN&namenoderpcaddress=hdfs1:9000&offset=0', 'TSV')") + assert node1.query("select * from WebHDFSStorage") == "1\tMark\t72.53\n" + +def test_url_with_redirect_not_allowed(started_cluster): + hdfs_api = HDFSApi("root") + hdfs_api.write_data("/simple_storage", "1\tMark\t72.53\n") + assert hdfs_api.read_data("/simple_storage") == "1\tMark\t72.53\n" + + # access proxy port without allowing redirects + node1.query("create table WebHDFSStorageWithoutRedirect (id UInt32, name String, weight Float64) ENGINE = URL('http://hdfs1:50070/webhdfs/v1/simple_storage?op=OPEN&namenoderpcaddress=hdfs1:9000&offset=0', 'TSV')") + with pytest.raises(Exception): + assert node1.query("select * from WebHDFSStorageWithoutRedirect") == "1\tMark\t72.53\n" + +def test_url_with_redirect_allowed(started_cluster): + hdfs_api = HDFSApi("root") + hdfs_api.write_data("/simple_storage", "1\tMark\t72.53\n") + assert hdfs_api.read_data("/simple_storage") == "1\tMark\t72.53\n" + + # access proxy port with allowing redirects + # http://localhost:50070/webhdfs/v1/b?op=OPEN&namenoderpcaddress=hdfs1:9000&offset=0 + node1.query("create table WebHDFSStorageWithRedirect (id UInt32, name String, weight Float64) ENGINE = URL('http://hdfs1:50070/webhdfs/v1/simple_storage?op=OPEN&namenoderpcaddress=hdfs1:9000&offset=0', 'TSV')") + assert node1.query("SET max_http_get_redirects=1; select * from WebHDFSStorageWithRedirect") == "1\tMark\t72.53\n" From c45e7dc747b03a7091cb88da8eb3ebf83ad1fa13 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 23 Sep 2019 07:42:02 +0000 Subject: [PATCH 183/309] Many parts warning and a comment about that. --- dbms/src/IO/WriteBufferFromS3.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 1ef6f3b19a0..9604b6ce199 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -15,6 +15,10 @@ namespace DB { const int DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT = 2; + +// S3 protocol does not allow to have multipart upload with more than 10000 parts. 
+// In case server does not return an error on exceeding that number, we print a warning +// because custom S3 implementation may allow relaxed requirements on that. const int S3_WARN_MAX_PARTS = 10000; @@ -166,7 +170,7 @@ void WriteBufferFromS3::writePart(const String & data) if (part_tags.size() == S3_WARN_MAX_PARTS) { // Don't throw exception here by ourselves but leave the decision to take by S3 server. - LOG_WARNING(&Logger::get("WriteBufferFromS3"), "Maximum part number in S3 protocol has reached (too much parts). Server may not accept this whole upload."); + LOG_WARNING(&Logger::get("WriteBufferFromS3"), "Maximum part number in S3 protocol has reached (too many parts). Server may not accept this whole upload."); } for (int i = 0; i < DEFAULT_S3_MAX_FOLLOW_PUT_REDIRECT; ++i) From e99a87b5b752502ab5a0fe91d148f3b0f675e0be Mon Sep 17 00:00:00 2001 From: maqroll Date: Mon, 23 Sep 2019 08:53:09 +0000 Subject: [PATCH 184/309] update assertResponseIsOk --- dbms/src/IO/HTTPCommon.cpp | 4 ++-- dbms/src/IO/HTTPCommon.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dbms/src/IO/HTTPCommon.cpp b/dbms/src/IO/HTTPCommon.cpp index 26cc96b7d58..de094cd52cd 100644 --- a/dbms/src/IO/HTTPCommon.cpp +++ b/dbms/src/IO/HTTPCommon.cpp @@ -223,11 +223,11 @@ std::istream * receiveResponse( Poco::Net::HTTPClientSession & session, const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, const bool allow_redirects) { auto & istr = session.receiveResponse(response); - assertResponseIsOk(request, response, istr); + assertResponseIsOk(request, response, istr, allow_redirects); return &istr; } -void assertResponseIsOk(const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, std::istream & istr) +void assertResponseIsOk(const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, std::istream & istr, const bool allow_redirects) { auto status = response.getStatus(); diff --git a/dbms/src/IO/HTTPCommon.h b/dbms/src/IO/HTTPCommon.h index 39deaf4e3eb..7592c1c31b3 100644 --- a/dbms/src/IO/HTTPCommon.h +++ b/dbms/src/IO/HTTPCommon.h @@ -59,5 +59,5 @@ bool isRedirect(const Poco::Net::HTTPResponse::HTTPStatus status); */ std::istream * receiveResponse( Poco::Net::HTTPClientSession & session, const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, bool allow_redirects); -void assertResponseIsOk(const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, std::istream & istr); +void assertResponseIsOk(const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, std::istream & istr, const bool allow_redirects = false); } From 6a4a4b5674f58ca91797955e6a31c595cae135e6 Mon Sep 17 00:00:00 2001 From: Sergei Bocharov Date: Thu, 19 Sep 2019 15:31:03 +0300 Subject: [PATCH 185/309] Docs(hash_functions): fix javaHash & hiveHash functions --- .../functions/hash_functions.md | 62 +++++++++++++++++-- .../functions/hash_functions.md | 61 ++++++++++++++++-- 2 files changed, 112 insertions(+), 11 deletions(-) diff --git a/docs/en/query_language/functions/hash_functions.md b/docs/en/query_language/functions/hash_functions.md index b384dead609..8fb7e62aac4 100644 --- a/docs/en/query_language/functions/hash_functions.md +++ b/docs/en/query_language/functions/hash_functions.md @@ -177,16 +177,66 @@ SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:0 ## javaHash {#hash_functions-javahash} -Calculates 
[JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) - from a string. -Accepts a String-type argument. Returns Int32. +Calculates [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) from a string. -## hiveHash +```sql +SELECT javaHash(); +``` + +**Returned value** + +A `Int32` data type hash value. + +Type: `javaHash`. + +**Example** + +Query: + +```sql +SELECT javaHash('Hello, world!'); +``` + +Result: + +```text +┌─javaHash('Hello, world!')─┐ +│ -1880044555 │ +└───────────────────────────┘ +``` + +## hiveHash {#hash_functions-hivehash} + +Calculates `HiveHash` from a string. + +```sql +SELECT hiveHash(); +``` -Calculates HiveHash from a string. -Accepts a String-type argument. Returns Int32. This is just [JavaHash](#hash_functions-javahash) with zeroed out sign bit. This function is used in [Apache Hive](https://en.wikipedia.org/wiki/Apache_Hive) for versions before 3.0. +**Returned value** + +A `Int32` data type hash value. + +Type: `hiveHash`. + +**Example** + +Query: + +```sql +SELECT hiveHash('Hello, world!'); +``` + +Result: + +```text +┌─hiveHash('Hello, world!')─┐ +│ 267439093 │ +└───────────────────────────┘ +``` + ## metroHash64 Produces a 64-bit [MetroHash](http://www.jandrewrogers.com/2015/05/27/metrohash/) hash value. diff --git a/docs/ru/query_language/functions/hash_functions.md b/docs/ru/query_language/functions/hash_functions.md index e171b2bfa38..3ab254b9140 100644 --- a/docs/ru/query_language/functions/hash_functions.md +++ b/docs/ru/query_language/functions/hash_functions.md @@ -180,13 +180,64 @@ SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:0 ## javaHash {#hash_functions-javahash} Вычисляет [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) от строки. -Принимает аргумент типа String. Возвращает значение типа Int32. -## hiveHash +```sql +SELECT javaHash(); +``` -Вычисляет HiveHash от строки. -Принимает аргумент типа String. Возвращает значение типа Int32. -HiveHash — это результат [JavaHash](#hash_functions-javahash) с обнулённым битом знака числа. Функция используется в [Apache Hive](https://en.wikipedia.org/wiki/Apache_Hive) вплоть до версии 3.0. +**Возвращаемое значение** + +Хэш-значение типа `Int32`. + +Тип: `javaHash`. + +**Пример** + +Запрос: + +```sql +SELECT javaHash('Hello, world!'); +``` + +Ответ: + +```text +┌─javaHash('Hello, world!')─┐ +│ -1880044555 │ +└───────────────────────────┘ +``` + +## hiveHash {#hash_functions-hivehash} + +Вычисляет `HiveHash` от строки. + +```sql +SELECT hiveHash(); +``` + +`HiveHash` — это результат [JavaHash](#hash_functions-javahash) с обнулённым битом знака числа. Функция используется в [Apache Hive](https://en.wikipedia.org/wiki/Apache_Hive) вплоть до версии 3.0. + +**Возвращаемое значение** + +Хэш-значение типа `Int32`. + +Тип: `hiveHash`. 
+ +**Пример** + +Запрос: + +```sql +SELECT hiveHash('Hello, world!'); +``` + +Ответ: + +```text +┌─hiveHash('Hello, world!')─┐ +│ 267439093 │ +└───────────────────────────┘ +``` ## metroHash64 From 052c1a562de531dff628a0cac67fa61ead1899cd Mon Sep 17 00:00:00 2001 From: Sergei Bocharov Date: Thu, 19 Sep 2019 16:49:16 +0300 Subject: [PATCH 186/309] Fixes --- docs/en/query_language/functions/hash_functions.md | 4 ++-- docs/ru/query_language/functions/hash_functions.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/en/query_language/functions/hash_functions.md b/docs/en/query_language/functions/hash_functions.md index 8fb7e62aac4..587abd40b92 100644 --- a/docs/en/query_language/functions/hash_functions.md +++ b/docs/en/query_language/functions/hash_functions.md @@ -180,7 +180,7 @@ SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:0 Calculates [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) from a string. ```sql -SELECT javaHash(); +SELECT javaHash(''); ``` **Returned value** @@ -210,7 +210,7 @@ Result: Calculates `HiveHash` from a string. ```sql -SELECT hiveHash(); +SELECT hiveHash(''); ``` This is just [JavaHash](#hash_functions-javahash) with zeroed out sign bit. This function is used in [Apache Hive](https://en.wikipedia.org/wiki/Apache_Hive) for versions before 3.0. diff --git a/docs/ru/query_language/functions/hash_functions.md b/docs/ru/query_language/functions/hash_functions.md index 3ab254b9140..06a28b38a87 100644 --- a/docs/ru/query_language/functions/hash_functions.md +++ b/docs/ru/query_language/functions/hash_functions.md @@ -182,7 +182,7 @@ SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:0 Вычисляет [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) от строки. ```sql -SELECT javaHash(); +SELECT javaHash(''); ``` **Возвращаемое значение** @@ -212,7 +212,7 @@ SELECT javaHash('Hello, world!'); Вычисляет `HiveHash` от строки. ```sql -SELECT hiveHash(); +SELECT hiveHash(''); ``` `HiveHash` — это результат [JavaHash](#hash_functions-javahash) с обнулённым битом знака числа. Функция используется в [Apache Hive](https://en.wikipedia.org/wiki/Apache_Hive) вплоть до версии 3.0. From c051964feea426af5df56db1678979206a57f85b Mon Sep 17 00:00:00 2001 From: Sergei Bocharov Date: Mon, 23 Sep 2019 13:48:21 +0300 Subject: [PATCH 187/309] fixes after review --- docs/en/query_language/functions/hash_functions.md | 2 +- docs/ru/query_language/functions/hash_functions.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/query_language/functions/hash_functions.md b/docs/en/query_language/functions/hash_functions.md index 587abd40b92..b3410dd3b17 100644 --- a/docs/en/query_language/functions/hash_functions.md +++ b/docs/en/query_language/functions/hash_functions.md @@ -177,7 +177,7 @@ SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:0 ## javaHash {#hash_functions-javahash} -Calculates [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) from a string. +Calculates [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) from a string. `JavaHash` does not ensure fast response and quality, so this function should be considered deprecated. 
Use this function if you need to get the hash value using the same algorithm. ```sql SELECT javaHash(''); diff --git a/docs/ru/query_language/functions/hash_functions.md b/docs/ru/query_language/functions/hash_functions.md index 06a28b38a87..4db4fabc30e 100644 --- a/docs/ru/query_language/functions/hash_functions.md +++ b/docs/ru/query_language/functions/hash_functions.md @@ -179,7 +179,7 @@ SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:0 ## javaHash {#hash_functions-javahash} -Вычисляет [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) от строки. +Вычисляет [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) от строки. `JavaHash` не отличается ни скоростью, ни качеством, поэтому эту функцию следует считать устаревшей. Используйте эту функцию, если вам необходимо получить значение хэша по такому же алгоритму. ```sql SELECT javaHash(''); From 7a822ad815f725f275202f57f15b541d9f73e8ac Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Mon, 23 Sep 2019 14:08:56 +0300 Subject: [PATCH 188/309] fix Jinja2 (#7011) --- docs/tools/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt index b216433f772..2c395da402c 100644 --- a/docs/tools/requirements.txt +++ b/docs/tools/requirements.txt @@ -12,7 +12,7 @@ futures==3.1.1 htmlmin==0.1.12 idna==2.6 imagesize==0.7.1 -Jinja2==2.10 +Jinja2==2.10.1 jsmin==2.2.2 livereload==2.5.1 Markdown==2.6.11 From 6b6714761b2cc85c8e28d0cd4df40c90376e06d3 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 23 Sep 2019 15:20:08 +0300 Subject: [PATCH 189/309] Also pack config into shared archive --- docker/packager/binary/build.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docker/packager/binary/build.sh b/docker/packager/binary/build.sh index ed30feb8cb7..b5c50763b17 100755 --- a/docker/packager/binary/build.sh +++ b/docker/packager/binary/build.sh @@ -18,6 +18,10 @@ find . -name '*.so.*' -print -exec mv '{}' /output \; count=`ls -1 /output/*.so 2>/dev/null | wc -l` if [ $count != 0 ] then + mkdir -p /output/config + cp ../dbms/programs/server/config.xml /output/config + cp ../dbms/programs/server/users.xml /output/config + cp -r ../dbms/programs/server/config.d /output/config tar -czvf shared_build.tgz /output rm -r /output/* mv shared_build.tgz /output From f8c0fef919a180031e86008b888ce634bd8ebd90 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 23 Sep 2019 12:41:59 +0000 Subject: [PATCH 190/309] Attempt to make table-level setting `s3_min_upload_part_size`. 
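The minimum multipart part size is no longer read from the server-wide `s3.minimum_upload_part_size` config key; it now comes from the `s3_min_upload_part_size` query setting (default 512 MiB) and is captured when the S3 storage or table function is created. A rough sketch of the intended usage (the endpoint, bucket and schema below are illustrative assumptions, not values from this patch):

```sql
-- Use smaller parts than the 512 MiB default for this upload only.
SET s3_min_upload_part_size = 1000000;

-- Endpoint, bucket and column list are made up for illustration.
INSERT INTO TABLE FUNCTION s3('http://storage.example.com:9001/bucket/test.csv', 'CSV', 'a UInt64, b UInt64, c UInt64')
SELECT number, number + 1, number + 2
FROM system.numbers
LIMIT 1000000;
```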
--- dbms/src/Core/Settings.h | 1 + dbms/src/Storages/StorageS3.cpp | 12 ++++++++---- dbms/src/Storages/StorageS3.h | 2 ++ .../test_storage_s3/configs/min_chunk_size.xml | 5 ----- dbms/tests/integration/test_storage_s3/test.py | 4 ++-- .../tests/integration/test_storage_s3/test_server.py | 2 ++ 6 files changed, 15 insertions(+), 11 deletions(-) delete mode 100644 dbms/tests/integration/test_storage_s3/configs/min_chunk_size.xml diff --git a/dbms/src/Core/Settings.h b/dbms/src/Core/Settings.h index cacaf883fb7..d2c48d7c0a9 100644 --- a/dbms/src/Core/Settings.h +++ b/dbms/src/Core/Settings.h @@ -68,6 +68,7 @@ struct Settings : public SettingsCollection M(SettingUInt64, idle_connection_timeout, 3600, "Close idle TCP connections after specified number of seconds.") \ M(SettingUInt64, distributed_connections_pool_size, DBMS_DEFAULT_DISTRIBUTED_CONNECTIONS_POOL_SIZE, "Maximum number of connections with one remote server in the pool.") \ M(SettingUInt64, connections_with_failover_max_tries, DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES, "The maximum number of attempts to connect to replicas.") \ + M(SettingUInt64, s3_min_upload_part_size, 512*1024*1024, "The mininum size of part to upload during multipart upload to S3.") \ M(SettingBool, extremes, false, "Calculate minimums and maximums of the result columns. They can be output in JSON-formats.") \ M(SettingBool, use_uncompressed_cache, true, "Whether to use the cache of uncompressed blocks.") \ M(SettingBool, replace_running_query, false, "Whether the running request should be canceled with the same id as the new one.") \ diff --git a/dbms/src/Storages/StorageS3.cpp b/dbms/src/Storages/StorageS3.cpp index c135a16775a..ae774030c41 100644 --- a/dbms/src/Storages/StorageS3.cpp +++ b/dbms/src/Storages/StorageS3.cpp @@ -79,13 +79,13 @@ namespace public: StorageS3BlockOutputStream(const Poco::URI & uri, const String & format, + UInt64 min_upload_part_size, const Block & sample_block_, const Context & context, const ConnectionTimeouts & timeouts) : sample_block(sample_block_) { - auto minimum_upload_part_size = context.getConfigRef().getUInt64("s3.minimum_upload_part_size", 512 * 1024 * 1024); - write_buf = std::make_unique(uri, minimum_upload_part_size, timeouts); + write_buf = std::make_unique(uri, min_upload_part_size, timeouts); writer = FormatFactory::instance().getOutput(format, *write_buf, sample_block, context); } @@ -124,6 +124,7 @@ StorageS3::StorageS3( const std::string & database_name_, const std::string & table_name_, const String & format_name_, + UInt64 min_upload_part_size_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, Context & context_) @@ -133,6 +134,7 @@ StorageS3::StorageS3( , format_name(format_name_) , database_name(database_name_) , table_name(table_name_) + , min_upload_part_size(min_upload_part_size_) { setColumns(columns_); setConstraints(constraints_); @@ -171,7 +173,7 @@ void StorageS3::rename(const String & /*new_path_to_db*/, const String & new_dat BlockOutputStreamPtr StorageS3::write(const ASTPtr & /*query*/, const Context & /*context*/) { return std::make_shared( - uri, format_name, getSampleBlock(), context_global, ConnectionTimeouts::getHTTPTimeouts(context_global)); + uri, format_name, min_upload_part_size, getSampleBlock(), context_global, ConnectionTimeouts::getHTTPTimeouts(context_global)); } void registerStorageS3(StorageFactory & factory) @@ -193,7 +195,9 @@ void registerStorageS3(StorageFactory & factory) String format_name = engine_args[1]->as().value.safeGet(); - 
return StorageS3::create(uri, args.database_name, args.table_name, format_name, args.columns, args.constraints, args.context); + UInt64 min_upload_part_size = args.local_context.getSettingsRef().s3_min_upload_part_size; + + return StorageS3::create(uri, args.database_name, args.table_name, format_name, min_upload_part_size, args.columns, args.constraints, args.context); }); } } diff --git a/dbms/src/Storages/StorageS3.h b/dbms/src/Storages/StorageS3.h index 05a69f439d0..65cd65458c6 100644 --- a/dbms/src/Storages/StorageS3.h +++ b/dbms/src/Storages/StorageS3.h @@ -21,6 +21,7 @@ public: const std::string & database_name_, const std::string & table_name_, const String & format_name_, + UInt64 min_upload_part_size_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, Context & context_); @@ -59,6 +60,7 @@ private: String format_name; String database_name; String table_name; + UInt64 min_upload_part_size; }; } diff --git a/dbms/tests/integration/test_storage_s3/configs/min_chunk_size.xml b/dbms/tests/integration/test_storage_s3/configs/min_chunk_size.xml deleted file mode 100644 index 2a9c465a7b8..00000000000 --- a/dbms/tests/integration/test_storage_s3/configs/min_chunk_size.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - 1000000 - - diff --git a/dbms/tests/integration/test_storage_s3/test.py b/dbms/tests/integration/test_storage_s3/test.py index 84f6bf72f60..14ad78d4a4a 100644 --- a/dbms/tests/integration/test_storage_s3/test.py +++ b/dbms/tests/integration/test_storage_s3/test.py @@ -34,7 +34,7 @@ def put_communication_data(started_cluster, body): def started_cluster(): try: cluster = ClickHouseCluster(__file__) - instance = cluster.add_instance("dummy", config_dir="configs", main_configs=["configs/min_chunk_size.xml"]) + instance = cluster.add_instance("dummy") cluster.start() cluster.communication_port = 10000 @@ -150,7 +150,7 @@ def test_multipart_put(started_cluster): put_communication_data(started_cluster, "=== Multipart test ===") long_data = [[i, i+1, i+2] for i in range(100000)] long_values = "".join([ "{},{},{}\n".format(x,y,z) for x, y, z in long_data ]) - put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') format CSV".format(started_cluster.mock_host, started_cluster.multipart_preserving_data_port, started_cluster.bucket, format) + put_query = "set s3_min_upload_part_size = 1000000; insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') format CSV".format(started_cluster.mock_host, started_cluster.multipart_preserving_data_port, started_cluster.bucket, format) run_query(instance, put_query, stdin=long_values) data = get_communication_data(started_cluster) assert "multipart_received_data" in data diff --git a/dbms/tests/integration/test_storage_s3/test_server.py b/dbms/tests/integration/test_storage_s3/test_server.py index 8896af9c23e..08a1904d1f2 100644 --- a/dbms/tests/integration/test_storage_s3/test_server.py +++ b/dbms/tests/integration/test_storage_s3/test_server.py @@ -33,6 +33,7 @@ logging.getLogger().addHandler(logging.StreamHandler()) communication_port = int(sys.argv[1]) bucket = sys.argv[2] + def GetFreeTCPPortsAndIP(n): result = [] sockets = [] @@ -53,6 +54,7 @@ def GetFreeTCPPortsAndIP(n): redirecting_preserving_data_port ), localhost = GetFreeTCPPortsAndIP(5) + data = { "redirecting_to_http_port": redirecting_to_http_port, "preserving_data_port": preserving_data_port, From 87ed508aa49d8fc21d6d84e3507906da74f345ed Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 23 Sep 2019 16:19:55 +0300 Subject: 
[PATCH 191/309] Add image for split build smoke test --- docker/images.json | 1 + docker/test/split_build_smoke_test/Dockerfile | 6 ++++++ docker/test/split_build_smoke_test/run.sh | 16 ++++++++++++++++ 3 files changed, 23 insertions(+) create mode 100644 docker/test/split_build_smoke_test/Dockerfile create mode 100755 docker/test/split_build_smoke_test/run.sh diff --git a/docker/images.json b/docker/images.json index fef364a942f..a3094251a5a 100644 --- a/docker/images.json +++ b/docker/images.json @@ -12,5 +12,6 @@ "docker/test/stateless_with_coverage": "yandex/clickhouse-stateless-test-with-coverage", "docker/test/unit": "yandex/clickhouse-unit-test", "docker/test/stress": "yandex/clickhouse-stress-test", + "docker/test/split_build_smoke_test": "yandex/clickhouse-split-build-smoke-test", "dbms/tests/integration/image": "yandex/clickhouse-integration-tests-runner" } diff --git a/docker/test/split_build_smoke_test/Dockerfile b/docker/test/split_build_smoke_test/Dockerfile new file mode 100644 index 00000000000..c77db1c6c88 --- /dev/null +++ b/docker/test/split_build_smoke_test/Dockerfile @@ -0,0 +1,6 @@ +# docker build -t yandex/clickhouse-split-build-smoke-test . +FROM yandex/clickhouse-binary-builder + +COPY run.sh /run.sh + +CMD /run.sh diff --git a/docker/test/split_build_smoke_test/run.sh b/docker/test/split_build_smoke_test/run.sh new file mode 100755 index 00000000000..436cc7ff1c1 --- /dev/null +++ b/docker/test/split_build_smoke_test/run.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -x + +install_and_run_server() { + tar -xzf package_folder/shared_build.tgz -C package_folder --strip 1 + LD_LIBRARY_PATH=/package_folder /package_folder/clickhouse-server --config /package_folder/config/config.xml >/var/log/clickhouse-server/stderr.log 2>&1 & + sleep 5 +} + +run_client() { + LD_LIBRARY_PATH=/package_folder /package_folder/clickhouse-client --query \"select 'OK'\" 2>/var/log/clickhouse-server/clientstderr.log || echo 'FAIL' +} + +install_and_run_server +run_client From f15bc77778a516d1143784e107a3fe06ec670d5d Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 23 Sep 2019 16:36:20 +0300 Subject: [PATCH 192/309] Fix size of constant columns in case of execution function over LowCardinality without allowed defaults. --- dbms/src/Functions/IFunction.cpp | 48 +++++++++++++++++++------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/dbms/src/Functions/IFunction.cpp b/dbms/src/Functions/IFunction.cpp index a86ea724f7a..3bcaca3205d 100644 --- a/dbms/src/Functions/IFunction.cpp +++ b/dbms/src/Functions/IFunction.cpp @@ -337,19 +337,43 @@ static ColumnPtr replaceLowCardinalityColumnsByNestedAndGetDictionaryIndexes( size_t num_rows = input_rows_count; ColumnPtr indexes; + /// Find first LowCardinality column and replace it to nested dictionary. for (auto arg : args) { ColumnWithTypeAndName & column = block.getByPosition(arg); if (auto * low_cardinality_column = checkAndGetColumn(column.column.get())) { + /// Single LowCardinality column is supported now. if (indexes) throw Exception("Expected single dictionary argument for function.", ErrorCodes::LOGICAL_ERROR); - indexes = low_cardinality_column->getIndexesPtr(); + auto * low_cardinality_type = checkAndGetDataType(column.type.get()); + + if (!low_cardinality_type) + throw Exception("Incompatible type for low cardinality column: " + column.type->getName(), + ErrorCodes::LOGICAL_ERROR); + + if (can_be_executed_on_default_arguments) + { + /// Normal case, when function can be executed on values's default. 
+ column.column = low_cardinality_column->getDictionary().getNestedColumn(); + indexes = low_cardinality_column->getIndexesPtr(); + } + else + { + /// Special case when default value can't be used. Example: 1 % LowCardinality(Int). + /// LowCardinality always contains default, so 1 % 0 will throw exception in normal case. + auto dict_encoded = low_cardinality_column->getMinimalDictionaryEncodedColumn(0, low_cardinality_column->size()); + column.column = dict_encoded.dictionary; + indexes = dict_encoded.indexes; + } + num_rows = low_cardinality_column->getDictionary().size(); + column.type = low_cardinality_type->getDictionaryType(); } } + /// Change size of constants. for (auto arg : args) { ColumnWithTypeAndName & column = block.getByPosition(arg); @@ -358,26 +382,12 @@ static ColumnPtr replaceLowCardinalityColumnsByNestedAndGetDictionaryIndexes( column.column = column_const->removeLowCardinality()->cloneResized(num_rows); column.type = removeLowCardinality(column.type); } - else if (auto * low_cardinality_column = checkAndGetColumn(column.column.get())) - { - auto * low_cardinality_type = checkAndGetDataType(column.type.get()); - - if (!low_cardinality_type) - throw Exception("Incompatible type for low cardinality column: " + column.type->getName(), - ErrorCodes::LOGICAL_ERROR); - - if (can_be_executed_on_default_arguments) - column.column = low_cardinality_column->getDictionary().getNestedColumn(); - else - { - auto dict_encoded = low_cardinality_column->getMinimalDictionaryEncodedColumn(0, low_cardinality_column->size()); - column.column = dict_encoded.dictionary; - indexes = dict_encoded.indexes; - } - column.type = low_cardinality_type->getDictionaryType(); - } } +#ifndef NDEBUG + block.checkNumberOfRows(); +#endif + return indexes; } From 9261d9d970dd99c85db7d724590d778a22e13f64 Mon Sep 17 00:00:00 2001 From: chertus Date: Mon, 23 Sep 2019 16:55:04 +0300 Subject: [PATCH 193/309] stabilize tests (add order by) --- .../0_stateless/00702_join_on_dups.sql | 8 +++---- .../00847_multiple_join_same_column.sql | 4 +++- .../0_stateless/00849_multiple_comma_join.sql | 21 +++++++++++++------ 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/dbms/tests/queries/0_stateless/00702_join_on_dups.sql b/dbms/tests/queries/0_stateless/00702_join_on_dups.sql index ca774a0a6c3..852378f543f 100644 --- a/dbms/tests/queries/0_stateless/00702_join_on_dups.sql +++ b/dbms/tests/queries/0_stateless/00702_join_on_dups.sql @@ -38,7 +38,7 @@ select s.*, j.* from (select * from X) as s full join (select * from Y) as j on --select X.*, Y.* from X full join Y on (X.id + 1) = (Y.id + 1) order by id; select 'self inner'; -select X.*, s.* from X inner join (select * from X) as s on X.id = s.id order by X.id; +select X.*, s.* from X inner join (select * from X) as s on X.id = s.id order by X.id, X.x_a, s.x_a; select 'self inner nullable'; select X.*, s.* from X inner join (select * from X) as s on X.x_b = s.x_b order by X.id; select 'self inner nullable vs not nullable'; @@ -48,7 +48,7 @@ select 'self inner nullable vs not nullable 2'; select Y.*, s.* from Y inner join (select * from Y) as s on concat('n', Y.y_a) = s.y_b order by id; select 'self left'; -select X.*, s.* from X left join (select * from X) as s on X.id = s.id order by X.id; +select X.*, s.* from X left join (select * from X) as s on X.id = s.id order by X.id, X.x_a, s.x_a; select 'self left nullable'; select X.*, s.* from X left join (select * from X) as s on X.x_b = s.x_b order by X.id; select 'self left nullable vs not nullable'; @@ -58,7 
+58,7 @@ select 'self left nullable vs not nullable 2'; select Y.*, s.* from Y left join (select * from Y) as s on concat('n', Y.y_a) = s.y_b order by id; select 'self right'; -select X.*, s.* from X right join (select * from X) as s on X.id = s.id order by X.id; +select X.*, s.* from X right join (select * from X) as s on X.id = s.id order by X.id, X.x_a, s.x_a; select 'self right nullable'; select X.*, s.* from X right join (select * from X) as s on X.x_b = s.x_b order by X.id; select 'self right nullable vs not nullable'; @@ -67,7 +67,7 @@ select X.*, s.* from X right join (select * from X) as s on X.id = s.x_b order b --select Y.*, s.* from Y right join (select * from Y) as s on concat('n', Y.y_a) = s.y_b order by id; select 'self full'; -select X.*, s.* from X full join (select * from X) as s on X.id = s.id order by X.id; +select X.*, s.* from X full join (select * from X) as s on X.id = s.id order by X.id, X.x_a, s.x_a; select 'self full nullable'; select X.*, s.* from X full join (select * from X) as s on X.x_b = s.x_b order by X.id; select 'self full nullable vs not nullable'; diff --git a/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.sql b/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.sql index 44b3fe202d3..c7f0c6383c2 100644 --- a/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.sql +++ b/dbms/tests/queries/0_stateless/00847_multiple_join_same_column.sql @@ -12,7 +12,9 @@ insert into y values (1,1); select t.a, s.b, s.a, s.b, y.a, y.b from t left join s on (t.a = s.a and t.b = s.b) -left join y on (y.a = s.a and y.b = s.b) format Vertical; +left join y on (y.a = s.a and y.b = s.b) +order by t.a +format Vertical; select t.a, s.b, s.a, s.b, y.a, y.b from t left join s on (t.a = s.a and s.b = t.b) diff --git a/dbms/tests/queries/0_stateless/00849_multiple_comma_join.sql b/dbms/tests/queries/0_stateless/00849_multiple_comma_join.sql index 6fcf4ebb563..d1d247a0174 100644 --- a/dbms/tests/queries/0_stateless/00849_multiple_comma_join.sql +++ b/dbms/tests/queries/0_stateless/00849_multiple_comma_join.sql @@ -37,21 +37,30 @@ INSERT INTO t3_00849 values (1,1), (1, Null); INSERT INTO t4_00849 values (1,1), (1, Null); SELECT 'SELECT * FROM t1, t2'; -SELECT * FROM t1_00849, t2_00849; +SELECT * FROM t1_00849, t2_00849 +ORDER BY t1_00849.a, t2_00849.b; SELECT 'SELECT * FROM t1, t2 WHERE t1.a = t2.a'; -SELECT * FROM t1_00849, t2_00849 WHERE t1_00849.a = t2_00849.a; +SELECT * FROM t1_00849, t2_00849 WHERE t1_00849.a = t2_00849.a +ORDER BY t1_00849.a, t2_00849.b; SELECT 'SELECT t1.a, t2.a FROM t1, t2 WHERE t1.b = t2.b'; SELECT t1_00849.a, t2_00849.b FROM t1_00849, t2_00849 WHERE t1_00849.b = t2_00849.b; SELECT 'SELECT t1.a, t2.b, t3.b FROM t1, t2, t3 WHERE t1.a = t2.a AND t1.a = t3.a'; -SELECT t1_00849.a, t2_00849.b, t3_00849.b FROM t1_00849, t2_00849, t3_00849 WHERE t1_00849.a = t2_00849.a AND t1_00849.a = t3_00849.a; +SELECT t1_00849.a, t2_00849.b, t3_00849.b FROM t1_00849, t2_00849, t3_00849 +WHERE t1_00849.a = t2_00849.a AND t1_00849.a = t3_00849.a +ORDER BY t2_00849.b, t3_00849.b; SELECT 'SELECT t1.a, t2.b, t3.b FROM t1, t2, t3 WHERE t1.b = t2.b AND t1.b = t3.b'; SELECT t1_00849.a, t2_00849.b, t3_00849.b FROM t1_00849, t2_00849, t3_00849 WHERE t1_00849.b = t2_00849.b AND t1_00849.b = t3_00849.b; SELECT 'SELECT t1.a, t2.b, t3.b, t4.b FROM t1, t2, t3, t4 WHERE t1.a = t2.a AND t1.a = t3.a AND t1.a = t4.a'; -SELECT t1_00849.a, t2_00849.b, t3_00849.b, t4_00849.b FROM t1_00849, t2_00849, t3_00849, t4_00849 WHERE t1_00849.a = t2_00849.a AND t1_00849.a 
= t3_00849.a AND t1_00849.a = t4_00849.a; +SELECT t1_00849.a, t2_00849.b, t3_00849.b, t4_00849.b FROM t1_00849, t2_00849, t3_00849, t4_00849 +WHERE t1_00849.a = t2_00849.a AND t1_00849.a = t3_00849.a AND t1_00849.a = t4_00849.a +ORDER BY t2_00849.b, t3_00849.b, t4_00849.b; SELECT 'SELECT t1.a, t2.b, t3.b, t4.b FROM t1, t2, t3, t4 WHERE t1.b = t2.b AND t1.b = t3.b AND t1.b = t4.b'; -SELECT t1_00849.a, t2_00849.b, t3_00849.b, t4_00849.b FROM t1_00849, t2_00849, t3_00849, t4_00849 WHERE t1_00849.b = t2_00849.b AND t1_00849.b = t3_00849.b AND t1_00849.b = t4_00849.b; +SELECT t1_00849.a, t2_00849.b, t3_00849.b, t4_00849.b FROM t1_00849, t2_00849, t3_00849, t4_00849 +WHERE t1_00849.b = t2_00849.b AND t1_00849.b = t3_00849.b AND t1_00849.b = t4_00849.b; SELECT 'SELECT t1.a, t2.b, t3.b, t4.b FROM t1, t2, t3, t4 WHERE t1.a = t2.a AND t2.a = t3.a AND t3.a = t4.a'; -SELECT t1_00849.a, t2_00849.b, t3_00849.b, t4_00849.b FROM t1_00849, t2_00849, t3_00849, t4_00849 WHERE t1_00849.a = t2_00849.a AND t2_00849.a = t3_00849.a AND t3_00849.a = t4_00849.a; +SELECT t1_00849.a, t2_00849.b, t3_00849.b, t4_00849.b FROM t1_00849, t2_00849, t3_00849, t4_00849 +WHERE t1_00849.a = t2_00849.a AND t2_00849.a = t3_00849.a AND t3_00849.a = t4_00849.a +ORDER BY t2_00849.b, t3_00849.b, t4_00849.b; DROP TABLE t1_00849; DROP TABLE t2_00849; From 61b583ccb91fb403eab1700d58085e2bcf49e936 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 23 Sep 2019 17:19:02 +0300 Subject: [PATCH 194/309] Fix size of constant columns in case of execution function over LowCardinality without allowed defaults. --- dbms/src/Functions/IFunction.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Functions/IFunction.cpp b/dbms/src/Functions/IFunction.cpp index 3bcaca3205d..af2a9db02b3 100644 --- a/dbms/src/Functions/IFunction.cpp +++ b/dbms/src/Functions/IFunction.cpp @@ -368,7 +368,7 @@ static ColumnPtr replaceLowCardinalityColumnsByNestedAndGetDictionaryIndexes( indexes = dict_encoded.indexes; } - num_rows = low_cardinality_column->getDictionary().size(); + num_rows = column.column->size(); column.type = low_cardinality_type->getDictionaryType(); } } From 016f3b0a454b3bdb0e3a039e132232617912c371 Mon Sep 17 00:00:00 2001 From: chertus Date: Mon, 23 Sep 2019 17:37:42 +0300 Subject: [PATCH 195/309] fix partial merge join with totals --- dbms/src/Interpreters/IJoin.h | 2 +- dbms/src/Interpreters/MergeJoin.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dbms/src/Interpreters/IJoin.h b/dbms/src/Interpreters/IJoin.h index 9773f7ae979..af6d643cc2b 100644 --- a/dbms/src/Interpreters/IJoin.h +++ b/dbms/src/Interpreters/IJoin.h @@ -25,7 +25,7 @@ public: /// Could be called from different threads in parallel. 
virtual void joinBlock(Block & block) = 0; - virtual bool hasTotals() const { return false; } + virtual bool hasTotals() const = 0; virtual void setTotals(const Block & block) = 0; virtual void joinTotals(Block & block) const = 0; diff --git a/dbms/src/Interpreters/MergeJoin.h b/dbms/src/Interpreters/MergeJoin.h index 72a431c8d59..89165c70890 100644 --- a/dbms/src/Interpreters/MergeJoin.h +++ b/dbms/src/Interpreters/MergeJoin.h @@ -24,6 +24,7 @@ public: void joinBlock(Block &) override; void joinTotals(Block &) const override; void setTotals(const Block &) override; + bool hasTotals() const override { return totals; } size_t getTotalRowCount() const override { return right_blocks_row_count; } private: From ce2fb6d3d104eb5e24d09e256d6c265476985214 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Mon, 23 Sep 2019 17:38:13 +0300 Subject: [PATCH 196/309] Fix build --- cmake/target.cmake | 10 +++++----- contrib/CMakeLists.txt | 4 +++- dbms/src/Functions/registerFunctionsIntrospection.cpp | 3 +-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/cmake/target.cmake b/cmake/target.cmake index 7dbb353db99..eb11fc57cf1 100644 --- a/cmake/target.cmake +++ b/cmake/target.cmake @@ -35,15 +35,15 @@ string(REGEX MATCH "-?[0-9]+(.[0-9]+)?$" COMPILER_POSTFIX ${CMAKE_CXX_COMPILER}) if (OS_LINUX) find_program (LLD_PATH NAMES "lld${COMPILER_POSTFIX}" "lld") - find_program (GOLD_PATH NAMES "gold") + find_program (GOLD_PATH NAMES "ld.gold" "gold") endif() option (LINKER_NAME "Linker name or full path") if (NOT LINKER_NAME) if (COMPILER_CLANG AND LLD_PATH) - set (LINKER_NAME NAMES "lld") + set (LINKER_NAME "lld") elseif (GOLD_PATH) - set (LINKER_NAME NAMES "ld.gold" "gold") + set (LINKER_NAME "gold") endif () endif () @@ -62,8 +62,8 @@ if (CMAKE_CROSSCOMPILING) set (HAS_PRE_1970_EXITCODE "0" CACHE STRING "Result from TRY_RUN" FORCE) set (HAS_PRE_1970_EXITCODE__TRYRUN_OUTPUT "" CACHE STRING "Output from TRY_RUN" FORCE) - set( HAS_POST_2038_EXITCODE "0" CACHE STRING "Result from TRY_RUN" FORCE) - set( HAS_POST_2038_EXITCODE__TRYRUN_OUTPUT "" CACHE STRING "Output from TRY_RUN" FORCE) + set (HAS_POST_2038_EXITCODE "0" CACHE STRING "Result from TRY_RUN" FORCE) + set (HAS_POST_2038_EXITCODE__TRYRUN_OUTPUT "" CACHE STRING "Output from TRY_RUN" FORCE) endif () # Don't know why but CXX_STANDARD doesn't work for cross-compilation diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 54fdc4d69e0..0833614594d 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -193,13 +193,15 @@ if (USE_INTERNAL_PARQUET_LIBRARY_NATIVE_CMAKE) endif() else() - if(USE_INTERNAL_SNAPPY_LIBRARY) set(SNAPPY_BUILD_TESTS 0 CACHE INTERNAL "") if (NOT MAKE_STATIC_LIBRARIES) set(BUILD_SHARED_LIBS 1) # TODO: set at root dir endif() + add_subdirectory(snappy) + + set (SNAPPY_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/snappy") if(SANITIZE STREQUAL "undefined") target_compile_options(${SNAPPY_LIBRARY} PRIVATE -fno-sanitize=undefined) endif() diff --git a/dbms/src/Functions/registerFunctionsIntrospection.cpp b/dbms/src/Functions/registerFunctionsIntrospection.cpp index 22160125b00..267c2e0bef9 100644 --- a/dbms/src/Functions/registerFunctionsIntrospection.cpp +++ b/dbms/src/Functions/registerFunctionsIntrospection.cpp @@ -5,7 +5,7 @@ namespace DB class FunctionFactory; -#ifdef defined(OS_LINUX) +#if defined(OS_LINUX) void registerFunctionAddressToSymbol(FunctionFactory & factory); void registerFunctionAddressToLine(FunctionFactory & factory); #endif @@ -23,4 +23,3 @@ void 
registerFunctionsIntrospection(FunctionFactory & factory) } } - From fbe3270dfbcdb82b1bfc8220eb25390a678e2c96 Mon Sep 17 00:00:00 2001 From: stavrolia Date: Mon, 23 Sep 2019 17:50:33 +0300 Subject: [PATCH 197/309] need to mention... --- dbms/src/Storages/StorageFile.cpp | 1 + dbms/src/Storages/StorageHDFS.cpp | 1 + docs/en/operations/table_engines/hdfs.md | 2 +- docs/en/query_language/table_functions/file.md | 2 +- docs/en/query_language/table_functions/hdfs.md | 2 +- docs/ru/operations/table_engines/hdfs.md | 2 +- docs/ru/query_language/table_functions/file.md | 2 +- docs/ru/query_language/table_functions/hdfs.md | 2 +- 8 files changed, 8 insertions(+), 6 deletions(-) diff --git a/dbms/src/Storages/StorageFile.cpp b/dbms/src/Storages/StorageFile.cpp index 79534c1c6b0..4168c577e62 100644 --- a/dbms/src/Storages/StorageFile.cpp +++ b/dbms/src/Storages/StorageFile.cpp @@ -87,6 +87,7 @@ std::vector LSWithRegexpMatching(const std::string & path_for_ls, c if (re2::RE2::FullMatch(file_name, matcher)) { Strings result_part = LSWithRegexpMatching(full_path + "/", suffix_with_globs.substr(next_slash)); + /// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check. std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); } } diff --git a/dbms/src/Storages/StorageHDFS.cpp b/dbms/src/Storages/StorageHDFS.cpp index 2a029463706..21beb4f79b4 100644 --- a/dbms/src/Storages/StorageHDFS.cpp +++ b/dbms/src/Storages/StorageHDFS.cpp @@ -171,6 +171,7 @@ Strings LSWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, c if (re2::RE2::FullMatch(file_name, matcher)) { Strings result_part = LSWithRegexpMatching(full_path + "/", fs, suffix_with_globs.substr(next_slash)); + /// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check. std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); } } diff --git a/docs/en/operations/table_engines/hdfs.md b/docs/en/operations/table_engines/hdfs.md index efab476aded..942e2641da1 100644 --- a/docs/en/operations/table_engines/hdfs.md +++ b/docs/en/operations/table_engines/hdfs.md @@ -53,7 +53,7 @@ SELECT * FROM hdfs_engine_table LIMIT 2 Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern. Listing of files determines during `SELECT` (not at `CREATE` moment). -- `*` — Substitutes any number of any characters including none. +- `*` — Substitutes any number of any characters except `/` including none. - `?` — Substitutes any single character. - `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. - `{N..M}` — Substitutes any number in range from N to M including both borders. diff --git a/docs/en/query_language/table_functions/file.md b/docs/en/query_language/table_functions/file.md index de5e679a7b8..11ed8138212 100644 --- a/docs/en/query_language/table_functions/file.md +++ b/docs/en/query_language/table_functions/file.md @@ -55,7 +55,7 @@ SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 U Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern (not only suffix or prefix). -- `*` — Substitutes any number of any characters including none. +- `*` — Substitutes any number of any characters except `/` including none. 
- `?` — Substitutes any single character. - `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. - `{N..M}` — Substitutes any number in range from N to M including both borders. diff --git a/docs/en/query_language/table_functions/hdfs.md b/docs/en/query_language/table_functions/hdfs.md index a438f9a9ae6..8b8ceb7fbf1 100644 --- a/docs/en/query_language/table_functions/hdfs.md +++ b/docs/en/query_language/table_functions/hdfs.md @@ -38,7 +38,7 @@ LIMIT 2 Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern (not only suffix or prefix). -- `*` — Substitutes any number of any characters including none. +- `*` — Substitutes any number of any characters except `/` including none. - `?` — Substitutes any single character. - `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. - `{N..M}` — Substitutes any number in range from N to M including both borders. diff --git a/docs/ru/operations/table_engines/hdfs.md b/docs/ru/operations/table_engines/hdfs.md index fd7006d1d9a..cb38e03ee5a 100644 --- a/docs/ru/operations/table_engines/hdfs.md +++ b/docs/ru/operations/table_engines/hdfs.md @@ -50,7 +50,7 @@ SELECT * FROM hdfs_engine_table LIMIT 2 Шаблоны могут содержаться в нескольких компонентах пути. Обрабатываются только существующие файлы, название которых целиком удовлетворяет шаблону (не только суффиксом или префиксом). -- `*` — Заменяет любое количество любых символов, включая отсутствие символов. +- `*` — Заменяет любое количество любых символов кроме `/`, включая отсутствие символов. - `?` — Заменяет ровно один любой символ. - `{some_string,another_string,yet_another_one}` — Заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. - `{N..M}` — Заменяет любое число в интервале от `N` до `M` включительно. diff --git a/docs/ru/query_language/table_functions/file.md b/docs/ru/query_language/table_functions/file.md index 6fbcebceba0..2f9da603848 100644 --- a/docs/ru/query_language/table_functions/file.md +++ b/docs/ru/query_language/table_functions/file.md @@ -47,7 +47,7 @@ LIMIT 2 Шаблоны могут содержаться в нескольких компонентах пути. Обрабатываются только существующие файлы, название которых целиком удовлетворяет шаблону (не только суффиксом или префиксом). -- `*` — Заменяет любое количество любых символов, включая отсутствие символов. +- `*` — Заменяет любое количество любых символов кроме `/`, включая отсутствие символов. - `?` — Заменяет ровно один любой символ. - `{some_string,another_string,yet_another_one}` — Заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. - `{N..M}` — Заменяет любое число в интервале от `N` до `M` включительно. diff --git a/docs/ru/query_language/table_functions/hdfs.md b/docs/ru/query_language/table_functions/hdfs.md index 58f0accc19d..a40bf668e79 100644 --- a/docs/ru/query_language/table_functions/hdfs.md +++ b/docs/ru/query_language/table_functions/hdfs.md @@ -35,7 +35,7 @@ LIMIT 2 **Шаблоны в пути** -- `*` — Заменяет любое количество любых символов, включая отсутствие символов. +- `*` — Заменяет любое количество любых символов кроме `/`, включая отсутствие символов. - `?` — Заменяет ровно один любой символ. - `{some_string,another_string,yet_another_one}` — Заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. - `{N..M}` — Заменяет любое число в интервале от `N` до `M` включительно. 
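The glob rules documented in this patch compose within a single path, and matching across directory levels needs one glob per level (for example `*/*`), as the new comments in `StorageFile.cpp` and `StorageHDFS.cpp` point out. A hypothetical query using the `file` table function; the directory layout and column types are assumptions for illustration only:

```sql
-- Assumed layout: files part_1.csv ... part_30.csv in 'dir' under the
-- server's user files path. Globs never cross '/', so files in nested
-- subdirectories would need a pattern like 'dir/*/*.csv'.
SELECT count(), sum(column3)
FROM file('dir/part_{1..30}.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32');
```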
From 769d33848b084ec5fd75a46e6ffbf92ebf5ff26e Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 23 Sep 2019 18:10:48 +0300 Subject: [PATCH 198/309] allow specifying column idx in format string --- .../Formats/ParsedTemplateFormatString.cpp | 22 +++++++++++++---- .../Impl/TemplateBlockOutputFormat.cpp | 23 ++++++++++++------ .../Formats/Impl/TemplateRowInputFormat.cpp | 24 ++++--------------- .../00937_template_output_format.sql | 2 +- .../00938_template_input_format.sh | 2 +- 5 files changed, 41 insertions(+), 32 deletions(-) diff --git a/dbms/src/Formats/ParsedTemplateFormatString.cpp b/dbms/src/Formats/ParsedTemplateFormatString.cpp index f89b1756693..c8b8a655475 100644 --- a/dbms/src/Formats/ParsedTemplateFormatString.cpp +++ b/dbms/src/Formats/ParsedTemplateFormatString.cpp @@ -41,6 +41,8 @@ void ParsedTemplateFormatString::parse(const String & format_string, const Colum const char * token_begin = pos; ParserState state = Delimiter; delimiters.emplace_back(); + char * col_idx_end; + std::optional column_idx; for (; *pos; ++pos) { switch (state) @@ -60,8 +62,9 @@ void ParsedTemplateFormatString::parse(const String & format_string, const Colum token_begin = pos; } else - throwInvalidFormat("at pos " + std::to_string(pos - format_string.c_str()) + - ": expected '{' or '$' after '$'", columnsCount()); + throwInvalidFormat("At pos " + std::to_string(pos - format_string.c_str()) + + ": Expected '{' or '$' after '$'" + + ", got \"" + std::string(pos, std::min(end - pos, 16l)) + "\"", columnsCount()); } break; @@ -78,10 +81,21 @@ void ParsedTemplateFormatString::parse(const String & format_string, const Colum state = Delimiter; } else - throwInvalidFormat("Expected ':' or '}' after column name: \"" + column_names.back() + "\"", columnsCount()); + throwInvalidFormat("At pos " + std::to_string(pos - format_string.c_str()) + + ": Expected ':' or '}' after column name \"" + column_names.back() + "\"" + + ", got \"" + std::string(pos, std::min(end - pos, 16l)) + "\"", columnsCount()); token_begin = pos + 1; - format_idx_to_column_idx.emplace_back(idx_by_name(column_names.back())); + column_idx.reset(); + if (!column_names.back().empty()) + { + col_idx_end = nullptr; + errno = 0; + column_idx = strtoull(column_names.back().c_str(), &col_idx_end, 10); + if (col_idx_end != column_names.back().c_str() + column_names.back().size() || errno) + column_idx = idx_by_name(column_names.back()); + } + format_idx_to_column_idx.emplace_back(column_idx); break; case Format: diff --git a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index cbaef1b0012..08d97c3c0d6 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -34,23 +34,29 @@ TemplateBlockOutputFormat::TemplateBlockOutputFormat(const Block & header_, Writ for (size_t i = 0; i < format.format_idx_to_column_idx.size(); ++i) { if (!format.format_idx_to_column_idx[i]) - format.throwInvalidFormat("Output part name cannot be empty, it's a bug.", i); - switch (static_cast(*format.format_idx_to_column_idx[i])) + format.throwInvalidFormat("Output part name cannot be empty.", i); + switch (*format.format_idx_to_column_idx[i]) { - case OutputPart::Data: + case static_cast(OutputPart::Data): data_idx = i; [[fallthrough]]; - case OutputPart::Totals: - case OutputPart::ExtremesMin: - case OutputPart::ExtremesMax: + case static_cast(OutputPart::Totals): + case 
static_cast(OutputPart::ExtremesMin): + case static_cast(OutputPart::ExtremesMax): if (format.formats[i] != ColumnFormat::None) format.throwInvalidFormat("Serialization type for data, totals, min and max must be empty or None", i); break; - default: + case static_cast(OutputPart::Rows): + case static_cast(OutputPart::RowsBeforeLimit): + case static_cast(OutputPart::TimeElapsed): + case static_cast(OutputPart::RowsRead): + case static_cast(OutputPart::BytesRead): if (format.formats[i] == ColumnFormat::None) format.throwInvalidFormat("Serialization type for output part rows, rows_before_limit, time, " "rows_read or bytes_read is not specified", i); break; + default: + format.throwInvalidFormat("Invalid output part", i); } } if (data_idx != 0) @@ -69,6 +75,9 @@ TemplateBlockOutputFormat::TemplateBlockOutputFormat(const Block & header_, Writ { if (!row_format.format_idx_to_column_idx[i]) row_format.throwInvalidFormat("Cannot skip format field for output, it's a bug.", i); + if (header_.columns() <= *row_format.format_idx_to_column_idx[i]) + row_format.throwInvalidFormat("Column index " + std::to_string(*row_format.format_idx_to_column_idx[i]) + + " must be less then number of columns (" + std::to_string(header_.columns()) + ")", i); if (row_format.formats[i] == ColumnFormat::None) row_format.throwInvalidFormat("Serialization type for file column is not specified", i); } diff --git a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index c2145f88e69..617e9124e83 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -29,16 +29,6 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer { if (partName == "data") return 0; - else if (partName.empty()) /// For skipping some values in prefix and suffix -#if !__clang__ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif - /// Suppress false-positive warning (bug in GCC 9: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86465) - return {}; -#if !__clang__ -#pragma GCC diagnostic pop -#endif throw Exception("Unknown input part " + partName, ErrorCodes::SYNTAX_ERROR); }); @@ -48,6 +38,8 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer { if (format.format_idx_to_column_idx[i]) { + if (*format.format_idx_to_column_idx[i] != 0) + format.throwInvalidFormat("Invalid input part", i); if (has_data) format.throwInvalidFormat("${data} can occur only once", i); if (format.formats[i] != ColumnFormat::None) @@ -65,15 +57,6 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer /// Parse format string for rows row_format = ParsedTemplateFormatString(settings.template_settings.row_format, [&](const String & colName) -> std::optional { - if (colName.empty()) -#if !__clang__ - #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif - return {}; -#if !__clang__ -#pragma GCC diagnostic pop -#endif return header_.getPositionByName(colName); }); @@ -86,6 +69,9 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer if (row_format.format_idx_to_column_idx[i]) { + if (header_.columns() <= *row_format.format_idx_to_column_idx[i]) + row_format.throwInvalidFormat("Column index " + std::to_string(*row_format.format_idx_to_column_idx[i]) + + " must be less then number of columns (" + std::to_string(header_.columns()) + ")", i); if 
(row_format.formats[i] == ColumnFormat::None) row_format.throwInvalidFormat("Column is not skipped, but deserialization type is None", i); diff --git a/dbms/tests/queries/0_stateless/00937_template_output_format.sql b/dbms/tests/queries/0_stateless/00937_template_output_format.sql index 40312272ccb..8c3865ad0fa 100644 --- a/dbms/tests/queries/0_stateless/00937_template_output_format.sql +++ b/dbms/tests/queries/0_stateless/00937_template_output_format.sql @@ -6,7 +6,7 @@ INSERT INTO template VALUES SELECT * FROM template GROUP BY s1, s2, `s 3`, "s 4", n, d WITH TOTALS ORDER BY n LIMIT 4 FORMAT Template SETTINGS extremes = 1, format_schema = '{prefix} \n${data:None}\n------\n${totals:}\n------\n${min}\n------\n${max}\n${rows:Escaped} rows\nbefore limit ${rows_before_limit:XML}\nread ${rows_read:Escaped} $$ suffix $$', -format_schema_rows = 'n:\t${n:JSON}, s1:\t${s1:Escaped}, s2:\t${s2:Quoted}, s3:\t${`s 3`:JSON}, s4:\t${"s 4":CSV}, d:\t${d:Escaped}, n:\t${n:Raw}\t', +format_schema_rows = 'n:\t${n:JSON}, s1:\t${0:Escaped}, s2:\t${s2:Quoted}, s3:\t${`s 3`:JSON}, s4:\t${"s 4":CSV}, d:\t${d:Escaped}, n:\t${n:Raw}\t', format_schema_rows_between_delimiter = ';\n'; DROP TABLE template; diff --git a/dbms/tests/queries/0_stateless/00938_template_input_format.sh b/dbms/tests/queries/0_stateless/00938_template_input_format.sh index c33741543e9..63297ab3850 100755 --- a/dbms/tests/queries/0_stateless/00938_template_input_format.sh +++ b/dbms/tests/queries/0_stateless/00938_template_input_format.sh @@ -18,7 +18,7 @@ n: 9876543210, s1: , s2: 'zx\\ncv\\tbn m', s3: \"qwe,rty\", s4: \"as\"\"df'gh\" n: 789, s1: zx\\ncv\\tbn m , s2: 'qwe,rty', s3: \"as\\\"df'gh\", s4: \"\", d: 2016-01-04 $ suffix $" | $CLICKHOUSE_CLIENT --query="INSERT INTO template1 FORMAT Template SETTINGS \ format_schema = '{prefix} \n\${data}\n \$\$ suffix \$\$\n', \ -format_schema_rows = 'n:\t\${n:Escaped}, s1:\t\${s1:Escaped}\t, s2:\t\${s2:Quoted}, s3:\t\${s3:JSON}, s4:\t\${s4:CSV}, d:\t\${d:Escaped}\t', \ +format_schema_rows = 'n:\t\${n:Escaped}, s1:\t\${0:Escaped}\t, s2:\t\${1:Quoted}, s3:\t\${s3:JSON}, s4:\t\${3:CSV}, d:\t\${d:Escaped}\t', \ format_schema_rows_between_delimiter = ';\n'"; $CLICKHOUSE_CLIENT --query="SELECT * FROM template1 ORDER BY n FORMAT CSV"; From 433e97f9e2f4cfe1f2fb3e0753ab8dbfe61d1b39 Mon Sep 17 00:00:00 2001 From: stavrolia Date: Mon, 23 Sep 2019 18:12:43 +0300 Subject: [PATCH 199/309] minor changes --- docs/en/operations/table_engines/hdfs.md | 2 +- docs/en/query_language/table_functions/file.md | 2 +- docs/en/query_language/table_functions/hdfs.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/operations/table_engines/hdfs.md b/docs/en/operations/table_engines/hdfs.md index 942e2641da1..5b983aa4743 100644 --- a/docs/en/operations/table_engines/hdfs.md +++ b/docs/en/operations/table_engines/hdfs.md @@ -53,7 +53,7 @@ SELECT * FROM hdfs_engine_table LIMIT 2 Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern. Listing of files determines during `SELECT` (not at `CREATE` moment). -- `*` — Substitutes any number of any characters except `/` including none. +- `*` — Substitutes any number of any characters except `/` including empty string. - `?` — Substitutes any single character. - `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. - `{N..M}` — Substitutes any number in range from N to M including both borders. 
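The reworded rule has a practical consequence: because `*` also matches the empty string, a pattern such as `data*.tsv` covers a file named exactly `data.tsv` as well. A hypothetical HDFS engine table (the URI and columns are illustrative, not taken from this patch):

```sql
-- '*' also matches the empty string, so this pattern picks up both
-- .../dir/data.tsv and .../dir/data_2019.tsv (but nothing in subdirectories).
CREATE TABLE hdfs_glob_table (id UInt32, name String, weight Float64)
ENGINE = HDFS('hdfs://hdfs1:9000/dir/data*.tsv', 'TSV');

SELECT * FROM hdfs_glob_table LIMIT 2;
```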
diff --git a/docs/en/query_language/table_functions/file.md b/docs/en/query_language/table_functions/file.md index 11ed8138212..2189a32482e 100644 --- a/docs/en/query_language/table_functions/file.md +++ b/docs/en/query_language/table_functions/file.md @@ -55,7 +55,7 @@ SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 U Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern (not only suffix or prefix). -- `*` — Substitutes any number of any characters except `/` including none. +- `*` — Substitutes any number of any characters except `/` including empty string. - `?` — Substitutes any single character. - `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. - `{N..M}` — Substitutes any number in range from N to M including both borders. diff --git a/docs/en/query_language/table_functions/hdfs.md b/docs/en/query_language/table_functions/hdfs.md index 8b8ceb7fbf1..0d9b66e07cd 100644 --- a/docs/en/query_language/table_functions/hdfs.md +++ b/docs/en/query_language/table_functions/hdfs.md @@ -38,7 +38,7 @@ LIMIT 2 Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern (not only suffix or prefix). -- `*` — Substitutes any number of any characters except `/` including none. +- `*` — Substitutes any number of any characters except `/` including empty string. - `?` — Substitutes any single character. - `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. - `{N..M}` — Substitutes any number in range from N to M including both borders. From 2d2bc052e16473505f3811e18df20bada815660e Mon Sep 17 00:00:00 2001 From: BayoNet Date: Mon, 23 Sep 2019 18:31:46 +0300 Subject: [PATCH 200/309] DOCAPI-8530: Code blocks markup fix (#7060) * Typo fix. * Links fix. * Fixed links in docs. * More fixes. 
* docs/en: cleaning some files * docs/en: cleaning data_types * docs/en: cleaning database_engines * docs/en: cleaning development * docs/en: cleaning getting_started * docs/en: cleaning interfaces * docs/en: cleaning operations * docs/en: cleaning query_lamguage * docs/en: cleaning en * docs/ru: cleaning data_types * docs/ru: cleaning index * docs/ru: cleaning database_engines * docs/ru: cleaning development * docs/ru: cleaning general * docs/ru: cleaning getting_started * docs/ru: cleaning interfaces * docs/ru: cleaning operations * docs/ru: cleaning query_language * docs: cleaning interfaces/http * Update docs/en/data_types/array.md decorated ``` Co-Authored-By: BayoNet * Update docs/en/getting_started/example_datasets/nyc_taxi.md fixed typo Co-Authored-By: BayoNet * Update docs/en/getting_started/example_datasets/ontime.md fixed typo Co-Authored-By: BayoNet * Update docs/en/interfaces/formats.md fixed error Co-Authored-By: BayoNet * Update docs/en/operations/table_engines/custom_partitioning_key.md Co-Authored-By: BayoNet * Update docs/en/operations/utils/clickhouse-local.md Co-Authored-By: BayoNet * Update docs/en/query_language/dicts/external_dicts_dict_sources.md Co-Authored-By: BayoNet * Update docs/en/operations/utils/clickhouse-local.md Co-Authored-By: BayoNet * Update docs/en/query_language/functions/json_functions.md Co-Authored-By: BayoNet * Update docs/en/query_language/functions/json_functions.md Co-Authored-By: BayoNet * Update docs/en/query_language/functions/other_functions.md Co-Authored-By: BayoNet * Update docs/en/query_language/functions/other_functions.md Co-Authored-By: BayoNet * Update docs/en/query_language/functions/date_time_functions.md Co-Authored-By: BayoNet * Update docs/en/operations/table_engines/jdbc.md Co-Authored-By: BayoNet * docs: fixed error * docs: fixed error --- docs/en/data_types/array.md | 50 ++---- docs/en/data_types/decimal.md | 20 +-- docs/en/data_types/domains/ipv4.md | 20 +-- docs/en/data_types/domains/ipv6.md | 22 +-- docs/en/data_types/enum.md | 33 ++-- docs/en/data_types/fixedstring.md | 11 +- docs/en/data_types/float.md | 16 +- .../aggregatefunction.md | 2 +- .../nested_data_structures/nested.md | 10 +- docs/en/data_types/nullable.md | 37 +---- .../data_types/special_data_types/nothing.md | 11 +- docs/en/data_types/tuple.md | 24 +-- docs/en/data_types/uuid.md | 33 ++-- docs/en/database_engines/mysql.md | 2 +- docs/en/development/build.md | 48 +++--- docs/en/development/build_osx.md | 27 +-- docs/en/development/tests.md | 28 ++-- docs/en/faq/general.md | 2 +- .../example_datasets/amplab_benchmark.md | 34 ++-- .../example_datasets/criteo.md | 6 +- .../example_datasets/metrica.md | 56 +++---- .../example_datasets/nyc_taxi.md | 49 +++--- .../example_datasets/ontime.md | 46 +++--- .../example_datasets/star_schema.md | 72 ++++---- .../example_datasets/wikistat.md | 8 +- docs/en/getting_started/index.md | 36 ++-- docs/en/index.md | 22 +-- docs/en/interfaces/cli.md | 12 +- docs/en/interfaces/formats.md | 58 +++---- docs/en/interfaces/http.md | 26 +-- docs/en/operations/configuration_files.md | 4 +- .../settings/constraints_on_settings.md | 8 +- docs/en/operations/settings/settings.md | 17 +- docs/en/operations/settings/settings_users.md | 8 +- docs/en/operations/system_tables.md | 14 +- .../table_engines/aggregatingmergetree.md | 8 +- docs/en/operations/table_engines/buffer.md | 2 +- .../table_engines/collapsingmergetree.md | 34 ++-- .../table_engines/custom_partitioning_key.md | 14 +- .../en/operations/table_engines/dictionary.md | 32 +--- 
.../operations/table_engines/distributed.md | 2 +- .../operations/table_engines/external_data.md | 8 +- docs/en/operations/table_engines/file.md | 8 +- .../table_engines/graphitemergetree.md | 2 +- docs/en/operations/table_engines/hdfs.md | 10 +- docs/en/operations/table_engines/jdbc.md | 21 ++- docs/en/operations/table_engines/join.md | 2 +- docs/en/operations/table_engines/kafka.md | 10 +- docs/en/operations/table_engines/mergetree.md | 8 +- docs/en/operations/table_engines/mysql.md | 2 +- docs/en/operations/table_engines/odbc.md | 14 +- docs/en/operations/table_engines/stripelog.md | 6 +- .../table_engines/summingmergetree.md | 12 +- docs/en/operations/table_engines/url.md | 8 +- .../versionedcollapsingmergetree.md | 18 +- docs/en/operations/tips.md | 14 +- docs/en/operations/troubleshooting.md | 22 +-- docs/en/operations/update.md | 8 +- docs/en/operations/utils/clickhouse-copier.md | 2 +- docs/en/operations/utils/clickhouse-local.md | 14 +- .../agg_functions/combinators.md | 2 +- docs/en/query_language/agg_functions/index.md | 21 +-- .../agg_functions/parametric_functions.md | 10 +- .../query_language/agg_functions/reference.md | 88 +++++----- docs/en/query_language/alter.md | 56 +++---- docs/en/query_language/create.md | 18 +- .../dicts/external_dicts_dict_layout.md | 10 +- .../dicts/external_dicts_dict_sources.md | 19 ++- .../functions/arithmetic_functions.md | 4 +- .../functions/array_functions.md | 50 +++--- .../en/query_language/functions/array_join.md | 2 +- .../functions/bitmap_functions.md | 78 ++++----- .../functions/conditional_functions.md | 6 +- .../functions/date_time_functions.md | 18 +- .../functions/ext_dict_functions.md | 10 +- .../functions/functions_for_nulls.md | 97 ++++++----- docs/en/query_language/functions/geo.md | 38 ++--- .../functions/hash_functions.md | 18 +- .../functions/higher_order_functions.md | 34 ++-- .../functions/ip_address_functions.md | 44 ++--- .../functions/json_functions.md | 64 ++++---- .../functions/math_functions.md | 4 +- .../functions/other_functions.md | 155 ++++++++---------- .../functions/rounding_functions.md | 10 +- .../functions/splitting_merging_functions.md | 5 +- .../functions/string_functions.md | 11 +- .../functions/string_replace_functions.md | 16 +- .../functions/type_conversion_functions.md | 31 ++-- .../query_language/functions/url_functions.md | 16 +- .../functions/uuid_functions.md | 33 ++-- .../functions/ym_dict_functions.md | 10 +- docs/en/query_language/insert_into.md | 10 +- docs/en/query_language/misc.md | 36 ++-- docs/en/query_language/operators.md | 40 ++--- docs/en/query_language/select.md | 150 ++++++++--------- docs/en/query_language/syntax.md | 11 +- .../en/query_language/table_functions/file.md | 6 +- .../en/query_language/table_functions/hdfs.md | 4 +- .../query_language/table_functions/input.md | 6 +- .../en/query_language/table_functions/jdbc.md | 6 +- .../query_language/table_functions/mysql.md | 4 +- .../query_language/table_functions/numbers.md | 4 +- .../en/query_language/table_functions/odbc.md | 14 +- .../query_language/table_functions/remote.md | 12 +- docs/en/query_language/table_functions/url.md | 2 +- docs/ru/data_types/array.md | 49 ++---- docs/ru/data_types/decimal.md | 20 +-- docs/ru/data_types/domains/ipv4.md | 10 +- docs/ru/data_types/domains/ipv6.md | 10 +- docs/ru/data_types/enum.md | 33 ++-- docs/ru/data_types/fixedstring.md | 10 +- docs/ru/data_types/float.md | 16 +- .../aggregatefunction.md | 2 +- .../nested_data_structures/nested.md | 10 +- docs/ru/data_types/nullable.md | 37 +---- 
.../data_types/special_data_types/nothing.md | 11 +- docs/ru/data_types/tuple.md | 24 +-- docs/ru/data_types/uuid.md | 29 ++-- docs/ru/database_engines/mysql.md | 4 +- docs/ru/development/style.md | 4 +- docs/ru/faq/general.md | 2 +- .../example_datasets/amplab_benchmark.md | 32 ++-- .../example_datasets/criteo.md | 8 +- .../example_datasets/metrica.md | 56 +++---- .../example_datasets/nyc_taxi.md | 47 +++--- .../example_datasets/ontime.md | 44 ++--- .../example_datasets/star_schema.md | 72 ++++---- .../example_datasets/wikistat.md | 8 +- docs/ru/getting_started/index.md | 44 +++-- docs/ru/index.md | 22 +-- docs/ru/interfaces/cli.md | 10 +- docs/ru/interfaces/formats.md | 52 +++--- docs/ru/interfaces/http.md | 30 ++-- docs/ru/operations/configuration_files.md | 4 +- .../settings/constraints_on_settings.md | 8 +- docs/ru/operations/settings/settings.md | 17 +- .../operations/settings/settings_profiles.md | 2 +- docs/ru/operations/settings/settings_users.md | 8 +- docs/ru/operations/system_tables.md | 14 +- .../table_engines/aggregatingmergetree.md | 8 +- docs/ru/operations/table_engines/buffer.md | 4 +- .../table_engines/collapsingmergetree.md | 35 ++-- .../table_engines/custom_partitioning_key.md | 10 +- .../ru/operations/table_engines/dictionary.md | 32 +--- .../operations/table_engines/distributed.md | 2 +- .../operations/table_engines/external_data.md | 8 +- docs/ru/operations/table_engines/file.md | 8 +- .../table_engines/graphitemergetree.md | 4 +- docs/ru/operations/table_engines/hdfs.md | 10 +- docs/ru/operations/table_engines/jdbc.md | 22 ++- docs/ru/operations/table_engines/join.md | 2 +- docs/ru/operations/table_engines/kafka.md | 6 +- docs/ru/operations/table_engines/merge.md | 8 +- docs/ru/operations/table_engines/mergetree.md | 10 +- docs/ru/operations/table_engines/mysql.md | 2 +- docs/ru/operations/table_engines/odbc.md | 14 +- .../operations/table_engines/replication.md | 2 +- docs/ru/operations/table_engines/stripelog.md | 6 +- .../table_engines/summingmergetree.md | 8 +- docs/ru/operations/table_engines/url.md | 8 +- .../versionedcollapsingmergetree.md | 18 +- docs/ru/operations/tips.md | 14 +- docs/ru/operations/troubleshooting.md | 22 +-- docs/ru/operations/update.md | 8 +- docs/ru/operations/utils/clickhouse-copier.md | 2 +- docs/ru/operations/utils/clickhouse-local.md | 12 +- docs/ru/query_language/agg_functions/index.md | 21 +-- .../agg_functions/parametric_functions.md | 8 +- .../query_language/agg_functions/reference.md | 78 ++++----- docs/ru/query_language/alter.md | 36 ++-- docs/ru/query_language/create.md | 2 +- .../dicts/external_dicts_dict_layout.md | 8 +- .../dicts/external_dicts_dict_sources.md | 18 +- .../functions/arithmetic_functions.md | 4 +- .../functions/array_functions.md | 123 +++++++------- .../ru/query_language/functions/array_join.md | 4 +- .../functions/bitmap_functions.md | 54 +++--- .../functions/conditional_functions.md | 4 +- .../functions/date_time_functions.md | 4 +- .../functions/ext_dict_functions.md | 8 +- .../functions/functions_for_nulls.md | 100 ++++++----- docs/ru/query_language/functions/geo.md | 24 +-- .../functions/hash_functions.md | 18 +- .../functions/higher_order_functions.md | 30 ++-- .../functions/ip_address_functions.md | 20 +-- .../functions/json_functions.md | 64 ++++---- .../functions/math_functions.md | 4 +- .../functions/other_functions.md | 146 ++++++++--------- .../functions/rounding_functions.md | 10 +- .../functions/splitting_merging_functions.md | 4 +- .../functions/string_functions.md | 11 +- 
.../functions/string_replace_functions.md | 16 +- .../functions/type_conversion_functions.md | 21 ++- .../query_language/functions/url_functions.md | 16 +- .../functions/uuid_functions.md | 30 ++-- .../functions/ym_dict_functions.md | 10 +- docs/ru/query_language/insert_into.md | 10 +- docs/ru/query_language/misc.md | 2 +- docs/ru/query_language/operators.md | 40 ++--- docs/ru/query_language/select.md | 134 ++++++++------- docs/ru/query_language/syntax.md | 9 +- docs/ru/query_language/system.md | 6 +- .../ru/query_language/table_functions/file.md | 6 +- .../ru/query_language/table_functions/hdfs.md | 6 +- .../query_language/table_functions/input.md | 6 +- .../ru/query_language/table_functions/jdbc.md | 6 +- .../query_language/table_functions/mysql.md | 4 +- .../query_language/table_functions/numbers.md | 4 +- .../ru/query_language/table_functions/odbc.md | 14 +- .../query_language/table_functions/remote.md | 12 +- docs/ru/query_language/table_functions/url.md | 2 +- 211 files changed, 2254 insertions(+), 2373 deletions(-) diff --git a/docs/en/data_types/array.md b/docs/en/data_types/array.md index 3338d2648d2..4e9c7d5930e 100644 --- a/docs/en/data_types/array.md +++ b/docs/en/data_types/array.md @@ -8,42 +8,34 @@ Array of `T`-type items. You can use a function to create an array: -``` +```sql array(T) ``` You can also use square brackets. -``` +```sql [] ``` Example of creating an array: +```sql +SELECT array(1, 2) AS x, toTypeName(x) ``` -:) SELECT array(1, 2) AS x, toTypeName(x) - -SELECT - [1, 2] AS x, - toTypeName(x) - +```text ┌─x─────┬─toTypeName(array(1, 2))─┐ │ [1,2] │ Array(UInt8) │ └───────┴─────────────────────────┘ - -1 rows in set. Elapsed: 0.002 sec. - -:) SELECT [1, 2] AS x, toTypeName(x) - -SELECT - [1, 2] AS x, - toTypeName(x) - +``` +```sql +SELECT [1, 2] AS x, toTypeName(x) +``` +```text ┌─x─────┬─toTypeName([1, 2])─┐ │ [1,2] │ Array(UInt8) │ └───────┴────────────────────┘ -1 rows in set. Elapsed: 0.002 sec. ``` ## Working with data types @@ -54,31 +46,23 @@ If ClickHouse couldn't determine the data type, it will generate an exception. F Examples of automatic data type detection: -``` -:) SELECT array(1, 2, NULL) AS x, toTypeName(x) - -SELECT - [1, 2, NULL] AS x, - toTypeName(x) - +```sql +SELECT array(1, 2, NULL) AS x, toTypeName(x) +``` +```text ┌─x──────────┬─toTypeName(array(1, 2, NULL))─┐ │ [1,2,NULL] │ Array(Nullable(UInt8)) │ └────────────┴───────────────────────────────┘ - -1 rows in set. Elapsed: 0.002 sec. ``` If you try to create an array of incompatible data types, ClickHouse throws an exception: +```sql +SELECT array(1, 'a') ``` -:) SELECT array(1, 'a') - -SELECT [1, 'a'] - +```text Received exception from server (version 1.1.54388): Code: 386. DB::Exception: Received from localhost:9000, 127.0.0.1. DB::Exception: There is no supertype for types UInt8, String because some of them are String/FixedString and some of them are not. - -0 rows in set. Elapsed: 0.246 sec. ``` diff --git a/docs/en/data_types/decimal.md b/docs/en/data_types/decimal.md index 136297f8d48..262330dcff9 100644 --- a/docs/en/data_types/decimal.md +++ b/docs/en/data_types/decimal.md @@ -51,36 +51,36 @@ Some functions on Decimal return result as Float64 (for example, var or stddev). During calculations on Decimal, integer overflows might happen. Excessive digits in fraction are discarded (not rounded). Excessive digits in integer part will lead to exception. 
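Before looking at the overflow examples below, it can help to confirm which concrete type a `Decimal` literal actually gets. This is a small sketch added for orientation; it reuses `toTypeName`, the same helper the array and tuple examples elsewhere in this patch rely on, and the exact type rendering may differ slightly between server versions:

```sql
SELECT toDecimal32(2, 4) AS x, toTypeName(x)
```

This should report `x` as `Decimal(9, 4)`, that is, a `Decimal32` with scale 4, which is the value the first example below divides by 3.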
-``` +```sql SELECT toDecimal32(2, 4) AS x, x / 3 ``` -``` +```text ┌──────x─┬─divide(toDecimal32(2, 4), 3)─┐ │ 2.0000 │ 0.6666 │ └────────┴──────────────────────────────┘ ``` -``` +```sql SELECT toDecimal32(4.2, 8) AS x, x * x ``` -``` +```text DB::Exception: Scale is out of bounds. ``` -``` +```sql SELECT toDecimal32(4.2, 8) AS x, 6 * x ``` -``` +```text DB::Exception: Decimal math overflow. ``` Overflow checks lead to operations slowdown. If it is known that overflows are not possible, it makes sense to disable checks using `decimal_check_overflow` setting. When checks are disabled and overflow happens, the result will be incorrect: -``` +```sql SET decimal_check_overflow = 0; SELECT toDecimal32(4.2, 8) AS x, 6 * x ``` -``` +```text ┌──────────x─┬─multiply(6, toDecimal32(4.2, 8))─┐ │ 4.20000000 │ -17.74967296 │ └────────────┴──────────────────────────────────┘ @@ -88,10 +88,10 @@ SELECT toDecimal32(4.2, 8) AS x, 6 * x Overflow checks happen not only on arithmetic operations, but also on value comparison: -``` +```sql SELECT toDecimal32(1, 8) < 100 ``` -``` +```text DB::Exception: Can't compare. ``` diff --git a/docs/en/data_types/domains/ipv4.md b/docs/en/data_types/domains/ipv4.md index dd7e08d673e..1ee6903b897 100644 --- a/docs/en/data_types/domains/ipv4.md +++ b/docs/en/data_types/domains/ipv4.md @@ -4,13 +4,13 @@ ### Basic Usage -``` sql +```sql CREATE TABLE hits (url String, from IPv4) ENGINE = MergeTree() ORDER BY url; DESCRIBE TABLE hits; ``` -``` +```text ┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ │ url │ String │ │ │ │ │ │ from │ IPv4 │ │ │ │ │ @@ -19,19 +19,19 @@ DESCRIBE TABLE hits; OR you can use IPv4 domain as a key: -``` sql +```sql CREATE TABLE hits (url String, from IPv4) ENGINE = MergeTree() ORDER BY from; ``` `IPv4` domain supports custom input format as IPv4-strings: -``` sql +```sql INSERT INTO hits (url, from) VALUES ('https://wikipedia.org', '116.253.40.133')('https://clickhouse.yandex', '183.247.232.58')('https://clickhouse.yandex/docs/en/', '116.106.34.242'); SELECT * FROM hits; ``` -``` +```text ┌─url────────────────────────────────┬───────────from─┐ │ https://clickhouse.yandex/docs/en/ │ 116.106.34.242 │ │ https://wikipedia.org │ 116.253.40.133 │ @@ -41,11 +41,11 @@ SELECT * FROM hits; Values are stored in compact binary form: -``` sql +```sql SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; ``` -``` +```text ┌─toTypeName(from)─┬─hex(from)─┐ │ IPv4 │ B7F7E83A │ └──────────────────┴───────────┘ @@ -54,7 +54,7 @@ SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; Domain values are not implicitly convertible to types other than `UInt32`. 
If you want to convert `IPv4` value to a string, you have to do that explicitly with `IPv4NumToString()` function: -``` sql +```sql SELECT toTypeName(s), IPv4NumToString(from) as s FROM hits LIMIT 1; ``` @@ -66,11 +66,11 @@ SELECT toTypeName(s), IPv4NumToString(from) as s FROM hits LIMIT 1; Or cast to a `UInt32` value: -``` sql +```sql SELECT toTypeName(i), CAST(from as UInt32) as i FROM hits LIMIT 1; ``` -``` +```text ┌─toTypeName(CAST(from, 'UInt32'))─┬──────────i─┐ │ UInt32 │ 3086477370 │ └──────────────────────────────────┴────────────┘ diff --git a/docs/en/data_types/domains/ipv6.md b/docs/en/data_types/domains/ipv6.md index 1bfbe3400b5..e2abaff0172 100644 --- a/docs/en/data_types/domains/ipv6.md +++ b/docs/en/data_types/domains/ipv6.md @@ -4,13 +4,13 @@ ### Basic Usage -``` sql +```sql CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY url; DESCRIBE TABLE hits; ``` -``` +```text ┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ │ url │ String │ │ │ │ │ │ from │ IPv6 │ │ │ │ │ @@ -19,19 +19,19 @@ DESCRIBE TABLE hits; OR you can use `IPv6` domain as a key: -``` sql +```sql CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY from; ``` `IPv6` domain supports custom input as IPv6-strings: -``` sql +```sql INSERT INTO hits (url, from) VALUES ('https://wikipedia.org', '2a02:aa08:e000:3100::2')('https://clickhouse.yandex', '2001:44c8:129:2632:33:0:252:2')('https://clickhouse.yandex/docs/en/', '2a02:e980:1e::1'); SELECT * FROM hits; ``` -``` +```text ┌─url────────────────────────────────┬─from──────────────────────────┐ │ https://clickhouse.yandex │ 2001:44c8:129:2632:33:0:252:2 │ │ https://clickhouse.yandex/docs/en/ │ 2a02:e980:1e::1 │ @@ -41,11 +41,11 @@ SELECT * FROM hits; Values are stored in compact binary form: -``` sql +```sql SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; ``` -``` +```text ┌─toTypeName(from)─┬─hex(from)────────────────────────┐ │ IPv6 │ 200144C8012926320033000002520002 │ └──────────────────┴──────────────────────────────────┘ @@ -54,11 +54,11 @@ SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; Domain values are not implicitly convertible to types other than `FixedString(16)`. If you want to convert `IPv6` value to a string, you have to do that explicitly with `IPv6NumToString()` function: -``` sql +```sql SELECT toTypeName(s), IPv6NumToString(from) as s FROM hits LIMIT 1; ``` -``` +```text ┌─toTypeName(IPv6NumToString(from))─┬─s─────────────────────────────┐ │ String │ 2001:44c8:129:2632:33:0:252:2 │ └───────────────────────────────────┴───────────────────────────────┘ @@ -66,11 +66,11 @@ SELECT toTypeName(s), IPv6NumToString(from) as s FROM hits LIMIT 1; Or cast to a `FixedString(16)` value: -``` sql +```sql SELECT toTypeName(i), CAST(from as FixedString(16)) as i FROM hits LIMIT 1; ``` -``` +```text ┌─toTypeName(CAST(from, 'FixedString(16)'))─┬─i───────┐ │ FixedString(16) │ ��� │ └───────────────────────────────────────────┴─────────┘ diff --git a/docs/en/data_types/enum.md b/docs/en/data_types/enum.md index 692cbad39b1..3b27b2e6c7e 100644 --- a/docs/en/data_types/enum.md +++ b/docs/en/data_types/enum.md @@ -26,19 +26,15 @@ ENGINE = TinyLog Column `x` can only store values that are listed in the type definition: `'hello'` or `'world'`. If you try to save any other value, ClickHouse will raise an exception. 8-bit size for this `Enum` is chosen automatically. 
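To see which width was actually chosen, you can inspect the column type directly. This is a small sketch added for illustration; it assumes the `t_enum` table created above, and `DESCRIBE TABLE` is used the same way in the IPv4/IPv6 examples earlier in this patch:

```sql
DESCRIBE TABLE t_enum
```

The `type` column should read `Enum8('hello' = 1, 'world' = 2)`, i.e. the 8-bit variant. The insert statements below then exercise the allowed values.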
```sql -:) INSERT INTO t_enum VALUES ('hello'), ('world'), ('hello') - -INSERT INTO t_enum VALUES - +INSERT INTO t_enum VALUES ('hello'), ('world'), ('hello') +``` +```text Ok. - -3 rows in set. Elapsed: 0.002 sec. - -:) insert into t_enum values('a') - -INSERT INTO t_enum VALUES - - +``` +```sql +INSERT INTO t_enum values('a') +``` +```text Exception on client: Code: 49. DB::Exception: Unknown element 'a' for type Enum('hello' = 1, 'world' = 2) ``` @@ -47,7 +43,8 @@ When you query data from the table, ClickHouse outputs the string values from `E ```sql SELECT * FROM t_enum - +``` +```text ┌─x─────┐ │ hello │ │ world │ @@ -59,7 +56,8 @@ If you need to see the numeric equivalents of the rows, you must cast the `Enum` ```sql SELECT CAST(x, 'Int8') FROM t_enum - +``` +```text ┌─CAST(x, 'Int8')─┐ │ 1 │ │ 2 │ @@ -71,7 +69,8 @@ To create an Enum value in a query, you also need to use `CAST`. ```sql SELECT toTypeName(CAST('a', 'Enum(\'a\' = 1, \'b\' = 2)')) - +``` +```text ┌─toTypeName(CAST('a', 'Enum(\'a\' = 1, \'b\' = 2)'))─┐ │ Enum8('a' = 1, 'b' = 2) │ └─────────────────────────────────────────────────────┘ @@ -85,7 +84,7 @@ Neither the string nor the numeric value in an `Enum` can be [NULL](../query_lan An `Enum` can be contained in [Nullable](nullable.md) type. So if you create a table using the query -``` +```sql CREATE TABLE t_enum_nullable ( x Nullable( Enum8('hello' = 1, 'world' = 2) ) @@ -95,7 +94,7 @@ ENGINE = TinyLog it can store not only `'hello'` and `'world'`, but `NULL`, as well. -``` +```sql INSERT INTO t_enum_nullable Values('hello'),('world'),(NULL) ``` diff --git a/docs/en/data_types/fixedstring.md b/docs/en/data_types/fixedstring.md index 9725c71a779..fd2f1a17a37 100644 --- a/docs/en/data_types/fixedstring.md +++ b/docs/en/data_types/fixedstring.md @@ -4,7 +4,7 @@ A fixed-length string of `N` bytes (neither characters nor code points). To declare a column of `FixedString` type, use the following syntax: -``` +```sql FixedString(N) ``` @@ -30,7 +30,7 @@ When selecting the data, ClickHouse does not remove the null bytes at the end of Let's consider the following table with the single `FixedString(2)` column: -``` +```text ┌─name──┐ │ b │ └───────┘ @@ -38,15 +38,14 @@ Let's consider the following table with the single `FixedString(2)` column: The query `SELECT * FROM FixedStringTable WHERE a = 'b'` does not return any data as a result. We should complement the filter pattern with null bytes. -``` +```sql SELECT * FROM FixedStringTable WHERE a = 'b\0' - +``` +```text ┌─a─┐ │ b │ └───┘ - -1 rows in set. Elapsed: 0.002 sec. ``` This behavior differs from MySQL behavior for the `CHAR` type (where strings are padded with spaces, and the spaces are removed for output). diff --git a/docs/en/data_types/float.md b/docs/en/data_types/float.md index 29ff604bb02..a365c80b4d1 100644 --- a/docs/en/data_types/float.md +++ b/docs/en/data_types/float.md @@ -13,11 +13,11 @@ We recommend that you store data in integer form whenever possible. For example, - Computations with floating-point numbers might produce a rounding error. -``` sql +```sql SELECT 1 - 0.9 ``` -``` +```text ┌───────minus(1, 0.9)─┐ │ 0.09999999999999998 │ └─────────────────────┘ @@ -33,11 +33,11 @@ In contrast to standard SQL, ClickHouse supports the following categories of flo - `Inf` – Infinity. -``` sql +```sql SELECT 0.5 / 0 ``` -``` +```text ┌─divide(0.5, 0)─┐ │ inf │ └────────────────┘ @@ -45,11 +45,11 @@ SELECT 0.5 / 0 - `-Inf` – Negative infinity. 
-``` sql +```sql SELECT -0.5 / 0 ``` -``` +```text ┌─divide(-0.5, 0)─┐ │ -inf │ └─────────────────┘ @@ -57,11 +57,11 @@ SELECT -0.5 / 0 - `NaN` – Not a number. -``` +```sql SELECT 0 / 0 ``` -``` +```text ┌─divide(0, 0)─┐ │ nan │ └──────────────┘ diff --git a/docs/en/data_types/nested_data_structures/aggregatefunction.md b/docs/en/data_types/nested_data_structures/aggregatefunction.md index 95ed4d751e6..46e43ea5d91 100644 --- a/docs/en/data_types/nested_data_structures/aggregatefunction.md +++ b/docs/en/data_types/nested_data_structures/aggregatefunction.md @@ -33,7 +33,7 @@ To insert data, use `INSERT SELECT` with aggregate `-State`- functions. **Function examples** -``` +```sql uniqState(UserID) quantilesState(0.5, 0.9)(SendTiming) ``` diff --git a/docs/en/data_types/nested_data_structures/nested.md b/docs/en/data_types/nested_data_structures/nested.md index 1d90048d626..3083d0f915c 100644 --- a/docs/en/data_types/nested_data_structures/nested.md +++ b/docs/en/data_types/nested_data_structures/nested.md @@ -4,7 +4,7 @@ A nested data structure is like a nested table. The parameters of a nested data Example: -``` sql +```sql CREATE TABLE test.visits ( CounterID UInt32, @@ -35,7 +35,7 @@ In most cases, when working with a nested data structure, its individual columns Example: -``` sql +```sql SELECT Goals.ID, Goals.EventTime @@ -44,7 +44,7 @@ WHERE CounterID = 101500 AND length(Goals.ID) < 5 LIMIT 10 ``` -``` +```text ┌─Goals.ID───────────────────────┬─Goals.EventTime───────────────────────────────────────────────────────────────────────────┐ │ [1073752,591325,591325] │ ['2014-03-17 16:38:10','2014-03-17 16:38:48','2014-03-17 16:42:27'] │ │ [1073752] │ ['2014-03-17 00:28:25'] │ @@ -63,7 +63,7 @@ It is easiest to think of a nested data structure as a set of multiple column ar The only place where a SELECT query can specify the name of an entire nested data structure instead of individual columns is the ARRAY JOIN clause. For more information, see "ARRAY JOIN clause". Example: -``` sql +```sql SELECT Goal.ID, Goal.EventTime @@ -73,7 +73,7 @@ WHERE CounterID = 101500 AND length(Goals.ID) < 5 LIMIT 10 ``` -``` +```text ┌─Goal.ID─┬──────Goal.EventTime─┐ │ 1073752 │ 2014-03-17 16:38:10 │ │ 591325 │ 2014-03-17 16:38:48 │ diff --git a/docs/en/data_types/nullable.md b/docs/en/data_types/nullable.md index 3030f71a9c6..a46bcbcb3f2 100644 --- a/docs/en/data_types/nullable.md +++ b/docs/en/data_types/nullable.md @@ -17,39 +17,20 @@ To store `Nullable` type values in table column, ClickHouse uses a separate file ## Usage example +```sql +CREATE TABLE t_null(x Int8, y Nullable(Int8)) ENGINE TinyLog ``` -:) CREATE TABLE t_null(x Int8, y Nullable(Int8)) ENGINE TinyLog - -CREATE TABLE t_null -( - x Int8, - y Nullable(Int8) -) -ENGINE = TinyLog - -Ok. - -0 rows in set. Elapsed: 0.012 sec. - -:) INSERT INTO t_null VALUES (1, NULL), (2, 3) - -INSERT INTO t_null VALUES - -Ok. - -1 rows in set. Elapsed: 0.007 sec. - -:) SELECT x + y FROM t_null - -SELECT x + y -FROM t_null - +```sql +INSERT INTO t_null VALUES (1, NULL), (2, 3) +``` +```sql +SELECT x + y FROM t_null +``` +```text ┌─plus(x, y)─┐ │ ᴺᵁᴸᴸ │ │ 5 │ └────────────┘ - -2 rows in set. Elapsed: 0.144 sec. 
``` [Original article](https://clickhouse.yandex/docs/en/data_types/nullable/) diff --git a/docs/en/data_types/special_data_types/nothing.md b/docs/en/data_types/special_data_types/nothing.md index 2ccf7a7c72a..d0c34e1fecf 100644 --- a/docs/en/data_types/special_data_types/nothing.md +++ b/docs/en/data_types/special_data_types/nothing.md @@ -7,16 +7,13 @@ For example, literal [NULL](../../query_language/syntax.md#null-literal) has typ The `Nothing` type can also used to denote empty arrays: -```bash -:) SELECT toTypeName(array()) - -SELECT toTypeName([]) - +```sql +SELECT toTypeName(array()) +``` +```text ┌─toTypeName(array())─┐ │ Array(Nothing) │ └─────────────────────┘ - -1 rows in set. Elapsed: 0.062 sec. ``` diff --git a/docs/en/data_types/tuple.md b/docs/en/data_types/tuple.md index d3d3c42cf36..7726c25ce85 100644 --- a/docs/en/data_types/tuple.md +++ b/docs/en/data_types/tuple.md @@ -11,24 +11,19 @@ Tuples can be the result of a query. In this case, for text formats other than J You can use a function to create a tuple: -``` +```sql tuple(T1, T2, ...) ``` Example of creating a tuple: +```sql +SELECT tuple(1,'a') AS x, toTypeName(x) ``` -:) SELECT tuple(1,'a') AS x, toTypeName(x) - -SELECT - (1, 'a') AS x, - toTypeName(x) - +```text ┌─x───────┬─toTypeName(tuple(1, 'a'))─┐ │ (1,'a') │ Tuple(UInt8, String) │ └─────────┴───────────────────────────┘ - -1 rows in set. Elapsed: 0.021 sec. ``` ## Working with data types @@ -37,18 +32,13 @@ When creating a tuple on the fly, ClickHouse automatically detects the type of e Example of automatic data type detection: -``` +```sql SELECT tuple(1, NULL) AS x, toTypeName(x) - -SELECT - (1, NULL) AS x, - toTypeName(x) - +``` +```text ┌─x────────┬─toTypeName(tuple(1, NULL))──────┐ │ (1,NULL) │ Tuple(UInt8, Nullable(Nothing)) │ └──────────┴─────────────────────────────────┘ - -1 rows in set. Elapsed: 0.002 sec. ``` diff --git a/docs/en/data_types/uuid.md b/docs/en/data_types/uuid.md index f66e1895a20..81e6d11435a 100644 --- a/docs/en/data_types/uuid.md +++ b/docs/en/data_types/uuid.md @@ -4,13 +4,13 @@ A universally unique identifier (UUID) is a 16-byte number used to identify reco The example of UUID type value is represented below: -``` +```text 61f0c404-5cb3-11e7-907b-a6006ad3dba0 ``` If you do not specify the UUID column value when inserting a new record, the UUID value is filled with zero: -``` +```text 00000000-0000-0000-0000-000000000000 ``` @@ -24,13 +24,16 @@ To generate the UUID value, ClickHouse provides the [generateUUIDv4](../query_la This example demonstrates creating a table with the UUID type column and inserting a value into the table. -``` sql -:) CREATE TABLE t_uuid (x UUID, y String) ENGINE=TinyLog - -:) INSERT INTO t_uuid SELECT generateUUIDv4(), 'Example 1' - -:) SELECT * FROM t_uuid - +```sql +CREATE TABLE t_uuid (x UUID, y String) ENGINE=TinyLog +``` +```sql +INSERT INTO t_uuid SELECT generateUUIDv4(), 'Example 1' +``` +```sql +SELECT * FROM t_uuid +``` +```text ┌────────────────────────────────────x─┬─y─────────┐ │ 417ddc5d-e556-4d27-95dd-a34d84e46a50 │ Example 1 │ └──────────────────────────────────────┴───────────┘ @@ -40,11 +43,13 @@ This example demonstrates creating a table with the UUID type column and inserti In this example, the UUID column value is not specified when inserting a new record. 
-``` sql -:) INSERT INTO t_uuid (y) VALUES ('Example 2') - -:) SELECT * FROM t_uuid - +```sql +INSERT INTO t_uuid (y) VALUES ('Example 2') +``` +```sql +SELECT * FROM t_uuid +``` +```text ┌────────────────────────────────────x─┬─y─────────┐ │ 417ddc5d-e556-4d27-95dd-a34d84e46a50 │ Example 1 │ │ 00000000-0000-0000-0000-000000000000 │ Example 2 │ diff --git a/docs/en/database_engines/mysql.md b/docs/en/database_engines/mysql.md index 24352c1924c..ba2405de0ea 100644 --- a/docs/en/database_engines/mysql.md +++ b/docs/en/database_engines/mysql.md @@ -55,7 +55,7 @@ All other MySQL data types are converted into [String](../data_types/string.md). Table in MySQL: -``` +```text mysql> USE test; Database changed diff --git a/docs/en/development/build.md b/docs/en/development/build.md index 02cea936c70..9cc5a5c5aa3 100644 --- a/docs/en/development/build.md +++ b/docs/en/development/build.md @@ -3,21 +3,21 @@ ## Install Git and Pbuilder ```bash -sudo apt-get update -sudo apt-get install git pbuilder debhelper lsb-release fakeroot sudo debian-archive-keyring debian-keyring +$ sudo apt-get update +$ sudo apt-get install git pbuilder debhelper lsb-release fakeroot sudo debian-archive-keyring debian-keyring ``` ## Checkout ClickHouse Sources ```bash -git clone --recursive --branch stable https://github.com/yandex/ClickHouse.git -cd ClickHouse +$ git clone --recursive --branch stable https://github.com/yandex/ClickHouse.git +$ cd ClickHouse ``` ## Run Release Script ```bash -./release +$ ./release ``` # How to Build ClickHouse for Development @@ -29,13 +29,13 @@ Only x86_64 with SSE 4.2 is supported. Support for AArch64 is experimental. To test for SSE 4.2, do ```bash -grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported" +$ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported" ``` ## Install Git and CMake ```bash -sudo apt-get install git cmake ninja-build +$ sudo apt-get install git cmake ninja-build ``` Or cmake3 instead of cmake on older systems. @@ -47,10 +47,10 @@ There are several ways to do this. ### Install from a PPA Package ```bash -sudo apt-get install software-properties-common -sudo apt-add-repository ppa:ubuntu-toolchain-r/test -sudo apt-get update -sudo apt-get install gcc-9 g++-9 +$ sudo apt-get install software-properties-common +$ sudo apt-add-repository ppa:ubuntu-toolchain-r/test +$ sudo apt-get update +$ sudo apt-get install gcc-9 g++-9 ``` ### Install from Sources @@ -60,23 +60,25 @@ Look at [utils/ci/build-gcc-from-sources.sh](https://github.com/yandex/ClickHous ## Use GCC 9 for Builds ```bash -export CC=gcc-9 -export CXX=g++-9 +$ export CC=gcc-9 +$ export CXX=g++-9 ``` ## Install Required Libraries from Packages ```bash -sudo apt-get install libicu-dev libreadline-dev gperf +$ sudo apt-get install libicu-dev libreadline-dev gperf ``` ## Checkout ClickHouse Sources ```bash -git clone --recursive git@github.com:yandex/ClickHouse.git -# or: git clone --recursive https://github.com/yandex/ClickHouse.git - -cd ClickHouse +$ git clone --recursive git@github.com:yandex/ClickHouse.git +``` +or +```bash +$ git clone --recursive https://github.com/yandex/ClickHouse.git +$ cd ClickHouse ``` For the latest stable version, switch to the `stable` branch. @@ -84,11 +86,11 @@ For the latest stable version, switch to the `stable` branch. ## Build ClickHouse ```bash -mkdir build -cd build -cmake .. -ninja -cd .. +$ mkdir build +$ cd build +$ cmake .. +$ ninja +$ cd .. ``` To create an executable, run `ninja clickhouse`. 
diff --git a/docs/en/development/build_osx.md b/docs/en/development/build_osx.md index 928f8f88dfd..c6a2be20530 100644 --- a/docs/en/development/build_osx.md +++ b/docs/en/development/build_osx.md @@ -5,22 +5,25 @@ Build should work on Mac OS X 10.12. ## Install Homebrew ```bash -/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" +$ /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" ``` ## Install Required Compilers, Tools, and Libraries ```bash -brew install cmake ninja gcc icu4c openssl libtool gettext readline gperf +$ brew install cmake ninja gcc icu4c openssl libtool gettext readline gperf ``` ## Checkout ClickHouse Sources ```bash -git clone --recursive git@github.com:yandex/ClickHouse.git -# or: git clone --recursive https://github.com/yandex/ClickHouse.git +$ git clone --recursive git@github.com:yandex/ClickHouse.git +``` +or +```bash +$ git clone --recursive https://github.com/yandex/ClickHouse.git -cd ClickHouse +$ cd ClickHouse ``` For the latest stable version, switch to the `stable` branch. @@ -28,11 +31,11 @@ For the latest stable version, switch to the `stable` branch. ## Build ClickHouse ```bash -mkdir build -cd build -cmake .. -DCMAKE_CXX_COMPILER=`which g++-8` -DCMAKE_C_COMPILER=`which gcc-8` -ninja -cd .. +$ mkdir build +$ cd build +$ cmake .. -DCMAKE_CXX_COMPILER=`which g++-8` -DCMAKE_C_COMPILER=`which gcc-8` +$ ninja +$ cd .. ``` ## Caveats @@ -45,7 +48,7 @@ If you intend to run clickhouse-server, make sure to increase the system's maxfi To do so, create the following file: /Library/LaunchDaemons/limit.maxfiles.plist: -``` xml +```xml @@ -70,7 +73,7 @@ To do so, create the following file: ``` Execute the following command: -``` bash +```bash $ sudo chown root:wheel /Library/LaunchDaemons/limit.maxfiles.plist ``` diff --git a/docs/en/development/tests.md b/docs/en/development/tests.md index 881aaf9dd46..e451d17e717 100644 --- a/docs/en/development/tests.md +++ b/docs/en/development/tests.md @@ -86,21 +86,21 @@ Note that all clickhouse tools (server, client, etc) are just symlinks to a sing Alternatively you can install ClickHouse package: either stable release from Yandex repository or you can build package for yourself with `./release` in ClickHouse sources root. Then start the server with `sudo service clickhouse-server start` (or stop to stop the server). Look for logs at `/etc/clickhouse-server/clickhouse-server.log`. 
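Once the server is started, a trivial query is enough to confirm that it accepts connections. This is a sketch added for convenience; it assumes `clickhouse-client` is pointed at the server you just started, and it uses `version()` and `uptime()`, the same functions the testing-environment check below relies on:

```sql
SELECT version(), uptime()
```

If this returns a row with the expected version string, the binary you deployed is the one actually serving queries.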
When ClickHouse is already installed on your system, you can build a new `clickhouse` binary and replace the existing binary: -``` -sudo service clickhouse-server stop -sudo cp ./clickhouse /usr/bin/ -sudo service clickhouse-server start +```bash +$ sudo service clickhouse-server stop +$ sudo cp ./clickhouse /usr/bin/ +$ sudo service clickhouse-server start ``` Also you can stop system clickhouse-server and run your own with the same configuration but with logging to terminal: -``` -sudo service clickhouse-server stop -sudo -u clickhouse /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml +```bash +$ sudo service clickhouse-server stop +$ sudo -u clickhouse /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml ``` Example with gdb: -``` -sudo -u clickhouse gdb --args /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml +```bash +$ sudo -u clickhouse gdb --args /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml ``` If the system clickhouse-server is already running and you don't want to stop it, you can change port numbers in your `config.xml` (or override them in a file in `config.d` directory), provide appropriate data path, and run it. @@ -112,7 +112,7 @@ If the system clickhouse-server is already running and you don't want to stop it Before publishing release as stable we deploy it on testing environment. Testing environment is a cluster that process 1/39 part of [Yandex.Metrica](https://metrica.yandex.com/) data. We share our testing environment with Yandex.Metrica team. ClickHouse is upgraded without downtime on top of existing data. We look at first that data is processed successfully without lagging from realtime, the replication continue to work and there is no issues visible to Yandex.Metrica team. First check can be done in the following way: -``` +```sql SELECT hostName() AS h, any(version()), any(uptime()), max(UTCEventTime), count() FROM remote('example01-01-{1..3}t', merge, hits) WHERE EventDate >= today() - 2 GROUP BY h ORDER BY h; ``` @@ -126,16 +126,16 @@ After deploying to testing environment we run load testing with queries from pro Make sure you have enabled `query_log` on your production cluster. Collect query log for a day or more: -``` -clickhouse-client --query="SELECT DISTINCT query FROM system.query_log WHERE event_date = today() AND query LIKE '%ym:%' AND query NOT LIKE '%system.query_log%' AND type = 2 AND is_initial_query" > queries.tsv +```bash +$ clickhouse-client --query="SELECT DISTINCT query FROM system.query_log WHERE event_date = today() AND query LIKE '%ym:%' AND query NOT LIKE '%system.query_log%' AND type = 2 AND is_initial_query" > queries.tsv ``` This is a way complicated example. `type = 2` will filter queries that are executed successfully. `query LIKE '%ym:%'` is to select relevant queries from Yandex.Metrica. `is_initial_query` is to select only queries that are initiated by client, not by ClickHouse itself (as parts of distributed query processing). 
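For reference, the filter wrapped by the shell one-liner above can also be tried interactively before collecting a full day of queries. The statement below is taken verbatim from that command, only reformatted across lines:

```sql
SELECT DISTINCT query
FROM system.query_log
WHERE event_date = today()
    AND query LIKE '%ym:%'
    AND query NOT LIKE '%system.query_log%'
    AND type = 2
    AND is_initial_query
```

Running it once in `clickhouse-client` makes it easy to check that `query_log` is enabled and that the filter matches the workload you expect, before saving the result to `queries.tsv`.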
`scp` this log to your testing cluster and run it as following: -``` -clickhouse benchmark --concurrency 16 < queries.tsv +```bash +$ clickhouse benchmark --concurrency 16 < queries.tsv ``` (probably you also want to specify a `--user`) diff --git a/docs/en/faq/general.md b/docs/en/faq/general.md index 4cd9189c256..41026e54a08 100644 --- a/docs/en/faq/general.md +++ b/docs/en/faq/general.md @@ -17,7 +17,7 @@ If you use Oracle through the ODBC driver as a source of external dictionaries, **Example** -``` +```sql NLS_LANG=RUSSIAN_RUSSIA.UTF8 ``` diff --git a/docs/en/getting_started/example_datasets/amplab_benchmark.md b/docs/en/getting_started/example_datasets/amplab_benchmark.md index e5d9812a9c3..29c404cedfe 100644 --- a/docs/en/getting_started/example_datasets/amplab_benchmark.md +++ b/docs/en/getting_started/example_datasets/amplab_benchmark.md @@ -7,21 +7,21 @@ Sign up for a free account at . You will need a credit c Run the following in the console: ```bash -sudo apt-get install s3cmd -mkdir tiny; cd tiny; -s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ . -cd .. -mkdir 1node; cd 1node; -s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ . -cd .. -mkdir 5nodes; cd 5nodes; -s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ . -cd .. +$ sudo apt-get install s3cmd +$ mkdir tiny; cd tiny; +$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ . +$ cd .. +$ mkdir 1node; cd 1node; +$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ . +$ cd .. +$ mkdir 5nodes; cd 5nodes; +$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ . +$ cd .. ``` Run the following ClickHouse queries: -``` sql +```sql CREATE TABLE rankings_tiny ( pageURL String, @@ -86,12 +86,12 @@ CREATE TABLE uservisits_5nodes_on_single Go back to the console: ```bash -for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done -for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done -for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done -for i in 1node/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done -for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done -for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done +$ for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done +$ for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done +$ for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done +$ for i in 1node/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client 
--host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done +$ for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done +$ for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done ``` Queries for obtaining data samples: diff --git a/docs/en/getting_started/example_datasets/criteo.md b/docs/en/getting_started/example_datasets/criteo.md index 268e8346d6c..8c57a2b0e9b 100644 --- a/docs/en/getting_started/example_datasets/criteo.md +++ b/docs/en/getting_started/example_datasets/criteo.md @@ -4,14 +4,14 @@ Download the data from hits_v1.tsv -# now create table -clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" -clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, 
ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192" -# import data -cat hits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.hits_v1 FORMAT TSV" --max_insert_block_size=100000 -# optionally you can optimize table -clickhouse-client --query "OPTIMIZE TABLE datasets.hits_v1 FINAL" -clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" +$ curl https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv +$ # now create table +$ clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" +$ clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, 
Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192" +$ # import data +$ cat hits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.hits_v1 FORMAT TSV" --max_insert_block_size=100000 +$ # optionally you can optimize table +$ clickhouse-client --query "OPTIMIZE TABLE datasets.hits_v1 FINAL" +$ clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" ``` **Download and import visits from compressed tsv-file** ```bash -curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv -# now create table -clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" -clickhouse-client --query "CREATE TABLE datasets.visits_v1 ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), Goals Nested(ID UInt32, Serial UInt32, EventTime DateTime, Price Int64, OrderID String, CurrencyID UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign 
String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, TraficSource Nested(ID Int8, SearchEngineID UInt16, AdvEngineID UInt8, PlaceID UInt16, SocialSourceNetworkID UInt8, Domain String, SearchPhrase String, SocialSourcePage String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), Market Nested(Type UInt8, GoalID UInt32, OrderID String, OrderPrice Int64, PP UInt32, DirectPlaceID UInt32, DirectOrderID UInt32, DirectBannerID UInt32, GoodID String, GoodName String, GoodQuantity Int32, GoodPrice Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign)" -# import data -cat visits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.visits_v1 FORMAT TSV" --max_insert_block_size=100000 -# optionally you can optimize table -clickhouse-client --query "OPTIMIZE TABLE datasets.visits_v1 FINAL" -clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" +$ curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv +$ # now create table +$ clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" +$ clickhouse-client --query "CREATE TABLE datasets.visits_v1 ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), Goals Nested(ID 
UInt32, Serial UInt32, EventTime DateTime, Price Int64, OrderID String, CurrencyID UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, TraficSource Nested(ID Int8, SearchEngineID UInt16, AdvEngineID UInt8, PlaceID UInt16, SocialSourceNetworkID UInt8, Domain String, SearchPhrase String, SocialSourcePage String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), Market Nested(Type UInt8, GoalID UInt32, OrderID String, OrderPrice Int64, PP UInt32, DirectPlaceID UInt32, DirectOrderID UInt32, DirectBannerID UInt32, GoodID String, GoodName String, GoodQuantity Int32, GoodPrice Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign)" +$ # import data +$ cat visits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.visits_v1 FORMAT TSV" --max_insert_block_size=100000 +$ # optionally you can optimize table +$ clickhouse-client --query "OPTIMIZE TABLE datasets.visits_v1 FINAL" +$ clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" ``` ## Queries diff --git a/docs/en/getting_started/example_datasets/nyc_taxi.md b/docs/en/getting_started/example_datasets/nyc_taxi.md index ed073fc85a6..a3226e3d28a 100644 --- a/docs/en/getting_started/example_datasets/nyc_taxi.md +++ b/docs/en/getting_started/example_datasets/nyc_taxi.md @@ -29,8 +29,8 @@ It takes about 20-30 minutes to process each month's worth of data in PostgreSQL You can check the number of downloaded rows as follows: -``` -time psql nyc-taxi-data -c "SELECT count(*) FROM trips;" +```bash +$ time psql nyc-taxi-data -c "SELECT count(*) FROM trips;" ## Count 1298979494 (1 row) @@ -44,7 +44,7 @@ The data in PostgreSQL uses 370 GB of space. Exporting the data from PostgreSQL: -``` sql +```sql COPY ( SELECT trips.id, @@ -119,7 +119,7 @@ This takes about 5 hours. The resulting TSV file is 590612904969 bytes. 
Create a temporary table in ClickHouse: -``` sql +```sql CREATE TABLE trips ( trip_id UInt32, @@ -178,8 +178,8 @@ dropoff_puma Nullable(String) It is needed for converting fields to more correct data types and, if possible, to eliminate NULLs. -``` -time clickhouse-client --query="INSERT INTO trips FORMAT TabSeparated" < trips.tsv +```bash +$ time clickhouse-client --query="INSERT INTO trips FORMAT TabSeparated" < trips.tsv real 75m56.214s ``` @@ -196,7 +196,7 @@ To start, we'll create a table on a single server. Later we will make the table Create and populate a summary table: -``` +```sql CREATE TABLE trips_mergetree ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) AS SELECT @@ -263,13 +263,10 @@ To load it faster, you can create the table with the `Log` engine instead of `Me The table uses 126 GB of disk space. +```sql +SELECT formatReadableSize(sum(bytes)) FROM system.parts WHERE table = 'trips_mergetree' AND active ``` -:) SELECT formatReadableSize(sum(bytes)) FROM system.parts WHERE table = 'trips_mergetree' AND active - -SELECT formatReadableSize(sum(bytes)) -FROM system.parts -WHERE (table = 'trips_mergetree') AND active - +```text ┌─formatReadableSize(sum(bytes))─┐ │ 126.18 GiB │ └────────────────────────────────┘ @@ -277,14 +274,14 @@ WHERE (table = 'trips_mergetree') AND active Among other things, you can run the OPTIMIZE query on MergeTree. But it's not required, since everything will be fine without it. -## Dowload of Prepared Partitions +## Download of Prepared Partitions ```bash -curl -O https://clickhouse-datasets.s3.yandex.net/trips_mergetree/partitions/trips_mergetree.tar -tar xvf trips_mergetree.tar -C /var/lib/clickhouse # path to ClickHouse data directory -# check permissions of unpacked data, fix if required -sudo service clickhouse-server restart -clickhouse-client --query "select count(*) from datasets.trips_mergetree" +$ curl -O https://clickhouse-datasets.s3.yandex.net/trips_mergetree/partitions/trips_mergetree.tar +$ tar xvf trips_mergetree.tar -C /var/lib/clickhouse # path to ClickHouse data directory +$ # check permissions of unpacked data, fix if required +$ sudo service clickhouse-server restart +$ clickhouse-client --query "select count(*) from datasets.trips_mergetree" ``` !!!info @@ -296,7 +293,7 @@ clickhouse-client --query "select count(*) from datasets.trips_mergetree" Q1: -``` sql +```sql SELECT cab_type, count(*) FROM trips_mergetree GROUP BY cab_type ``` @@ -304,7 +301,7 @@ SELECT cab_type, count(*) FROM trips_mergetree GROUP BY cab_type Q2: -``` sql +```sql SELECT passenger_count, avg(total_amount) FROM trips_mergetree GROUP BY passenger_count ``` @@ -312,7 +309,7 @@ SELECT passenger_count, avg(total_amount) FROM trips_mergetree GROUP BY passenge Q3: -``` sql +```sql SELECT passenger_count, toYear(pickup_date) AS year, count(*) FROM trips_mergetree GROUP BY passenger_count, year ``` @@ -320,7 +317,7 @@ SELECT passenger_count, toYear(pickup_date) AS year, count(*) FROM trips_mergetr Q4: -``` sql +```sql SELECT passenger_count, toYear(pickup_date) AS year, round(trip_distance) AS distance, count(*) FROM trips_mergetree GROUP BY passenger_count, year, distance @@ -339,19 +336,19 @@ Creating a table on three servers: On each server: -``` +```sql CREATE TABLE default.trips_mergetree_third ( trip_id UInt32, vendor_id Enum8('1' = 1, '2' = 2, 'CMT' = 3, 'VTS' = 4, 'DDS' = 5, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14), pickup_date Date, pickup_datetime DateTime, dropoff_date Date, dropoff_datetime DateTime, 
store_and_fwd_flag UInt8, rate_code_id UInt8, pickup_longitude Float64, pickup_latitude Float64, dropoff_longitude Float64, dropoff_latitude Float64, passenger_count UInt8, trip_distance Float64, fare_amount Float32, extra Float32, mta_tax Float32, tip_amount Float32, tolls_amount Float32, ehail_fee Float32, improvement_surcharge Float32, total_amount Float32, payment_type_ Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4), trip_type UInt8, pickup FixedString(25), dropoff FixedString(25), cab_type Enum8('yellow' = 1, 'green' = 2, 'uber' = 3), pickup_nyct2010_gid UInt8, pickup_ctlabel Float32, pickup_borocode UInt8, pickup_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), pickup_ct2010 FixedString(6), pickup_boroct2010 FixedString(7), pickup_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), pickup_ntacode FixedString(4), pickup_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), pickup_puma UInt16, dropoff_nyct2010_gid UInt8, dropoff_ctlabel Float32, dropoff_borocode UInt8, dropoff_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), dropoff_ct2010 FixedString(6), dropoff_boroct2010 FixedString(7), dropoff_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), dropoff_ntacode FixedString(4), dropoff_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), dropoff_puma UInt16) ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) ``` On the source server: -``` sql +```sql CREATE TABLE trips_mergetree_x3 AS trips_mergetree_third ENGINE = Distributed(perftest, default, trips_mergetree_third, rand()) ``` The following query redistributes data: -``` sql +```sql INSERT INTO trips_mergetree_x3 SELECT * FROM trips_mergetree ``` diff --git a/docs/en/getting_started/example_datasets/ontime.md b/docs/en/getting_started/example_datasets/ontime.md index d076f5b6469..5df0bd1fe5e 100644 --- a/docs/en/getting_started/example_datasets/ontime.md +++ b/docs/en/getting_started/example_datasets/ontime.md @@ -24,7 +24,7 @@ done Creating a table: -``` sql +```sql CREATE TABLE `ontime` ( `Year` UInt16, `Quarter` UInt8, @@ -141,17 +141,17 @@ CREATE TABLE `ontime` ( Loading data: ```bash -for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done +$ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done ``` -## Dowload of Prepared Partitions +## Download of Prepared Partitions ```bash -curl -O https://clickhouse-datasets.s3.yandex.net/ontime/partitions/ontime.tar -tar xvf ontime.tar -C /var/lib/clickhouse # path to ClickHouse data directory -# check permissions of unpacked data, fix if required -sudo service clickhouse-server restart -clickhouse-client --query "select count(*) from datasets.ontime" +$ curl -O https://clickhouse-datasets.s3.yandex.net/ontime/partitions/ontime.tar +$ tar xvf ontime.tar -C /var/lib/clickhouse # path to ClickHouse data directory +$ # check permissions of unpacked data, fix if required +$ sudo service clickhouse-server restart +$ clickhouse-client --query "select count(*) from datasets.ontime" ``` !!!info @@ -162,7 +162,7 @@ clickhouse-client --query "select count(*) from datasets.ontime" Q0. -``` sql +```sql SELECT avg(c1) FROM ( @@ -174,7 +174,7 @@ FROM Q1. The number of flights per day from the year 2000 to 2008 -``` sql +```sql SELECT DayOfWeek, count(*) AS c FROM ontime WHERE Year>=2000 AND Year<=2008 @@ -184,7 +184,7 @@ ORDER BY c DESC; Q2. The number of flights delayed by more than 10 minutes, grouped by the day of the week, for 2000-2008 -``` sql +```sql SELECT DayOfWeek, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year>=2000 AND Year<=2008 @@ -194,7 +194,7 @@ ORDER BY c DESC; Q3. The number of delays by airport for 2000-2008 -``` sql +```sql SELECT Origin, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year>=2000 AND Year<=2008 @@ -205,7 +205,7 @@ LIMIT 10; Q4. The number of delays by carrier for 2007 -``` sql +```sql SELECT Carrier, count(*) FROM ontime WHERE DepDelay>10 AND Year=2007 @@ -215,7 +215,7 @@ ORDER BY count(*) DESC; Q5. 
The percentage of delays by carrier for 2007 -``` sql +```sql SELECT Carrier, c, c2, c*100/c2 as c3 FROM ( @@ -241,7 +241,7 @@ ORDER BY c3 DESC; Better version of the same query: -``` sql +```sql SELECT Carrier, avg(DepDelay>10)*100 AS c3 FROM ontime WHERE Year=2007 @@ -251,7 +251,7 @@ ORDER BY Carrier Q6. The previous request for a broader range of years, 2000-2008 -``` sql +```sql SELECT Carrier, c, c2, c*100/c2 as c3 FROM ( @@ -277,7 +277,7 @@ ORDER BY c3 DESC; Better version of the same query: -``` sql +```sql SELECT Carrier, avg(DepDelay>10)*100 AS c3 FROM ontime WHERE Year>=2000 AND Year<=2008 @@ -287,7 +287,7 @@ ORDER BY Carrier; Q7. Percentage of flights delayed for more than 10 minutes, by year -``` sql +```sql SELECT Year, c1/c2 FROM ( @@ -311,7 +311,7 @@ ORDER BY Year; Better version of the same query: -``` sql +```sql SELECT Year, avg(DepDelay>10) FROM ontime GROUP BY Year @@ -320,7 +320,7 @@ ORDER BY Year; Q8. The most popular destinations by the number of directly connected cities for various year ranges -``` sql +```sql SELECT DestCityName, uniqExact(OriginCityName) AS u F ROM ontime WHERE Year>=2000 and Year<=2010 @@ -331,7 +331,7 @@ LIMIT 10; Q9. -``` sql +```sql SELECT Year, count(*) AS c1 FROM ontime GROUP BY Year; @@ -339,7 +339,7 @@ GROUP BY Year; Q10. -``` sql +```sql SELECT min(Year), max(Year), Carrier, count(*) AS cnt, sum(ArrDelayMinutes>30) AS flights_delayed, @@ -357,7 +357,7 @@ LIMIT 1000; Bonus: -``` sql +```sql SELECT avg(cnt) FROM ( diff --git a/docs/en/getting_started/example_datasets/star_schema.md b/docs/en/getting_started/example_datasets/star_schema.md index 545eaeea6a6..2e66ced7149 100644 --- a/docs/en/getting_started/example_datasets/star_schema.md +++ b/docs/en/getting_started/example_datasets/star_schema.md @@ -2,25 +2,25 @@ Compiling dbgen: -``` -git clone git@github.com:vadimtk/ssb-dbgen.git -cd ssb-dbgen -make +```bash +$ git clone git@github.com:vadimtk/ssb-dbgen.git +$ cd ssb-dbgen +$ make ``` Generating data: -``` -./dbgen -s 1000 -T c -./dbgen -s 1000 -T l -./dbgen -s 1000 -T p -./dbgen -s 1000 -T s -./dbgen -s 1000 -T d +```bash +$ ./dbgen -s 1000 -T c +$ ./dbgen -s 1000 -T l +$ ./dbgen -s 1000 -T p +$ ./dbgen -s 1000 -T s +$ ./dbgen -s 1000 -T d ``` Creating tables in ClickHouse: -``` +```sql CREATE TABLE customer ( C_CUSTKEY UInt32, @@ -85,16 +85,16 @@ ENGINE = MergeTree ORDER BY S_SUPPKEY; Inserting data: -``` -clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl -clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl -clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl -clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl +```bash +$ clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl +$ clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl +$ clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl +$ clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl ``` Converting "star schema" to denormalized "flat schema": -``` +```sql SET max_memory_usage = 20000000000, allow_experimental_multiple_joins_emulation = 1; CREATE TABLE lineorder_flat @@ -112,44 +112,56 @@ ALTER TABLE lineorder_flat DROP COLUMN C_CUSTKEY, DROP COLUMN S_SUPPKEY, DROP CO Running the queries: -``` Q1.1 +```sql SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYear(LO_ORDERDATE) = 1993 AND LO_DISCOUNT BETWEEN 1 AND 3 AND LO_QUANTITY < 25; - +``` Q1.2 +```sql SELECT sum(LO_EXTENDEDPRICE 
* LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYYYYMM(LO_ORDERDATE) = 199401 AND LO_DISCOUNT BETWEEN 4 AND 6 AND LO_QUANTITY BETWEEN 26 AND 35; - +``` Q1.3 +```sql SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toISOWeek(LO_ORDERDATE) = 6 AND toYear(LO_ORDERDATE) = 1994 AND LO_DISCOUNT BETWEEN 5 AND 7 AND LO_QUANTITY BETWEEN 26 AND 35; - +``` Q2.1 +```sql SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_CATEGORY = 'MFGR#12' AND S_REGION = 'AMERICA' GROUP BY year, P_BRAND ORDER BY year, P_BRAND; - +``` Q2.2 +```sql SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_BRAND BETWEEN 'MFGR#2221' AND 'MFGR#2228' AND S_REGION = 'ASIA' GROUP BY year, P_BRAND ORDER BY year, P_BRAND; - +``` Q2.3 +```sql SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_BRAND = 'MFGR#2239' AND S_REGION = 'EUROPE' GROUP BY year, P_BRAND ORDER BY year, P_BRAND; - +``` Q3.1 +```sql SELECT C_NATION, S_NATION, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE C_REGION = 'ASIA' AND S_REGION = 'ASIA' AND year >= 1992 AND year <= 1997 GROUP BY C_NATION, S_NATION, year ORDER BY year asc, revenue desc; - +``` Q3.2 +```sql SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE C_NATION = 'UNITED STATES' AND S_NATION = 'UNITED STATES' AND year >= 1992 AND year <= 1997 GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc; - +``` Q3.3 +```sql SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND year >= 1992 AND year <= 1997 GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc; - +``` Q3.4 +```sql SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND toYYYYMM(LO_ORDERDATE) = '199712' GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc; - +``` Q4.1 +```sql SELECT toYear(LO_ORDERDATE) AS year, C_NATION, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') GROUP BY year, C_NATION ORDER BY year, C_NATION; - +``` Q4.2 +```sql SELECT toYear(LO_ORDERDATE) AS year, S_NATION, P_CATEGORY, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (year = 1997 OR year = 1998) AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') GROUP BY year, S_NATION, P_CATEGORY ORDER BY year, S_NATION, P_CATEGORY; - +``` Q4.3 +```sql SELECT toYear(LO_ORDERDATE) AS year, S_CITY, P_BRAND, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE S_NATION = 'UNITED STATES' AND (year = 1997 OR year = 1998) AND P_CATEGORY = 'MFGR#14' GROUP BY year, S_CITY, P_BRAND ORDER BY year, S_CITY, P_BRAND; ``` diff --git a/docs/en/getting_started/example_datasets/wikistat.md b/docs/en/getting_started/example_datasets/wikistat.md index f81d0525367..b9ab6f184f7 100644 --- a/docs/en/getting_started/example_datasets/wikistat.md +++ b/docs/en/getting_started/example_datasets/wikistat.md @@ -4,7 +4,7 @@ See: Creating a table: -``` sql +```sql CREATE TABLE wikistat ( date Date, @@ -20,9 +20,9 @@ CREATE TABLE wikistat Loading data: ```bash 
-for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt -cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done -ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done +$ for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt +$ cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done +$ ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done ``` diff --git a/docs/en/getting_started/index.md b/docs/en/getting_started/index.md index 8cdbae86e5e..ed7335b748b 100644 --- a/docs/en/getting_started/index.md +++ b/docs/en/getting_started/index.md @@ -18,8 +18,8 @@ Yandex ClickHouse team recommends using official pre-compiled `deb` packages for To install official packages add the Yandex repository in `/etc/apt/sources.list` or in a separate `/etc/apt/sources.list.d/clickhouse.list` file: -``` -deb http://repo.yandex.ru/clickhouse/deb/stable/ main/ +```bash +$ deb http://repo.yandex.ru/clickhouse/deb/stable/ main/ ``` If you want to use the most recent version, replace `stable` with `testing` (this is recommended for your testing environments). @@ -27,10 +27,10 @@ If you want to use the most recent version, replace `stable` with `testing` (thi Then run these commands to actually install packages: ```bash -sudo apt-get install dirmngr # optional -sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 # optional -sudo apt-get update -sudo apt-get install clickhouse-client clickhouse-server +$ sudo apt-get install dirmngr # optional +$ sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 # optional +$ sudo apt-get update +$ sudo apt-get install clickhouse-client clickhouse-server ``` You can also download and install packages manually from here: . 
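Once the server from these packages has been installed and started, one quick way to confirm that the client can reach it, and to see exactly which version was installed, is to ask the server itself; a minimal sketch:

```sql
-- Returns the server version string; the exact value depends on the installed package.
SELECT version();
```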
@@ -42,9 +42,9 @@ Yandex ClickHouse team recommends using official pre-compiled `rpm` packages for First you need to add the official repository: ```bash -sudo yum install yum-utils -sudo rpm --import https://repo.yandex.ru/clickhouse/CLICKHOUSE-KEY.GPG -sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/x86_64 +$ sudo yum install yum-utils +$ sudo rpm --import https://repo.yandex.ru/clickhouse/CLICKHOUSE-KEY.GPG +$ sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/x86_64 ``` If you want to use the most recent version, replace `stable` with `testing` (this is recommended for your testing environments). @@ -52,7 +52,7 @@ If you want to use the most recent version, replace `stable` with `testing` (thi Then run these commands to actually install packages: ```bash -sudo yum install clickhouse-server clickhouse-client +$ sudo yum install clickhouse-server clickhouse-client ``` You can also download and install packages manually from here: . @@ -67,13 +67,13 @@ To manually compile ClickHouse, follow the instructions for [Linux](../developme You can compile packages and install them or use programs without installing packages. Also by building manually you can disable SSE 4.2 requirement or build for AArch64 CPUs. -``` +```text Client: dbms/programs/clickhouse-client Server: dbms/programs/clickhouse-server ``` You'll need to create a data and metadata folders and `chown` them for the desired user. Their paths can be changed in server config (src/dbms/programs/server/config.xml), by default they are: -``` +```text /opt/clickhouse/data/default/ /opt/clickhouse/metadata/default/ ``` @@ -129,18 +129,14 @@ $ ./clickhouse-client ClickHouse client version 0.0.18749. Connecting to localhost:9000. Connected to ClickHouse server version 0.0.18749. - -:) SELECT 1 - +``` +```sql SELECT 1 - +``` +```text ┌─1─┐ │ 1 │ └───┘ - -1 rows in set. Elapsed: 0.003 sec. - -:) ``` **Congratulations, the system works!** diff --git a/docs/en/index.md b/docs/en/index.md index 32bfe2d4fa3..40158f524ec 100644 --- a/docs/en/index.md +++ b/docs/en/index.md @@ -78,22 +78,16 @@ See the difference? For example, the query "count the number of records for each advertising platform" requires reading one "advertising platform ID" column, which takes up 1 byte uncompressed. If most of the traffic was not from advertising platforms, you can expect at least 10-fold compression of this column. When using a quick compression algorithm, data decompression is possible at a speed of at least several gigabytes of uncompressed data per second. In other words, this query can be processed at a speed of approximately several billion rows per second on a single server. This speed is actually achieved in practice.
Example -``` +```bash $ clickhouse-client ClickHouse client version 0.0.52053. Connecting to localhost:9000. Connected to ClickHouse server version 0.0.52053. - -:) SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 - -SELECT -CounterID, -count() -FROM hits -GROUP BY CounterID -ORDER BY count() DESC -LIMIT 20 - +``` +```sql +SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 +``` +```text ┌─CounterID─┬──count()─┐ │ 114208 │ 56057344 │ │ 115080 │ 51619590 │ @@ -116,10 +110,6 @@ LIMIT 20 │ 115079 │ 8837972 │ │ 337234 │ 8205961 │ └───────────┴──────────┘ - -20 rows in set. Elapsed: 0.153 sec. Processed 1.00 billion rows, 4.00 GB (6.53 billion rows/s., 26.10 GB/s.) - -:) ```
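The advertising-platform query described in the text above follows the same pattern; a minimal sketch, assuming the `hits` table from this example also contains an `AdvEngineID` column:

```sql
-- Counts records per advertising platform; only the small AdvEngineID column has to be read.
SELECT AdvEngineID, count() FROM hits GROUP BY AdvEngineID ORDER BY count() DESC;
```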
diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index b6e59c4aa50..b582ab447d2 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -22,14 +22,14 @@ Similar to the HTTP interface, when using the 'query' parameter and sending data Example of using the client to insert data: ```bash -echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; +$ echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; -cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; +$ cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; 3, 'some text', '2016-08-14 00:00:00' 4, 'some more text', '2016-08-14 00:00:01' _EOF -cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; +$ cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; ``` In batch mode, the default data format is TabSeparated. You can set the format in the FORMAT clause of the query. @@ -70,14 +70,14 @@ The command-line client allows passing external data (external temporary tables) You can create a query with parameters and pass values to them from client application. This allows to avoid formatting query with specific dynamic values on client side. For example: ```bash -clickhouse-client --param_parName="[1, 2]" -q "SELECT * FROM table WHERE a = {parName:Array(UInt16)}" +$ clickhouse-client --param_parName="[1, 2]" -q "SELECT * FROM table WHERE a = {parName:Array(UInt16)}" ``` #### Query Syntax {#cli-queries-with-parameters-syntax} Format a query as usual, then place the values that you want to pass from the app parameters to the query in braces in the following format: -``` +```sql {:} ``` @@ -87,7 +87,7 @@ Format a query as usual, then place the values that you want to pass from the ap #### Example ```bash -clickhouse-client --param_tuple_in_tuple="(10, ('dt', 10))" -q "SELECT * FROM table WHERE val = {tuple_in_tuple:Tuple(UInt8, Tuple(String, UInt8))}" +$ clickhouse-client --param_tuple_in_tuple="(10, ('dt', 10))" -q "SELECT * FROM table WHERE val = {tuple_in_tuple:Tuple(UInt8, Tuple(String, UInt8))}" ``` ## Configuring {#interfaces_cli_configuration} diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 67fe9762ffb..88699adbd94 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -47,11 +47,11 @@ The `TabSeparated` format is convenient for processing data using custom program The `TabSeparated` format supports outputting total values (when using WITH TOTALS) and extreme values (when 'extremes' is set to 1). In these cases, the total values and extremes are output after the main data. The main result, total values, and extremes are separated from each other by an empty line. Example: -``` sql +```sql SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT TabSeparated`` ``` -``` +```text 2014-03-17 1406958 2014-03-18 1383658 2014-03-19 1405797 @@ -83,7 +83,7 @@ As an exception, parsing dates with times is also supported in Unix timestamp fo Strings are output with backslash-escaped special characters. The following escape sequences are used for output: `\b`, `\f`, `\r`, `\n`, `\t`, `\0`, `\'`, `\\`. 
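As a small illustration of the output escaping listed above (a sketch, not part of the original examples): a value containing a tab and a line feed is written out as the two-character sequences `\t` and `\n` rather than as raw control characters:

```sql
-- The tab and line feed inside the literal come out as \t and \n in TabSeparated output.
SELECT 'Hello\tworld\nsecond line' AS s FORMAT TabSeparated;
```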
Parsing also supports the sequences `\a`, `\v`, and `\xHH` (hex escape sequences) and any `\c` sequences, where `c` is any character (these sequences are converted to `c`). Thus, reading data supports formats where a line feed can be written as `\n` or `\`, or as a line feed. For example, the string `Hello world` with a line feed between the words instead of a space can be parsed in any of the following variations: -``` +```text Hello\nworld Hello\ @@ -211,7 +211,7 @@ format_schema_rows_between_delimiter = '\n ' ``` `Insert` example: -``` +```text Some header Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1 Page views: 6, User id: 4324182021466249494, Useless field: world, Duration: 185, Sign: 1 @@ -241,7 +241,7 @@ format_schema_rows_between_delimiter = ',' Similar to TabSeparated, but outputs a value in name=value format. Names are escaped the same way as in TabSeparated format, and the = symbol is also escaped. -``` +```text SearchPhrase= count()=8267016 SearchPhrase=bathroom interior design count()=2166 SearchPhrase=yandex count()=1655 @@ -260,7 +260,7 @@ SearchPhrase=baku count()=1000 SELECT * FROM t_null FORMAT TSKV ``` -``` +```text x=1 y=\N ``` @@ -276,8 +276,8 @@ Comma Separated Values format ([RFC](https://tools.ietf.org/html/rfc4180)). When formatting, rows are enclosed in double quotes. A double quote inside a string is output as two double quotes in a row. There are no other rules for escaping characters. Date and date-time are enclosed in double quotes. Numbers are output without quotes. Values are separated by a delimiter character, which is `,` by default. The delimiter character is defined in the setting [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter). Rows are separated using the Unix line feed (LF). Arrays are serialized in CSV as follows: first the array is serialized to a string as in TabSeparated format, and then the resulting string is output to CSV in double quotes. Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost). -``` -clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv +```bash +$ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv ``` *By default, the delimiter is `,`. See the [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter) setting for more information. @@ -300,7 +300,7 @@ Also prints the header row, similar to `TabSeparatedWithNames`. Outputs data in JSON format. Besides data tables, it also outputs column names and types, along with some additional information: the total number of output rows, and the number of rows that could have been output if there weren't a LIMIT. Example: -``` sql +```sql SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTALS ORDER BY c DESC LIMIT 5 FORMAT JSON ``` @@ -445,7 +445,7 @@ When inserting the data, you should provide a separate JSON object for each row. 
### Inserting Data -``` +```sql INSERT INTO UserActivity FORMAT JSONEachRow {"PageViews":5, "UserID":"4324182021466249494", "Duration":146,"Sign":-1} {"UserID":"4324182021466249494","PageViews":6,"Duration":185,"Sign":1} ``` @@ -464,7 +464,7 @@ If `DEFAULT expr` is specified, ClickHouse uses different substitution rules dep Consider the following table: -``` +```sql CREATE TABLE IF NOT EXISTS example_table ( x UInt32, @@ -482,7 +482,7 @@ CREATE TABLE IF NOT EXISTS example_table Consider the `UserActivity` table as an example: -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ -1 │ │ 4324182021466249494 │ 6 │ 185 │ 1 │ @@ -491,7 +491,7 @@ Consider the `UserActivity` table as an example: The query `SELECT * FROM UserActivity FORMAT JSONEachRow` returns: -``` +```text {"UserID":"4324182021466249494","PageViews":5,"Duration":146,"Sign":-1} {"UserID":"4324182021466249494","PageViews":6,"Duration":185,"Sign":1} ``` @@ -576,11 +576,11 @@ Each result block is output as a separate table. This is necessary so that block Example (shown for the [PrettyCompact](#prettycompact) format): -``` sql +```sql SELECT * FROM t_null ``` -``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ └───┴──────┘ @@ -588,11 +588,11 @@ SELECT * FROM t_null Rows are not escaped in Pretty* formats. Example is shown for the [PrettyCompact](#prettycompact) format: -``` sql +```sql SELECT 'String with \'quotes\' and \t character' AS Escaping_test ``` -``` +```text ┌─Escaping_test────────────────────────┐ │ String with 'quotes' and character │ └──────────────────────────────────────┘ @@ -603,11 +603,11 @@ This format is only appropriate for outputting a query result, but not for parsi The Pretty format supports outputting total values (when using WITH TOTALS) and extremes (when 'extremes' is set to 1). In these cases, total values and extreme values are output after the main data, in separate tables. Example (shown for the [PrettyCompact](#prettycompact) format): -``` sql +```sql SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT PrettyCompact ``` -``` +```text ┌──EventDate─┬───────c─┐ │ 2014-03-17 │ 1406958 │ │ 2014-03-18 │ 1383658 │ @@ -646,7 +646,7 @@ Differs from Pretty in that ANSI-escape sequences aren't used. This is necessary Example: ```bash -watch -n1 "clickhouse-client --query='SELECT event, value FROM system.events FORMAT PrettyCompactNoEscapes'" +$ watch -n1 "clickhouse-client --query='SELECT event, value FROM system.events FORMAT PrettyCompactNoEscapes'" ``` You can use the HTTP interface for displaying in the browser. @@ -702,11 +702,11 @@ Prints each value on a separate line with the column name specified. This format Example: -``` sql +```sql SELECT * FROM t_null FORMAT Vertical ``` -``` +```text Row 1: ────── x: 1 @@ -714,11 +714,11 @@ y: ᴺᵁᴸᴸ ``` Rows are not escaped in Vertical format: -``` sql +```sql SELECT 'string with \'quotes\' and \t with some special \n characters' AS test FORMAT Vertical ``` -``` +```text Row 1: ────── test: string with 'quotes' and with some special @@ -807,12 +807,12 @@ Cap'n Proto is a binary message format similar to Protocol Buffers and Thrift, b Cap'n Proto messages are strictly typed and not self-describing, meaning they need an external schema description. The schema is applied on the fly and cached for each query. 
```bash -cat capnproto_messages.bin | clickhouse-client --query "INSERT INTO test.hits FORMAT CapnProto SETTINGS format_schema='schema:Message'" +$ cat capnproto_messages.bin | clickhouse-client --query "INSERT INTO test.hits FORMAT CapnProto SETTINGS format_schema='schema:Message'" ``` Where `schema.capnp` looks like this: -``` +```capnp struct Message { SearchPhrase @0 :Text; c @1 :Uint64; @@ -842,7 +842,7 @@ cat protobuf_messages.bin | clickhouse-client --query "INSERT INTO test.table FO where the file `schemafile.proto` looks like this: -``` +```capnp syntax = "proto3"; message MessageType { @@ -859,7 +859,7 @@ If types of a column and a field of Protocol Buffers' message are different the Nested messages are supported. For example, for the field `z` in the following message type -``` +```capnp message MessageType { message XType { message YType { @@ -876,7 +876,7 @@ Nested messages are suitable to input or output a [nested data structures](../da Default values defined in a protobuf schema like this -``` +```capnp syntax = "proto2"; message MessageType { diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md index 80cf72ec0e2..ee05a1cdb64 100644 --- a/docs/en/interfaces/http.md +++ b/docs/en/interfaces/http.md @@ -75,31 +75,31 @@ The POST method of transmitting data is necessary for INSERT queries. In this ca Examples: Creating a table: ```bash -echo 'CREATE TABLE t (a UInt8) ENGINE = Memory' | curl 'http://localhost:8123/' --data-binary @- +$ echo 'CREATE TABLE t (a UInt8) ENGINE = Memory' | curl 'http://localhost:8123/' --data-binary @- ``` Using the familiar INSERT query for data insertion: ```bash -echo 'INSERT INTO t VALUES (1),(2),(3)' | curl 'http://localhost:8123/' --data-binary @- +$ echo 'INSERT INTO t VALUES (1),(2),(3)' | curl 'http://localhost:8123/' --data-binary @- ``` Data can be sent separately from the query: ```bash -echo '(4),(5),(6)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20VALUES' --data-binary @- +$ echo '(4),(5),(6)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20VALUES' --data-binary @- ``` You can specify any data format. The 'Values' format is the same as what is used when writing INSERT INTO t VALUES: ```bash -echo '(7),(8),(9)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20Values' --data-binary @- +$ echo '(7),(8),(9)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20Values' --data-binary @- ``` To insert data from a tab-separated dump, specify the corresponding format: ```bash -echo -ne '10\n11\n12\n' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20TabSeparated' --data-binary @- +$ echo -ne '10\n11\n12\n' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20TabSeparated' --data-binary @- ``` Reading the table contents. Data is output in random order due to parallel query processing: @@ -123,7 +123,7 @@ $ curl 'http://localhost:8123/?query=SELECT%20a%20FROM%20t' Deleting the table. ```bash -echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @- +$ echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @- ``` For successful requests that don't return a data table, an empty response body is returned. 
@@ -141,10 +141,10 @@ Examples of sending data with compression: ```bash #Sending data to the server: -curl -vsS "http://localhost:8123/?enable_http_compression=1" -d 'SELECT number FROM system.numbers LIMIT 10' -H 'Accept-Encoding: gzip' +$ curl -vsS "http://localhost:8123/?enable_http_compression=1" -d 'SELECT number FROM system.numbers LIMIT 10' -H 'Accept-Encoding: gzip' #Sending data to the client: -echo "SELECT 1" | gzip -c | curl -sS --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/' +$ echo "SELECT 1" | gzip -c | curl -sS --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/' ``` !!! note "Note" @@ -173,13 +173,13 @@ The username and password can be indicated in one of two ways: 1. Using HTTP Basic Authentication. Example: ```bash -echo 'SELECT 1' | curl 'http://user:password@localhost:8123/' -d @- +$ echo 'SELECT 1' | curl 'http://user:password@localhost:8123/' -d @- ``` 2. In the 'user' and 'password' URL parameters. Example: ```bash -echo 'SELECT 1' | curl 'http://localhost:8123/?user=user&password=password' -d @- +$ echo 'SELECT 1' | curl 'http://localhost:8123/?user=user&password=password' -d @- ``` If the user name is not specified, the `default` name is used. If the password is not specified, the empty password is used. @@ -207,7 +207,7 @@ Similarly, you can use ClickHouse sessions in the HTTP protocol. To do this, you You can receive information about the progress of a query in `X-ClickHouse-Progress` response headers. To do this, enable [send_progress_in_http_headers](../operations/settings/settings.md#settings-send_progress_in_http_headers). Example of the header sequence: -``` +```text X-ClickHouse-Progress: {"read_rows":"2752512","read_bytes":"240570816","total_rows_to_read":"8880128"} X-ClickHouse-Progress: {"read_rows":"5439488","read_bytes":"482285394","total_rows_to_read":"8880128"} X-ClickHouse-Progress: {"read_rows":"8783786","read_bytes":"819092887","total_rows_to_read":"8880128"} @@ -239,7 +239,7 @@ To ensure that the entire response is buffered, set `wait_end_of_query=1`. In th Example: ```bash -curl -sS 'http://localhost:8123/?max_result_bytes=4000000&buffer_size=3000000&wait_end_of_query=1' -d 'SELECT toUInt8(number) FROM system.numbers LIMIT 9000000 FORMAT RowBinary' +$ curl -sS 'http://localhost:8123/?max_result_bytes=4000000&buffer_size=3000000&wait_end_of_query=1' -d 'SELECT toUInt8(number) FROM system.numbers LIMIT 9000000 FORMAT RowBinary' ``` Use buffering to avoid situations where a query processing error occurred after the response code and HTTP headers were sent to the client. In this situation, an error message is written at the end of the response body, and on the client side, the error can only be detected at the parsing stage. @@ -251,7 +251,7 @@ You can create a query with parameters and pass values for them from the corresp ### Example ```bash -curl -sS "
?param_id=2&param_phrase=test" -d "SELECT * FROM table WHERE int_column = {id:UInt8} and string_column = {phrase:String}" +$ curl -sS "
?param_id=2¶m_phrase=test" -d "SELECT * FROM table WHERE int_column = {id:UInt8} and string_column = {phrase:String}" ``` [Original article](https://clickhouse.yandex/docs/en/interfaces/http_interface/) diff --git a/docs/en/operations/configuration_files.md b/docs/en/operations/configuration_files.md index de4bb0a0f7b..ea10eb3bfe2 100644 --- a/docs/en/operations/configuration_files.md +++ b/docs/en/operations/configuration_files.md @@ -20,8 +20,10 @@ Substitutions can also be performed from ZooKeeper. To do this, specify the attr The `config.xml` file can specify a separate config with user settings, profiles, and quotas. The relative path to this config is set in the 'users_config' element. By default, it is `users.xml`. If `users_config` is omitted, the user settings, profiles, and quotas are specified directly in `config.xml`. In addition, `users_config` may have overrides in files from the `users_config.d` directory (for example, `users.d`) and substitutions. For example, you can have separate config file for each user like this: -``` xml +```bash $ cat /etc/clickhouse-server/users.d/alice.xml +``` +```xml diff --git a/docs/en/operations/settings/constraints_on_settings.md b/docs/en/operations/settings/constraints_on_settings.md index 144eab7b0c0..c3bc4f05f22 100644 --- a/docs/en/operations/settings/constraints_on_settings.md +++ b/docs/en/operations/settings/constraints_on_settings.md @@ -3,7 +3,7 @@ The constraints on settings can be defined in the `users` section of the `user.xml` configuration file and prohibit users from changing some of the settings with the `SET` query. The constraints are defined as following: -``` +```xml @@ -30,7 +30,7 @@ There are supported three types of constraints: `min`, `max`, `readonly`. The `m **Example:** Let `users.xml` includes lines: -``` +```xml 10000000000 @@ -51,13 +51,13 @@ There are supported three types of constraints: `min`, `max`, `readonly`. The `m The following queries all throw exceptions: -``` +```sql SET max_memory_usage=20000000001; SET max_memory_usage=4999999999; SET force_index_by_date=1; ``` -``` +```text Code: 452, e.displayText() = DB::Exception: Setting max_memory_usage should not be greater than 20000000000. Code: 452, e.displayText() = DB::Exception: Setting max_memory_usage should not be less than 5000000000. Code: 452, e.displayText() = DB::Exception: Setting force_index_by_date should not be changed. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index d4e433803ae..a2dbf5122fa 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -179,7 +179,8 @@ Insert the [DateTime](../../data_types/datetime.md) type value with the differen ```sql SET input_format_values_interpret_expressions = 0; INSERT INTO datetime_t VALUES (now()) - +``` +```text Exception on client: Code: 27. DB::Exception: Cannot parse input: expected ) before: now()): (at row 1) ``` @@ -187,7 +188,8 @@ Code: 27. DB::Exception: Cannot parse input: expected ) before: now()): (at row ```sql SET input_format_values_interpret_expressions = 1; INSERT INTO datetime_t VALUES (now()) - +``` +```text Ok. ``` @@ -196,7 +198,8 @@ The last query is equivalent to the following: ```sql SET input_format_values_interpret_expressions = 0; INSERT INTO datetime_t SELECT now() - +``` +```text Ok. 
``` @@ -599,7 +602,7 @@ ClickHouse supports the following algorithms of choosing replicas: ### Random (by default) {#load_balancing-random} -``` +```sql load_balancing = random ``` @@ -608,7 +611,7 @@ Disadvantages: Server proximity is not accounted for; if the replicas have diffe ### Nearest Hostname {#load_balancing-nearest_hostname} -``` +```sql load_balancing = nearest_hostname ``` @@ -622,7 +625,7 @@ We can also assume that when sending a query to the same server, in the absence ### In Order {#load_balancing-in_order} -``` +```sql load_balancing = in_order ``` @@ -632,7 +635,7 @@ This method is appropriate when you know exactly which replica is preferable. ### First or Random {#load_balancing-first_or_random} -``` +```sql load_balancing = first_or_random ``` diff --git a/docs/en/operations/settings/settings_users.md b/docs/en/operations/settings/settings_users.md index 7f5dba73306..99d558ff295 100644 --- a/docs/en/operations/settings/settings_users.md +++ b/docs/en/operations/settings/settings_users.md @@ -4,7 +4,7 @@ The `users` section of the `user.xml` configuration file contains user settings. Structure of the `users` section: -``` +```xml @@ -80,7 +80,7 @@ All results of DNS requests are cached until the server restarts. To open access for user from any network, specify: -``` +```xml ::/0 ``` @@ -90,7 +90,7 @@ To open access for user from any network, specify: To open access only from localhost, specify: -``` +```xml ::1 127.0.0.1 ``` @@ -114,7 +114,7 @@ In this section, you can you can limit rows that are returned by ClickHouse for The following configuration forces that user `user1` can only see the rows of `table1` as the result of `SELECT` queries, where the value of the `id` field is 1000. -``` +```xml diff --git a/docs/en/operations/system_tables.md b/docs/en/operations/system_tables.md index d5f38f51421..5eae7ecd544 100644 --- a/docs/en/operations/system_tables.md +++ b/docs/en/operations/system_tables.md @@ -494,7 +494,7 @@ WHERE table = 'visits' FORMAT Vertical ``` -``` +```text Row 1: ────── database: merge @@ -520,7 +520,7 @@ active_replicas: 2 Columns: -``` +```text database: Database name table: Table name engine: Table engine name @@ -573,7 +573,7 @@ If you don't request the last 4 columns (log_max_index, log_pointer, total_repli For example, you can check that everything is working correctly like this: -``` sql +```sql SELECT database, table, @@ -619,13 +619,13 @@ Columns: Example: -``` sql +```sql SELECT * FROM system.settings WHERE changed ``` -``` +```text ┌─name───────────────────┬─value───────┬─changed─┐ │ max_threads │ 8 │ 1 │ │ use_uncompressed_cache │ 0 │ 1 │ @@ -686,14 +686,14 @@ Columns: Example: -``` sql +```sql SELECT * FROM system.zookeeper WHERE path = '/clickhouse/tables/01-08/visits/replicas' FORMAT Vertical ``` -``` +```text Row 1: ────── name: example01-08-1.yandex.ru diff --git a/docs/en/operations/table_engines/aggregatingmergetree.md b/docs/en/operations/table_engines/aggregatingmergetree.md index bab352d5cb5..006614eec1c 100644 --- a/docs/en/operations/table_engines/aggregatingmergetree.md +++ b/docs/en/operations/table_engines/aggregatingmergetree.md @@ -11,7 +11,7 @@ It is appropriate to use `AggregatingMergeTree` if it reduces the number of rows ## Creating a Table -``` sql +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -59,7 +59,7 @@ In the results of `SELECT` query the values of `AggregateFunction` type have imp `AggregatingMergeTree` materialized view that 
watches the `test.visits` table: -``` sql +```sql CREATE MATERIALIZED VIEW test.basic ENGINE = AggregatingMergeTree() PARTITION BY toYYYYMM(StartDate) ORDER BY (CounterID, StartDate) AS SELECT @@ -73,7 +73,7 @@ GROUP BY CounterID, StartDate; Inserting of data into the `test.visits` table. -``` sql +```sql INSERT INTO test.visits ... ``` @@ -81,7 +81,7 @@ The data are inserted in both the table and view `test.basic` that will perform To get the aggregated data, we need to execute a query such as `SELECT ... GROUP BY ...` from the view `test.basic`: -``` sql +```sql SELECT StartDate, sumMerge(Visits) AS Visits, diff --git a/docs/en/operations/table_engines/buffer.md b/docs/en/operations/table_engines/buffer.md index 18024918835..8f4035da19b 100644 --- a/docs/en/operations/table_engines/buffer.md +++ b/docs/en/operations/table_engines/buffer.md @@ -25,7 +25,7 @@ The conditions for flushing the data are calculated separately for each of the ` Example: -``` sql +```sql CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10, 100, 10000, 1000000, 10000000, 100000000) ``` diff --git a/docs/en/operations/table_engines/collapsingmergetree.md b/docs/en/operations/table_engines/collapsingmergetree.md index 67cad64e950..42c1bf91860 100644 --- a/docs/en/operations/table_engines/collapsingmergetree.md +++ b/docs/en/operations/table_engines/collapsingmergetree.md @@ -65,7 +65,7 @@ Use the particular column `Sign`. If `Sign = 1` it means that the row is a state For example, we want to calculate how much pages users checked at some site and how long they were there. At some moment of time we write the following row with the state of user activity: -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┘ @@ -73,7 +73,7 @@ For example, we want to calculate how much pages users checked at some site and At some moment later we register the change of user activity and write it with the following two rows. -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ -1 │ │ 4324182021466249494 │ 6 │ 185 │ 1 │ @@ -86,7 +86,7 @@ The second row contains the current state. As we need only the last state of user activity, the rows -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ │ 4324182021466249494 │ 5 │ 146 │ -1 │ @@ -131,7 +131,7 @@ If you need to extract data without aggregation (for example, to check whether r Example data: -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ │ 4324182021466249494 │ 5 │ 146 │ -1 │ @@ -166,11 +166,11 @@ We use two `INSERT` queries to create two different data parts. 
If we insert the Getting the data: -``` +```sql SELECT * FROM UAct ``` -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │         5 │      146 │   -1 │ │ 4324182021466249494 │         6 │      185 │    1 │ @@ -195,7 +195,7 @@ FROM UAct GROUP BY UserID HAVING sum(Sign) > 0 ``` -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┐ │ 4324182021466249494 │         6 │      185 │ └─────────────────────┴───────────┴──────────┘ @@ -206,7 +206,7 @@ If we do not need aggregation and want to force collapsing, we can use `FINAL` m ```sql SELECT * FROM UAct FINAL ``` -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │         6 │      185 │    1 │ └─────────────────────┴───────────┴──────────┴──────┘ @@ -218,7 +218,7 @@ This way of selecting the data is very inefficient. Don't use it for big tables. Example data: -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │         5 │      146 │    1 │ │ 4324182021466249494 │        -5 │     -146 │   -1 │ @@ -247,28 +247,38 @@ insert into UAct values(4324182021466249494, -5, -146, -1); insert into UAct values(4324182021466249494, 6, 185, 1); select * from UAct final; // avoid using final in production (just for a test or small tables) +``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │         6 │      185 │    1 │ └─────────────────────┴───────────┴──────────┴──────┘ - +``` +```sql SELECT UserID, sum(PageViews) AS PageViews, sum(Duration) AS Duration FROM UAct GROUP BY UserID +``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┐ │ 4324182021466249494 │         6 │      185 │ └─────────────────────┴───────────┴──────────┘ - +``` +```sql select count() FROM UAct +``` +```text ┌─count()─┐ │       3 │ └─────────┘ - +``` +```sql optimize table UAct final; select * FROM UAct +``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │         6 │      185 │    1 │ └─────────────────────┴───────────┴──────────┴──────┘ diff --git a/docs/en/operations/table_engines/custom_partitioning_key.md b/docs/en/operations/table_engines/custom_partitioning_key.md index e53cf903aed..ba9c551ce2a 100644 --- a/docs/en/operations/table_engines/custom_partitioning_key.md +++ b/docs/en/operations/table_engines/custom_partitioning_key.md @@ -6,7 +6,7 @@ A partition is a logical combination of records in a table by a specified criter The partition is specified in the `PARTITION BY expr` clause when [creating a table](mergetree.md#table_engine-mergetree-creating-a-table). The partition key can be any expression from the table columns. For example, to specify partitioning by month, use the expression `toYYYYMM(date_column)`: -``` sql +```sql CREATE TABLE visits ( VisitDate Date, @@ -20,7 +20,7 @@ ORDER BY Hour; The partition key can also be a tuple of expressions (similar to the [primary key](mergetree.md#primary-keys-and-indexes-in-queries)). For example: -``` sql +```sql ENGINE = ReplicatedCollapsingMergeTree('/clickhouse/tables/name', 'replica1', Sign) PARTITION BY (toMonday(StartDate), EventType) ORDER BY (CounterID, StartDate, intHash32(UserID)); @@ -35,7 +35,7 @@ When inserting new data to a table, this data is stored as a separate part (chun Use the [system.parts](../system_tables.md#system_tables-parts) table to view the table parts and partitions. For example, let's assume that we have a `visits` table with partitioning by month. 
Let's perform the `SELECT` query for the `system.parts` table: -``` sql +```sql SELECT partition, name, @@ -44,7 +44,7 @@ FROM system.parts WHERE table = 'visits' ``` -``` +```text ┌─partition─┬─name───────────┬─active─┐ │ 201901 │ 201901_1_3_1 │ 0 │ │ 201901 │ 201901_1_9_2 │ 1 │ @@ -74,11 +74,11 @@ The `active` column shows the status of the part. `1` is active; `0` is inactive As you can see in the example, there are several separated parts of the same partition (for example, `201901_1_3_1` and `201901_1_9_2`). This means that these parts are not merged yet. ClickHouse merges the inserted parts of data periodically, approximately 15 minutes after inserting. In addition, you can perform a non-scheduled merge using the [OPTIMIZE](../../query_language/misc.md#misc_operations-optimize) query. Example: -``` sql +```sql OPTIMIZE TABLE visits PARTITION 201902; ``` -``` +```text ┌─partition─┬─name───────────┬─active─┐ │ 201901 │ 201901_1_3_1 │ 0 │ │ 201901 │ 201901_1_9_2 │ 1 │ @@ -96,7 +96,7 @@ Inactive parts will be deleted approximately 10 minutes after merging. Another way to view a set of parts and partitions is to go into the directory of the table: `/var/lib/clickhouse/data///`. For example: ```bash -dev:/var/lib/clickhouse/data/default/visits$ ls -l +/var/lib/clickhouse/data/default/visits$ ls -l total 40 drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 201901_1_3_1 drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201901_1_9_2 diff --git a/docs/en/operations/table_engines/dictionary.md b/docs/en/operations/table_engines/dictionary.md index 4ed0199297e..045c013f1c7 100644 --- a/docs/en/operations/table_engines/dictionary.md +++ b/docs/en/operations/table_engines/dictionary.md @@ -38,9 +38,7 @@ As an example, consider a dictionary of `products` with the following configurat Query the dictionary data: -``` sql -select name, type, key, attribute.names, attribute.types, bytes_allocated, element_count,source from system.dictionaries where name = 'products'; - +```sql SELECT name, type, @@ -54,7 +52,7 @@ FROM system.dictionaries WHERE name = 'products' ``` -``` +```text ┌─name─────┬─type─┬─key────┬─attribute.names─┬─attribute.types─┬─bytes_allocated─┬─element_count─┬─source──────────┐ │ products │ Flat │ UInt64 │ ['title'] │ ['String'] │ 23065376 │ 175032 │ ODBC: .products │ └──────────┴──────┴────────┴─────────────────┴─────────────────┴─────────────────┴───────────────┴─────────────────┘ @@ -66,45 +64,29 @@ This view isn't helpful when you need to get raw data, or when performing a `JOI Syntax: -``` +```sql CREATE TABLE %table_name% (%fields%) engine = Dictionary(%dictionary_name%)` ``` Usage example: -``` sql +```sql create table products (product_id UInt64, title String) Engine = Dictionary(products); - -CREATE TABLE products -( - product_id UInt64, - title String, -) -ENGINE = Dictionary(products) ``` - ``` -Ok. - -0 rows in set. Elapsed: 0.004 sec. +Ok ``` Take a look at what's in the table. -``` sql +```sql select * from products limit 1; - -SELECT * -FROM products -LIMIT 1 ``` -``` +```text ┌────product_id─┬─title───────────┐ │ 152689 │ Some item │ └───────────────┴─────────────────┘ - -1 rows in set. Elapsed: 0.006 sec. ``` diff --git a/docs/en/operations/table_engines/distributed.md b/docs/en/operations/table_engines/distributed.md index ade20da4eff..8bb0082843b 100644 --- a/docs/en/operations/table_engines/distributed.md +++ b/docs/en/operations/table_engines/distributed.md @@ -6,7 +6,7 @@ Reading is automatically parallelized. 
During a read, the table indexes on remot The Distributed engine accepts parameters: the cluster name in the server's config file, the name of a remote database, the name of a remote table, and (optionally) a sharding key. Example: -``` +```sql Distributed(logs, default, hits[, sharding_key]) ``` diff --git a/docs/en/operations/table_engines/external_data.md b/docs/en/operations/table_engines/external_data.md index 315bcb386a8..c57794df565 100644 --- a/docs/en/operations/table_engines/external_data.md +++ b/docs/en/operations/table_engines/external_data.md @@ -32,9 +32,9 @@ The files specified in 'file' will be parsed by the format specified in 'format' Examples: ```bash -echo -ne "1\n2\n3\n" | clickhouse-client --query="SELECT count() FROM test.visits WHERE TraficSourceID IN _data" --external --file=- --types=Int8 +$ echo -ne "1\n2\n3\n" | clickhouse-client --query="SELECT count() FROM test.visits WHERE TraficSourceID IN _data" --external --file=- --types=Int8 849897 -cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, count() AS c FROM passwd GROUP BY shell ORDER BY c DESC" --external --file=- --name=passwd --structure='login String, unused String, uid UInt16, gid UInt16, comment String, home String, shell String' +$ cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, count() AS c FROM passwd GROUP BY shell ORDER BY c DESC" --external --file=- --name=passwd --structure='login String, unused String, uid UInt16, gid UInt16, comment String, home String, shell String' /bin/sh 20 /bin/false 5 /bin/bash 4 @@ -47,9 +47,9 @@ When using the HTTP interface, external data is passed in the multipart/form-dat Example: ```bash -cat /etc/passwd | sed 's/:/\t/g' > passwd.tsv +$ cat /etc/passwd | sed 's/:/\t/g' > passwd.tsv -curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+count()+AS+c+FROM+passwd+GROUP+BY+shell+ORDER+BY+c+DESC&passwd_structure=login+String,+unused+String,+uid+UInt16,+gid+UInt16,+comment+String,+home+String,+shell+String' +$ curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+count()+AS+c+FROM+passwd+GROUP+BY+shell+ORDER+BY+c+DESC&passwd_structure=login+String,+unused+String,+uid+UInt16,+gid+UInt16,+comment+String,+home+String,+shell+String' /bin/sh 20 /bin/false 5 /bin/bash 4 diff --git a/docs/en/operations/table_engines/file.md b/docs/en/operations/table_engines/file.md index bd7ee3cb90e..71e29bdff6d 100644 --- a/docs/en/operations/table_engines/file.md +++ b/docs/en/operations/table_engines/file.md @@ -11,7 +11,7 @@ Usage examples: ## Usage in ClickHouse Server -``` +```sql File(Format) ``` @@ -33,7 +33,7 @@ You may manually create this subfolder and file in server filesystem and then [A **1.** Set up the `file_engine_table` table: -``` sql +```sql CREATE TABLE file_engine_table (name String, value UInt32) ENGINE=File(TabSeparated) ``` @@ -49,11 +49,11 @@ two 2 **3.** Query the data: -``` sql +```sql SELECT * FROM file_engine_table ``` -``` +```text ┌─name─┬─value─┐ │ one │ 1 │ │ two │ 2 │ diff --git a/docs/en/operations/table_engines/graphitemergetree.md b/docs/en/operations/table_engines/graphitemergetree.md index b85a88f56ce..a8ed8aaaddf 100644 --- a/docs/en/operations/table_engines/graphitemergetree.md +++ b/docs/en/operations/table_engines/graphitemergetree.md @@ -89,7 +89,7 @@ patterns Structure of the `patterns` section: -``` +```text pattern regexp function diff --git a/docs/en/operations/table_engines/hdfs.md b/docs/en/operations/table_engines/hdfs.md index 652ca43b176..5526d3daaaa 
100644 --- a/docs/en/operations/table_engines/hdfs.md +++ b/docs/en/operations/table_engines/hdfs.md @@ -5,7 +5,7 @@ to the [File](file.md) and [URL](url.md) engines, but provides Hadoop-specific f ## Usage -``` +```sql ENGINE = HDFS(URI, format) ``` The `URI` parameter is the whole file URI in HDFS. @@ -18,22 +18,22 @@ The `format` parameter specifies one of the available file formats. To perform **1.** Set up the `hdfs_engine_table` table: -``` sql +```sql CREATE TABLE hdfs_engine_table (name String, value UInt32) ENGINE=HDFS('hdfs://hdfs1:9000/other_storage', 'TSV') ``` **2.** Fill file: -``` sql +```sql INSERT INTO hdfs_engine_table VALUES ('one', 1), ('two', 2), ('three', 3) ``` **3.** Query the data: -``` sql +```sql SELECT * FROM hdfs_engine_table LIMIT 2 ``` -``` +```text ┌─name─┬─value─┐ │ one │ 1 │ │ two │ 2 │ diff --git a/docs/en/operations/table_engines/jdbc.md b/docs/en/operations/table_engines/jdbc.md index 91e93c53232..e2ceb12641d 100644 --- a/docs/en/operations/table_engines/jdbc.md +++ b/docs/en/operations/table_engines/jdbc.md @@ -27,7 +27,7 @@ ENGINE = JDBC(dbms_uri, external_database, external_table) Creating a table in MySQL server by connecting directly with it's console client: -``` +```text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, @@ -50,30 +50,29 @@ mysql> select * from test; Creating a table in ClickHouse server and selecting data from it: -``` +```sql CREATE TABLE jdbc_table ENGINE JDBC('jdbc:mysql://localhost:3306/?user=root&password=root', 'test', 'test') - -Ok. - +``` +```sql DESCRIBE TABLE jdbc_table - +``` +```text ┌─name───────────────┬─type───────────────┬─default_type─┬─default_expression─┐ │ int_id │ Int32 │ │ │ │ int_nullable │ Nullable(Int32) │ │ │ │ float │ Float32 │ │ │ │ float_nullable │ Nullable(Float32) │ │ │ └────────────────────┴────────────────────┴──────────────┴────────────────────┘ - -10 rows in set. Elapsed: 0.031 sec. - +``` +```sql SELECT * FROM jdbc_table - +``` +```text ┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ │ 1 │ ᴺᵁᴸᴸ │ 2 │ ᴺᵁᴸᴸ │ └────────┴──────────────┴───────┴────────────────┘ -1 rows in set. Elapsed: 0.055 sec. ``` ## See Also diff --git a/docs/en/operations/table_engines/join.md b/docs/en/operations/table_engines/join.md index 6a7236e2c5b..4182ac936f8 100644 --- a/docs/en/operations/table_engines/join.md +++ b/docs/en/operations/table_engines/join.md @@ -4,7 +4,7 @@ Prepared data structure for using in [JOIN](../../query_language/select.md#selec ## Creating a Table -``` +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], diff --git a/docs/en/operations/table_engines/kafka.md b/docs/en/operations/table_engines/kafka.md index 90745ebb4cf..16c53d786c6 100644 --- a/docs/en/operations/table_engines/kafka.md +++ b/docs/en/operations/table_engines/kafka.md @@ -11,7 +11,7 @@ Kafka lets you: ## Creating a Table {#table_engine-kafka-creating-a-table} -``` +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -44,7 +44,7 @@ Optional parameters: Examples: -``` sql +```sql CREATE TABLE queue ( timestamp UInt64, level String, @@ -79,7 +79,7 @@ Examples: Do not use this method in new projects. If possible, switch old projects to the method described above. 
-``` +```sql Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format [, kafka_row_delimiter, kafka_schema, kafka_num_consumers, kafka_skip_broken_messages]) ``` @@ -104,7 +104,7 @@ One kafka table can have as many materialized views as you like, they do not rea Example: -``` sql +```sql CREATE TABLE queue ( timestamp UInt64, level String, @@ -128,7 +128,7 @@ To improve performance, received messages are grouped into blocks the size of [m To stop receiving topic data or to change the conversion logic, detach the materialized view: -``` +```sql DETACH TABLE consumer; ATTACH MATERIALIZED VIEW consumer; ``` diff --git a/docs/en/operations/table_engines/mergetree.md b/docs/en/operations/table_engines/mergetree.md index c3d64395a02..2c7dbbd4b23 100644 --- a/docs/en/operations/table_engines/mergetree.md +++ b/docs/en/operations/table_engines/mergetree.md @@ -101,7 +101,7 @@ The `index_granularity` setting can be omitted because 8192 is the default value !!! attention Do not use this method in new projects. If possible, switch old projects to the method described above. -``` +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -119,7 +119,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **Example** -``` +```sql MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID)), 8192) ``` @@ -370,14 +370,14 @@ The `TTL` clause can be set for the whole table and for each individual column. The table must have the column in the [Date](../../data_types/date.md) or [DateTime](../../data_types/datetime.md) data type. To define the lifetime of data, use operations on this time column, for example: -``` +```sql TTL time_column TTL time_column + interval ``` To define `interval`, use [time interval](../../query_language/operators.md#operators-datetime) operators. -``` +```sql TTL date_time + INTERVAL 1 MONTH TTL date_time + INTERVAL 15 HOUR ``` diff --git a/docs/en/operations/table_engines/mysql.md b/docs/en/operations/table_engines/mysql.md index 3b044a6184a..9dac9ba6478 100644 --- a/docs/en/operations/table_engines/mysql.md +++ b/docs/en/operations/table_engines/mysql.md @@ -45,7 +45,7 @@ The rest of the conditions and the `LIMIT` sampling constraint are executed in C Table in MySQL: -``` +```text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, diff --git a/docs/en/operations/table_engines/odbc.md b/docs/en/operations/table_engines/odbc.md index a3e7cda4a39..afcedd4849e 100644 --- a/docs/en/operations/table_engines/odbc.md +++ b/docs/en/operations/table_engines/odbc.md @@ -8,7 +8,7 @@ This engine supports the [Nullable](../../data_types/nullable.md) data type. ## Creating a Table -``` +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1], @@ -41,15 +41,17 @@ Ensure that unixODBC and MySQL Connector are installed. By default (if installed from packages), ClickHouse starts as user `clickhouse`. Thus, you need to create and configure this user in the MySQL server. +```bash +$ sudo mysql ``` -sudo mysql +```sql mysql> CREATE USER 'clickhouse'@'localhost' IDENTIFIED BY 'clickhouse'; mysql> GRANT ALL PRIVILEGES ON *.* TO 'clickhouse'@'clickhouse' WITH GRANT OPTION; ``` Then configure the connection in `/etc/odbc.ini`. 
-``` +```bash $ cat /etc/odbc.ini [mysqlconn] DRIVER = /usr/local/lib/libmyodbc5w.so @@ -62,8 +64,8 @@ PASSWORD = clickhouse You can check the connection using the `isql` utility from the unixODBC installation. -``` -isql -v mysqlconn +```bash +$ isql -v mysqlconn +---------------------------------------+ | Connected! | | | @@ -72,7 +74,7 @@ isql -v mysqlconn Table in MySQL: -``` +```text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, diff --git a/docs/en/operations/table_engines/stripelog.md b/docs/en/operations/table_engines/stripelog.md index 6637055bc30..11ba0ae3ace 100644 --- a/docs/en/operations/table_engines/stripelog.md +++ b/docs/en/operations/table_engines/stripelog.md @@ -6,7 +6,7 @@ Use this engine in scenarios when you need to write many tables with a small amo ## Creating a Table {#table_engines-stripelog-creating-a-table} -``` +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( column1_name [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -60,7 +60,7 @@ ClickHouse uses multiple threads when selecting data. Each thread reads a separa ```sql SELECT * FROM stripe_log_table ``` -``` +```text ┌───────────timestamp─┬─message_type─┬─message────────────────────┐ │ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ │ 2019-01-18 14:34:53 │ WARNING │ The first warning message │ @@ -75,7 +75,7 @@ Sorting the results (ascending order by default): ```sql SELECT * FROM stripe_log_table ORDER BY timestamp ``` -``` +```text ┌───────────timestamp─┬─message_type─┬─message────────────────────┐ │ 2019-01-18 14:23:43 │ REGULAR │ The first regular message │ │ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ diff --git a/docs/en/operations/table_engines/summingmergetree.md b/docs/en/operations/table_engines/summingmergetree.md index 807476861f2..11a7b73bc44 100644 --- a/docs/en/operations/table_engines/summingmergetree.md +++ b/docs/en/operations/table_engines/summingmergetree.md @@ -7,7 +7,7 @@ We recommend to use the engine together with `MergeTree`. Store complete data in ## Creating a Table -``` +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -38,7 +38,7 @@ When creating a `SummingMergeTree` table the same [clauses](mergetree.md) are re !!! attention Do not use this method in new projects and, if possible, switch the old projects to the method described above. -``` +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -68,8 +68,8 @@ ORDER BY key Insert data to it: -``` -:) INSERT INTO summtt Values(1,1),(1,2),(2,1) +```sql +INSERT INTO summtt Values(1,1),(1,2),(2,1) ``` ClickHouse may sum all the rows not completely ([see below](#data-processing)), so we use an aggregate function `sum` and `GROUP BY` clause in the query. 
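Until a background merge happens, the raw parts may still hold several rows with the same key, which is why reading the table directly is not reliable. A small illustrative sketch, reusing the `summtt` table from the example above (the exact rows returned depend on whether a merge has already run):

```sql
-- May return the three inserted rows or the already summed ones,
-- depending on whether a background merge has happened yet.
SELECT * FROM summtt;

-- Force a merge (only for tests or small tables), then read again:
-- rows with equal keys should now be collapsed into one summed row per key.
OPTIMIZE TABLE summtt FINAL;
SELECT * FROM summtt;
```
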
@@ -78,7 +78,7 @@ ClickHouse may sum all the rows not completely ([see below](#data-processing)), SELECT key, sum(value) FROM summtt GROUP BY key ``` -``` +```text ┌─key─┬─sum(value)─┐ │ 2 │ 1 │ │ 1 │ 3 │ @@ -119,7 +119,7 @@ then this nested table is interpreted as a mapping of `key => (values...)`, and Examples: -``` +```text [(1, 100)] + [(2, 150)] -> [(1, 100), (2, 150)] [(1, 100)] + [(1, 150)] -> [(1, 250)] [(1, 100)] + [(1, 150), (2, 150)] -> [(1, 250), (2, 150)] diff --git a/docs/en/operations/table_engines/url.md b/docs/en/operations/table_engines/url.md index dcae1d5d6d1..6521604171c 100644 --- a/docs/en/operations/table_engines/url.md +++ b/docs/en/operations/table_engines/url.md @@ -21,7 +21,7 @@ respectively. For processing `POST` requests, the remote server must support **1.** Create a `url_engine_table` table on the server : -``` sql +```sql CREATE TABLE url_engine_table (word String, value UInt64) ENGINE=URL('http://127.0.0.1:12345/', CSV) ``` @@ -46,16 +46,16 @@ if __name__ == "__main__": ``` ```bash -python3 server.py +$ python3 server.py ``` **3.** Request data: -``` sql +```sql SELECT * FROM url_engine_table ``` -``` +```text ┌─word──┬─value─┐ │ Hello │ 1 │ │ World │ 2 │ diff --git a/docs/en/operations/table_engines/versionedcollapsingmergetree.md b/docs/en/operations/table_engines/versionedcollapsingmergetree.md index 2dd417d85bf..547e6625744 100644 --- a/docs/en/operations/table_engines/versionedcollapsingmergetree.md +++ b/docs/en/operations/table_engines/versionedcollapsingmergetree.md @@ -29,7 +29,7 @@ For a description of query parameters, see the [query description](../../query_l **Engine Parameters** -``` +```sql VersionedCollapsingMergeTree(sign, version) ``` @@ -81,7 +81,7 @@ Use the `Sign` column when writing the row. If `Sign = 1` it means that the row For example, we want to calculate how many pages users visited on some site and how long they were there. At some point in time we write the following row with the state of user activity: -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | └─────────────────────┴───────────┴──────────┴──────┴─────────┘ @@ -89,7 +89,7 @@ For example, we want to calculate how many pages users visited on some site and At some point later we register the change of user activity and write it with the following two rows. -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | │ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 | @@ -102,7 +102,7 @@ The second row contains the current state. Because we need only the last state of user activity, the rows -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | │ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | @@ -139,7 +139,7 @@ If you need to extract the data with "collapsing" but without aggregation (for e Example data: -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | │ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | @@ -175,11 +175,11 @@ We use two `INSERT` queries to create two different data parts. 
If we insert the Getting the data: -``` +```sql SELECT * FROM UAct ``` -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┴─────────┘ @@ -205,7 +205,7 @@ FROM UAct GROUP BY UserID, Version HAVING sum(Sign) > 0 ``` -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Version─┐ │ 4324182021466249494 │ 6 │ 185 │ 2 │ └─────────────────────┴───────────┴──────────┴─────────┘ @@ -216,7 +216,7 @@ If we don't need aggregation and want to force collapsing, we can use the `FINAL ```sql SELECT * FROM UAct FINAL ``` -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 │ └─────────────────────┴───────────┴──────────┴──────┴─────────┘ diff --git a/docs/en/operations/tips.md b/docs/en/operations/tips.md index 6a347af7e22..c3418de4be1 100644 --- a/docs/en/operations/tips.md +++ b/docs/en/operations/tips.md @@ -5,7 +5,7 @@ Always use the `performance` scaling governor. The `on-demand` scaling governor works much worse with constantly high demand. ```bash -echo 'performance' | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor +$ echo 'performance' | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor ``` ## CPU Limitations @@ -20,8 +20,8 @@ For large amounts of data and when processing interactive (online) queries, you Even for data volumes of ~50 TB per server, using 128 GB of RAM significantly improves query performance compared to 64 GB. Do not disable overcommit. The value `cat /proc/sys/vm/overcommit_memory` should be 0 or 1. Run -``` -echo 0 | sudo tee /proc/sys/vm/overcommit_memory +```bash +$ echo 0 | sudo tee /proc/sys/vm/overcommit_memory ``` ## Huge Pages @@ -29,7 +29,7 @@ echo 0 | sudo tee /proc/sys/vm/overcommit_memory Always disable transparent huge pages. It interferes with memory allocators, which leads to significant performance degradation. ```bash -echo 'never' | sudo tee /sys/kernel/mm/transparent_hugepage/enabled +$ echo 'never' | sudo tee /sys/kernel/mm/transparent_hugepage/enabled ``` Use `perf top` to watch the time spent in the kernel for memory management. @@ -54,7 +54,7 @@ If you have more than 4 disks, use RAID-6 (preferred) or RAID-50, instead of RAI When using RAID-5, RAID-6 or RAID-50, always increase stripe_cache_size, since the default value is usually not the best choice. ```bash -echo 4096 | sudo tee /sys/block/md2/md/stripe_cache_size +$ echo 4096 | sudo tee /sys/block/md2/md/stripe_cache_size ``` Calculate the exact number from the number of devices and the block size, using the formula: `2 * num_devices * chunk_size_in_bytes / 4096`. 
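For illustration, assuming a hypothetical array of 6 data devices with a 512 KiB (524288 byte) chunk size, the formula gives `2 * 6 * 524288 / 4096 = 1536`, applied the same way as above:

```bash
# Assumed values for illustration only: 6 devices, 512 KiB chunks.
# 2 * 6 * 524288 / 4096 = 1536
$ echo 1536 | sudo tee /sys/block/md2/md/stripe_cache_size
```
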
@@ -163,7 +163,7 @@ dynamicConfigFile=/etc/zookeeper-{{ cluster['name'] }}/conf/zoo.cfg.dynamic Java version: -``` +```text Java(TM) SE Runtime Environment (build 1.8.0_25-b17) Java HotSpot(TM) 64-Bit Server VM (build 25.25-b02, mixed mode) ``` @@ -211,7 +211,7 @@ JAVA_OPTS="-Xms{{ cluster.get('xms','128M') }} \ Salt init: -``` +```text description "zookeeper-{{ cluster['name'] }} centralized coordination service" start on runlevel [2345] diff --git a/docs/en/operations/troubleshooting.md b/docs/en/operations/troubleshooting.md index fdfff6b26b2..656a12bad2a 100644 --- a/docs/en/operations/troubleshooting.md +++ b/docs/en/operations/troubleshooting.md @@ -26,14 +26,14 @@ Possible issues: Command: -``` -sudo service clickhouse-server status +```bash +$ sudo service clickhouse-server status ``` If the server is not running, start it with the command: -``` -sudo service clickhouse-server start +```bash +$ sudo service clickhouse-server start ``` **Check logs** @@ -47,19 +47,19 @@ If the server started successfully, you should see the strings: If `clickhouse-server` start failed with a configuration error, you should see the `` string with an error description. For example: -``` +```text 2019.01.11 15:23:25.549505 [ 45 ] {} ExternalDictionaries: Failed reloading 'event2id' external dictionary: Poco::Exception. Code: 1000, e.code() = 111, e.displayText() = Connection refused, e.what() = Connection refused ``` If you don't see an error at the end of the file, look through the entire file starting from the string: -``` +```text Application: starting up. ``` If you try to start a second instance of `clickhouse-server` on the server, you see the following log: -``` +```text 2019.01.11 15:25:11.151730 [ 1 ] {} : Starting ClickHouse 19.1.0 with revision 54413 2019.01.11 15:25:11.154578 [ 1 ] {} Application: starting up 2019.01.11 15:25:11.156361 [ 1 ] {} StatusFile: Status file ./status already exists - unclean restart. Contents: @@ -77,14 +77,14 @@ Revision: 54413 If you don't find any useful information in `clickhouse-server` logs or there aren't any logs, you can view `system.d` logs using the command: -``` -sudo journalctl -u clickhouse-server +```bash +$ sudo journalctl -u clickhouse-server ``` **Start clickhouse-server in interactive mode** -``` -sudo -u clickhouse /usr/bin/clickhouse-server --config-file /etc/clickhouse-server/config.xml +```bash +$ sudo -u clickhouse /usr/bin/clickhouse-server --config-file /etc/clickhouse-server/config.xml ``` This command starts the server as an interactive app with standard parameters of the autostart script. In this mode `clickhouse-server` prints all the event messages in the console. diff --git a/docs/en/operations/update.md b/docs/en/operations/update.md index 9a2268a0793..d008a0e3c61 100644 --- a/docs/en/operations/update.md +++ b/docs/en/operations/update.md @@ -2,10 +2,10 @@ If ClickHouse was installed from deb packages, execute the following commands on the server: -``` -sudo apt-get update -sudo apt-get install clickhouse-client clickhouse-server -sudo service clickhouse-server restart +```bash +$ sudo apt-get update +$ sudo apt-get install clickhouse-client clickhouse-server +$ sudo service clickhouse-server restart ``` If you installed ClickHouse using something other than the recommended deb packages, use the appropriate update method. 
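After the restart, one simple way to confirm that the server came back up and is running the expected version is to query it, for example:

```bash
$ sudo service clickhouse-server status
$ clickhouse-client --query "SELECT version()"
```
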
diff --git a/docs/en/operations/utils/clickhouse-copier.md b/docs/en/operations/utils/clickhouse-copier.md index 57358d49f90..08388aab7db 100644 --- a/docs/en/operations/utils/clickhouse-copier.md +++ b/docs/en/operations/utils/clickhouse-copier.md @@ -24,7 +24,7 @@ To reduce network traffic, we recommend running `clickhouse-copier` on the same The utility should be run manually: ```bash -clickhouse-copier copier --daemon --config zookeeper.xml --task-path /task/path --base-dir /path/to/dir +$ clickhouse-copier copier --daemon --config zookeeper.xml --task-path /task/path --base-dir /path/to/dir ``` Parameters: diff --git a/docs/en/operations/utils/clickhouse-local.md b/docs/en/operations/utils/clickhouse-local.md index e91d1eda2d7..618a90585d2 100644 --- a/docs/en/operations/utils/clickhouse-local.md +++ b/docs/en/operations/utils/clickhouse-local.md @@ -17,8 +17,8 @@ By default `clickhouse-local` does not have access to data on the same host, but Basic usage: -``` bash -clickhouse-local --structure "table_structure" --input-format "format_of_incoming_data" -q "query" +```bash +$ clickhouse-local --structure "table_structure" --input-format "format_of_incoming_data" -q "query" ``` Arguments: @@ -40,8 +40,8 @@ Also there are arguments for each ClickHouse configuration variable which are mo ## Examples -``` bash -echo -e "1,2\n3,4" | clickhouse-local -S "a Int64, b Int64" -if "CSV" -q "SELECT * FROM table" +```bash +$ echo -e "1,2\n3,4" | clickhouse-local -S "a Int64, b Int64" -if "CSV" -q "SELECT * FROM table" Read 2 rows, 32.00 B in 0.000 sec., 5182 rows/sec., 80.97 KiB/sec. 1 2 3 4 @@ -49,7 +49,7 @@ Read 2 rows, 32.00 B in 0.000 sec., 5182 rows/sec., 80.97 KiB/sec. Previous example is the same as: -``` bash +```bash $ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table" Read 2 rows, 32.00 B in 0.000 sec., 4987 rows/sec., 77.93 KiB/sec. 1 2 @@ -58,8 +58,10 @@ Read 2 rows, 32.00 B in 0.000 sec., 4987 rows/sec., 77.93 KiB/sec. Now let's output memory user for each Unix user: -``` bash +```bash $ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' | clickhouse-local -S "user String, mem Float64" -q "SELECT user, round(sum(mem), 2) as memTotal FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty" +``` +```text Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec. ┏━━━━━━━━━━┳━━━━━━━━━━┓ ┃ user ┃ memTotal ┃ diff --git a/docs/en/query_language/agg_functions/combinators.md b/docs/en/query_language/agg_functions/combinators.md index d2cf85c3d75..2f4662ba21e 100644 --- a/docs/en/query_language/agg_functions/combinators.md +++ b/docs/en/query_language/agg_functions/combinators.md @@ -48,7 +48,7 @@ Converts an aggregate function for tables into an aggregate function for arrays Allows to divide data by groups, and then separately aggregates the data in those groups. Groups are created by splitting the values of one of the columns into intervals. -``` +```sql Resample(start, end, step)(, resampling_key) ``` diff --git a/docs/en/query_language/agg_functions/index.md b/docs/en/query_language/agg_functions/index.md index 93da97357d5..42b9bf2a511 100644 --- a/docs/en/query_language/agg_functions/index.md +++ b/docs/en/query_language/agg_functions/index.md @@ -15,7 +15,7 @@ During aggregation, all `NULL`s are skipped. 
Consider this table: -``` +```text ┌─x─┬────y─┐ │ 1 │ 2 │ │ 2 │ ᴺᵁᴸᴸ │ @@ -27,34 +27,27 @@ Consider this table: Let's say you need to total the values in the `y` column: +```sql +SELECT sum(y) FROM t_null_big +``` ``` -:) SELECT sum(y) FROM t_null_big - -SELECT sum(y) -FROM t_null_big - ┌─sum(y)─┐ │ 7 │ └────────┘ - -1 rows in set. Elapsed: 0.002 sec. ``` The `sum` function interprets `NULL` as `0`. In particular, this means that if the function receives input of a selection where all the values are `NULL`, then the result will be `0`, not `NULL`. Now you can use the `groupArray` function to create an array from the `y` column: +```sql +SELECT groupArray(y) FROM t_null_big ``` -:) SELECT groupArray(y) FROM t_null_big - -SELECT groupArray(y) -FROM t_null_big - +```text ┌─groupArray(y)─┐ │ [2,2,3] │ └───────────────┘ -1 rows in set. Elapsed: 0.002 sec. ``` `groupArray` does not include `NULL` in the resulting array. diff --git a/docs/en/query_language/agg_functions/parametric_functions.md b/docs/en/query_language/agg_functions/parametric_functions.md index 3ea26de5937..edf96df987f 100644 --- a/docs/en/query_language/agg_functions/parametric_functions.md +++ b/docs/en/query_language/agg_functions/parametric_functions.md @@ -6,7 +6,7 @@ Some aggregate functions can accept not only argument columns (used for compress Calculates an adaptive histogram. It doesn't guarantee precise results. -``` +```sql histogram(number_of_bins)(values) ``` @@ -90,7 +90,7 @@ Example: `sequenceMatch ('(?1).*(?2)')(EventTime, URL LIKE '%company%', URL LIKE This is a singular example. You could write it using other aggregate functions: -``` +```sql minIf(EventTime, URL LIKE '%company%') < maxIf(EventTime, URL LIKE '%cart%'). ``` @@ -153,7 +153,7 @@ Set the following chain of events: To find out how far the user `user_id` could get through the chain in an hour in January of 2017, make the query: -``` +```sql SELECT level, count() AS c @@ -184,7 +184,7 @@ Consider you are doing a website analytics, intend to calculate the retention of This could be easily calculate by `retention` -``` +```sql SELECT sum(r[1]) AS r1, sum(r[2]) AS r2, @@ -218,7 +218,7 @@ It works as fast as possible, except for cases when a large N value is used and Usage example: -``` +```text Problem: Generate a report that shows only keywords that produced at least 5 unique users. Solution: Write in the GROUP BY query SearchPhrase HAVING uniqUpTo(4)(UserID) >= 5 ``` diff --git a/docs/en/query_language/agg_functions/reference.md b/docs/en/query_language/agg_functions/reference.md index a43e84112b0..9dd5a1586db 100644 --- a/docs/en/query_language/agg_functions/reference.md +++ b/docs/en/query_language/agg_functions/reference.md @@ -79,7 +79,7 @@ When a `SELECT` query has the `GROUP BY` clause or at least one aggregate functi Selects a frequently occurring value using the [heavy hitters](http://www.cs.umd.edu/~samir/498/karp.pdf) algorithm. If there is a value that occurs more than in half the cases in each of the query's execution threads, this value is returned. Normally, the result is nondeterministic. -``` +```sql anyHeavy(column) ``` @@ -91,12 +91,12 @@ anyHeavy(column) Take the [OnTime](../../getting_started/example_datasets/ontime.md) data set and select any frequently occurring value in the `AirlineID` column. -``` sql +```sql SELECT anyHeavy(AirlineID) AS res FROM ontime ``` -``` +```text ┌───res─┐ │ 19690 │ └───────┘ @@ -111,7 +111,7 @@ The result is just as indeterminate as for the `any` function. Applies bitwise `AND` for series of numbers. 
-``` +```sql groupBitAnd(expr) ``` @@ -127,7 +127,7 @@ Value of the `UInt*` type. Test data: -``` +```text binary decimal 00101100 = 44 00011100 = 28 @@ -137,7 +137,7 @@ binary decimal Query: -``` +```sql SELECT groupBitAnd(num) FROM t ``` @@ -145,7 +145,7 @@ Where `num` is the column with the test data. Result: -``` +```text binary decimal 00000100 = 4 ``` @@ -154,7 +154,7 @@ binary decimal Applies bitwise `OR` for series of numbers. -``` +```sql groupBitOr(expr) ``` @@ -170,7 +170,7 @@ Value of the `UInt*` type. Test data: -``` +```text binary decimal 00101100 = 44 00011100 = 28 @@ -180,7 +180,7 @@ binary decimal Query: -``` +```sql SELECT groupBitOr(num) FROM t ``` @@ -188,7 +188,7 @@ Where `num` is the column with the test data. Result: -``` +```text binary decimal 01111101 = 125 ``` @@ -197,7 +197,7 @@ binary decimal Applies bitwise `XOR` for series of numbers. -``` +```sql groupBitXor(expr) ``` @@ -213,7 +213,7 @@ Value of the `UInt*` type. Test data: -``` +```text binary decimal 00101100 = 44 00011100 = 28 @@ -223,7 +223,7 @@ binary decimal Query: -``` +```sql SELECT groupBitXor(num) FROM t ``` @@ -231,7 +231,7 @@ Where `num` is the column with the test data. Result: -``` +```text binary decimal 01101000 = 104 ``` @@ -241,7 +241,7 @@ binary decimal Bitmap or Aggregate calculations from a unsigned integer column, return cardinality of type UInt64, if add suffix -State, then return [bitmap object](../functions/bitmap_functions.md). -``` +```sql groupBitmap(expr) ``` @@ -257,7 +257,7 @@ Value of the `UInt64` type. Test data: -``` +```text UserID 1 1 @@ -267,13 +267,13 @@ UserID Query: -``` +```sql SELECT groupBitmap(UserID) as num FROM t ``` Result: -``` +```text num 3 ``` @@ -291,15 +291,17 @@ Calculates the maximum. Calculates the 'arg' value for a minimal 'val' value. If there are several different values of 'arg' for minimal values of 'val', the first of these values encountered is output. **Example:** -``` +```text ┌─user─────┬─salary─┐ │ director │ 5000 │ │ manager │ 3000 │ │ worker │ 1000 │ └──────────┴────────┘ - +``` +```sql SELECT argMin(user, salary) FROM salary - +``` +```text ┌─argMin(user, salary)─┐ │ worker │ └──────────────────────┘ @@ -330,7 +332,7 @@ Returns a tuple of two arrays: keys in sorted order, and values ​​summed for Example: -``` sql +```sql CREATE TABLE sum_map( date Date, timeslot DateTime, @@ -351,7 +353,7 @@ FROM sum_map GROUP BY timeslot ``` -``` +```text ┌────────────timeslot─┬─sumMap(statusMap.status, statusMap.requests)─┐ │ 2000-01-01 00:00:00 │ ([1,2,3,4,5],[10,10,20,10,10]) │ │ 2000-01-01 00:01:00 │ ([4,5,6,7,8],[10,10,20,10,10]) │ @@ -362,7 +364,7 @@ GROUP BY timeslot Computes the [skewness](https://en.wikipedia.org/wiki/Skewness) of a sequence. -``` +```sql skewPop(expr) ``` @@ -386,7 +388,7 @@ Computes the [sample skewness](https://en.wikipedia.org/wiki/Skewness) of a sequ It represents an unbiased estimate of the skewness of a random variable if passed values form its sample. -``` +```sql skewSamp(expr) ``` @@ -408,7 +410,7 @@ SELECT skewSamp(value) FROM series_with_value_column Computes the [kurtosis](https://en.wikipedia.org/wiki/Kurtosis) of a sequence. -``` +```sql kurtPop(expr) ``` @@ -432,7 +434,7 @@ Computes the [sample kurtosis](https://en.wikipedia.org/wiki/Kurtosis) of a sequ It represents an unbiased estimate of the kurtosis of a random variable if passed values form its sample. -``` +```sql kurtSamp(expr) ``` @@ -463,7 +465,7 @@ The function returns array of tuples with `(timestamp, aggregated_value)` pairs. 
Before using this function make sure `timestamp` is in ascending order. Example: -``` +```text ┌─uid─┬─timestamp─┬─value─┐ │ 1 │ 2 │ 0.2 │ │ 1 │ 7 │ 0.7 │ @@ -477,7 +479,7 @@ Example: │ 2 │ 24 │ 4.8 │ └─────┴───────────┴───────┘ ``` -``` +```sql CREATE TABLE time_series( uid UInt64, timestamp Int64, @@ -493,7 +495,7 @@ FROM ( ); ``` And the result will be: -``` +```text [(2,0.2),(3,0.9),(7,2.1),(8,2.4),(12,3.6),(17,5.1),(18,5.4),(24,7.2),(25,2.5)] ``` @@ -502,7 +504,7 @@ Similarly timeSeriesGroupRateSum, timeSeriesGroupRateSum will Calculate the rate Also, timestamp should be in ascend order before use this function. Use this function, the result above case will be: -``` +```text [(2,0),(3,0.1),(7,0.3),(8,0.3),(12,0.3),(17,0.3),(18,0.3),(24,0.3),(25,0.1)] ``` @@ -516,7 +518,7 @@ The result is always Float64. Calculates the approximate number of different values of the argument. -``` +```sql uniq(x[, ...]) ``` @@ -551,7 +553,7 @@ We recommend using this function in almost all scenarios. Calculates the approximate number of different argument values. -``` +```sql uniqCombined(HLL_precision)(x[, ...]) ``` @@ -595,7 +597,7 @@ Compared to the [uniq](#agg_function-uniq) function, the `uniqCombined`: Calculates the approximate number of different argument values, using the [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) algorithm. -``` +```sql uniqHLL12(x[, ...]) ``` @@ -631,7 +633,7 @@ We don't recommend using this function. In most cases, use the [uniq](#agg_funct Calculates the exact number of different argument values. -``` +```sql uniqExact(x[, ...]) ``` @@ -676,7 +678,7 @@ Optional parameters: Calculates the moving sum of input values. -``` +```sql groupArrayMovingSum(numbers_for_summing) groupArrayMovingSum(window_size)(numbers_for_summing) ``` @@ -745,7 +747,7 @@ FROM t Calculates the moving average of input values. -``` +```sql groupArrayMovingAvg(numbers_for_summing) groupArrayMovingAvg(window_size)(numbers_for_summing) ``` @@ -850,7 +852,7 @@ Don't use this function for calculating timings. There is a more suitable functi Computes the quantile of the specified level with determined precision. The function intended for calculating quantiles of page loading time in milliseconds. -``` +```sql quantileTiming(level)(expr) ``` @@ -955,7 +957,7 @@ Returns an array of the most frequent values in the specified column. The result Implements the [ Filtered Space-Saving](http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf) algorithm for analyzing TopK, based on the reduce-and-combine algorithm from [Parallel Space Saving](https://arxiv.org/pdf/1401.0702.pdf). -``` +```sql topK(N)(column) ``` @@ -972,12 +974,12 @@ We recommend using the `N < 10 ` value; performance is reduced with large `N` va Take the [OnTime](../../getting_started/example_datasets/ontime.md) data set and select the three most frequently occurring values in the `AirlineID` column. -``` sql +```sql SELECT topK(3)(AirlineID) AS res FROM ontime ``` -``` +```text ┌─res─────────────────┐ │ [19393,19790,19805] │ └─────────────────────┘ @@ -1001,7 +1003,7 @@ Calculates the Pearson correlation coefficient: `Σ((x - x̅)(y - y̅)) / sqrt( Performs simple (unidimensional) linear regression. 
-``` +```sql simpleLinearRegression(x, y) ``` diff --git a/docs/en/query_language/alter.md b/docs/en/query_language/alter.md index 84ff4f390a8..53461d5edcd 100644 --- a/docs/en/query_language/alter.md +++ b/docs/en/query_language/alter.md @@ -6,7 +6,7 @@ The `ALTER` query is only supported for `*MergeTree` tables, as well as `Merge`a Changing the table structure. -``` sql +```sql ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|CLEAR|COMMENT|MODIFY COLUMN ... ``` @@ -25,7 +25,7 @@ These actions are described in detail below. #### ADD COLUMN {#alter_add-column} -``` sql +```sql ADD COLUMN [IF NOT EXISTS] name [type] [default_expr] [AFTER name_after] ``` @@ -39,13 +39,13 @@ This approach allows us to complete the `ALTER` query instantly, without increas Example: -``` sql +```sql ALTER TABLE visits ADD COLUMN browser String AFTER user_id ``` #### DROP COLUMN {#alter_drop-column} -``` sql +```sql DROP COLUMN [IF EXISTS] name ``` @@ -55,13 +55,13 @@ Deletes data from the file system. Since this deletes entire files, the query is Example: -``` sql +```sql ALTER TABLE visits DROP COLUMN browser ``` #### CLEAR COLUMN {#alter_clear-column} -``` sql +```sql CLEAR COLUMN [IF EXISTS] name IN PARTITION partition_name ``` @@ -71,13 +71,13 @@ If the `IF EXISTS` clause is specified, the query won't return an error if the c Example: -``` sql +```sql ALTER TABLE visits CLEAR COLUMN browser IN PARTITION tuple() ``` #### COMMENT COLUMN {#alter_comment-column} -``` sql +```sql COMMENT COLUMN [IF EXISTS] name 'comment' ``` @@ -89,13 +89,13 @@ Comments are stored in the `comment_expression` column returned by the [DESCRIBE Example: -``` sql +```sql ALTER TABLE visits COMMENT COLUMN browser 'The table shows the browser used for accessing the site.' ``` #### MODIFY COLUMN {#alter_modify-column} -``` sql +```sql MODIFY COLUMN [IF EXISTS] name [type] [default_expr] ``` @@ -105,7 +105,7 @@ When changing the type, values are converted as if the [toType](functions/type_c Example: -``` sql +```sql ALTER TABLE visits MODIFY COLUMN browser Array(String) ``` @@ -139,7 +139,7 @@ For tables that don't store data themselves (such as `Merge` and `Distributed`), The following command is supported: -``` sql +```sql MODIFY ORDER BY new_expression ``` @@ -171,7 +171,7 @@ Also, they are replicated (syncing indices metadata through ZooKeeper). See more on [constraints](create.md#constraints) Constraints could be added or deleted using following syntax: -``` +```sql ALTER TABLE [db].name ADD CONSTRAINT constraint_name CHECK expression; ALTER TABLE [db].name DROP CONSTRAINT constraint_name; ``` @@ -197,7 +197,7 @@ The following operations with [partitions](../operations/table_engines/custom_pa #### DETACH PARTITION {#alter_detach-partition} -``` sql +```sql ALTER TABLE table_name DETACH PARTITION partition_expr ``` @@ -205,7 +205,7 @@ Moves all data for the specified partition to the `detached` directory. The serv Example: -``` sql +```sql ALTER TABLE visits DETACH PARTITION 201901 ``` @@ -217,7 +217,7 @@ This query is replicated – it moves the data to the `detached` directory on al #### DROP PARTITION {#alter_drop-partition} -``` sql +```sql ALTER TABLE table_name DROP PARTITION partition_expr ``` @@ -245,7 +245,7 @@ ALTER TABLE table_name ATTACH PARTITION|PART partition_expr Adds data to the table from the `detached` directory. It is possible to add data for an entire partition or for a separate part. 
Examples: -``` sql +```sql ALTER TABLE visits ATTACH PARTITION 201901; ALTER TABLE visits ATTACH PART 201901_2_2_0; ``` @@ -258,7 +258,7 @@ So you can put data to the `detached` directory on one replica, and use the `ALT #### REPLACE PARTITION {#alter_replace-partition} -``` sql +```sql ALTER TABLE table2 REPLACE PARTITION partition_expr FROM table1 ``` @@ -271,7 +271,7 @@ For the query to run successfully, the following conditions must be met: #### CLEAR COLUMN IN PARTITION {#alter_clear-column-partition} -``` sql +```sql ALTER TABLE table_name CLEAR COLUMN column_name IN PARTITION partition_expr ``` @@ -279,13 +279,13 @@ Resets all values in the specified column in a partition. If the `DEFAULT` claus Example: -``` sql +```sql ALTER TABLE visits CLEAR COLUMN hour in PARTITION 201902 ``` #### FREEZE PARTITION {#alter_freeze-partition} -``` sql +```sql ALTER TABLE table_name FREEZE [PARTITION partition_expr] ``` @@ -321,7 +321,7 @@ For more information about backups and restoring data, see the [Data Backup](../ #### CLEAR INDEX IN PARTITION {#alter_clear-index-partition} -``` sql +```sql ALTER TABLE table_name CLEAR INDEX index_name IN PARTITION partition_expr ``` @@ -329,7 +329,7 @@ The query works similar to `CLEAR COLUMN`, but it resets an index instead of a c #### FETCH PARTITION {#alter_fetch-partition} -``` sql +```sql ALTER TABLE table_name FETCH PARTITION partition_expr FROM 'path-in-zookeeper' ``` @@ -342,7 +342,7 @@ The query does the following: For example: -``` sql +```sql ALTER TABLE users FETCH PARTITION 201902 FROM '/clickhouse/tables/01-01/visits'; ALTER TABLE users ATTACH PARTITION 201902; ``` @@ -370,7 +370,7 @@ For old-style tables, you can specify the partition either as a number `201901` All the rules above are also true for the [OPTIMIZE](misc.md#misc_operations-optimize) query. If you need to specify the only partition when optimizing a non-partitioned table, set the expression `PARTITION tuple()`. For example: -``` sql +```sql OPTIMIZE TABLE table_not_partitioned PARTITION tuple() FINAL; ``` @@ -393,19 +393,19 @@ Existing tables are ready for mutations as-is (no conversion necessary), but aft Currently available commands: -``` sql +```sql ALTER TABLE [db.]table DELETE WHERE filter_expr ``` The `filter_expr` must be of type UInt8. The query deletes rows in the table for which this expression takes a non-zero value. -``` sql +```sql ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] WHERE filter_expr ``` The command is available starting with the 18.12.14 version. The `filter_expr` must be of type UInt8. This query updates values of specified columns to the values of corresponding expressions in rows for which the `filter_expr` takes a non-zero value. Values are casted to the column type using the `CAST` operator. Updating columns that are used in the calculation of the primary or the partition key is not supported. -``` sql +```sql ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name ``` diff --git a/docs/en/query_language/create.md b/docs/en/query_language/create.md index 20790ffb834..773eee88d55 100644 --- a/docs/en/query_language/create.md +++ b/docs/en/query_language/create.md @@ -4,7 +4,7 @@ Creates database. -``` sql +```sql CREATE DATABASE [IF NOT EXISTS] db_name [ON CLUSTER cluster] [ENGINE = engine(...)] ``` @@ -48,19 +48,19 @@ The structure of the table is a list of column descriptions. If indexes are supp A column description is `name type` in the simplest case. Example: `RegionID UInt32`. 
Expressions can also be defined for default values (see below). -``` sql +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name AS [db2.]name2 [ENGINE = engine] ``` Creates a table with the same structure as another table. You can specify a different engine for the table. If the engine is not specified, the same engine will be used as for the `db2.name2` table. -``` sql +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name AS table_fucntion() ``` Creates a table with the structure and data returned by a [table function](table_functions/index.md). -``` sql +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = engine AS SELECT ... ``` @@ -221,7 +221,7 @@ In most cases, temporary tables are not created manually, but when using externa The `CREATE`, `DROP`, `ALTER`, and `RENAME` queries support distributed execution on a cluster. For example, the following query creates the `all_hits` `Distributed` table on each host in `cluster`: -``` sql +```sql CREATE TABLE IF NOT EXISTS all_hits ON CLUSTER cluster (p Date, i Int32) ENGINE = Distributed(cluster, default, hits) ``` @@ -231,7 +231,7 @@ The local version of the query will eventually be implemented on each host in th ## CREATE VIEW -``` sql +```sql CREATE [MATERIALIZED] VIEW [IF NOT EXISTS] [db.]table_name [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ... ``` @@ -241,19 +241,19 @@ Normal views don't store any data, but just perform a read from another table. I As an example, assume you've created a view: -``` sql +```sql CREATE VIEW view AS SELECT ... ``` and written a query: -``` sql +```sql SELECT a, b, c FROM view ``` This query is fully equivalent to using the subquery: -``` sql +```sql SELECT a, b, c FROM (SELECT ...) ``` diff --git a/docs/en/query_language/dicts/external_dicts_dict_layout.md b/docs/en/query_language/dicts/external_dicts_dict_layout.md index a9a80dbe761..15bb4850cb9 100644 --- a/docs/en/query_language/dicts/external_dicts_dict_layout.md +++ b/docs/en/query_language/dicts/external_dicts_dict_layout.md @@ -112,7 +112,7 @@ This storage method works the same way as hashed and allows using date/time (arb Example: The table contains discounts for each advertiser in the format: -``` +```text +---------------+---------------------+-------------------+--------+ | advertiser id | discount start date | discount end date | amount | +===============+=====================+===================+========+ @@ -146,7 +146,7 @@ Example: To work with these dictionaries, you need to pass an additional argument to the `dictGetT` function, for which a range is selected: -``` +```sql dictGetT('dict_name', 'attr_name', id, date) ``` @@ -240,7 +240,7 @@ This type of storage is for mapping network prefixes (IP addresses) to metadata Example: The table contains network prefixes and their corresponding AS number and country code: -``` +```text +-----------------+-------+--------+ | prefix | asn | cca2 | +=================+=======+========+ @@ -283,13 +283,13 @@ The key must have only one String type attribute that contains an allowed IP pre For queries, you must use the same functions (`dictGetT` with a tuple) as for dictionaries with composite keys: -``` +```sql dictGetT('dict_name', 'attr_name', tuple(ip)) ``` The function takes either `UInt32` for IPv4, or `FixedString(16)` for IPv6: -``` +```sql dictGetString('prefix', 'asn', tuple(IPv6StringToNum('2001:db8::1'))) ``` diff --git a/docs/en/query_language/dicts/external_dicts_dict_sources.md b/docs/en/query_language/dicts/external_dicts_dict_sources.md index d670c24c5fb..493b75a9cbb 100644 
--- a/docs/en/query_language/dicts/external_dicts_dict_sources.md +++ b/docs/en/query_language/dicts/external_dicts_dict_sources.md @@ -131,7 +131,7 @@ If you have a problems with encodings when using Oracle, see the corresponding [ Let's configure unixODBC for PostgreSQL. Content of `/etc/odbc.ini`: -``` +```text [gregtest] Driver = /usr/lib/psqlodbca.so Servername = localhost @@ -144,7 +144,7 @@ PASSWORD = test If you then make a query such as -``` +```sql SELECT * FROM odbc('DSN=gregtest;Servername=some-server.com', 'test_db'); ``` @@ -155,12 +155,13 @@ ODBC driver will send values of `USERNAME` and `PASSWORD` from `odbc.ini` to `so Ubuntu OS. Installing unixODBC and the ODBC driver for PostgreSQL: - - sudo apt-get install -y unixodbc odbcinst odbc-postgresql +```bash +$ sudo apt-get install -y unixodbc odbcinst odbc-postgresql +``` Configuring `/etc/odbc.ini` (or `~/.odbc.ini`): -``` +```text [DEFAULT] Driver = myconnection @@ -222,13 +223,13 @@ Ubuntu OS. Installing the driver: : -``` - sudo apt-get install tdsodbc freetds-bin sqsh +```bash +$ sudo apt-get install tdsodbc freetds-bin sqsh ``` -Configuring the driver: : +Configuring the driver: -``` +```bash $ cat /etc/freetds/freetds.conf ... diff --git a/docs/en/query_language/functions/arithmetic_functions.md b/docs/en/query_language/functions/arithmetic_functions.md index 0f4795ec0b9..fb0c3939a2a 100644 --- a/docs/en/query_language/functions/arithmetic_functions.md +++ b/docs/en/query_language/functions/arithmetic_functions.md @@ -4,11 +4,11 @@ For all arithmetic functions, the result type is calculated as the smallest numb Example: -``` sql +```sql SELECT toTypeName(0), toTypeName(0 + 0), toTypeName(0 + 0 + 0), toTypeName(0 + 0 + 0 + 0) ``` -``` +```text ┌─toTypeName(0)─┬─toTypeName(plus(0, 0))─┬─toTypeName(plus(plus(0, 0), 0))─┬─toTypeName(plus(plus(plus(0, 0), 0), 0))─┐ │ UInt8 │ UInt16 │ UInt32 │ UInt64 │ └───────────────┴────────────────────────┴─────────────────────────────────┴──────────────────────────────────────────┘ diff --git a/docs/en/query_language/functions/array_functions.md b/docs/en/query_language/functions/array_functions.md index 5065d428994..bea2c0a6ec6 100644 --- a/docs/en/query_language/functions/array_functions.md +++ b/docs/en/query_language/functions/array_functions.md @@ -49,7 +49,7 @@ Returns an 'Array(T)' type result, where 'T' is the smallest common type out of Combines arrays passed as arguments. -``` +```sql arrayConcat(arrays) ``` @@ -82,9 +82,10 @@ Returns 0 if the the element is not in the array, or 1 if it is. `NULL` is processed as a value. -``` +```sql SELECT has([1, 2, NULL], NULL) - +``` +```text ┌─has([1, 2, NULL], NULL)─┐ │ 1 │ └─────────────────────────┘ @@ -94,7 +95,7 @@ SELECT has([1, 2, NULL], NULL) Checks whether one array is a subset of another. -``` +```sql hasAll(set, subset) ``` @@ -132,7 +133,7 @@ hasAll(set, subset) Checks whether two arrays have intersection by some elements. -``` +```sql hasAny(array1, array2) ``` @@ -169,10 +170,10 @@ Returns the index of the first 'x' element (starting from 1) if it is in the arr Example: -``` -:) SELECT indexOf([1,3,NULL,NULL],NULL) - +```sql SELECT indexOf([1, 3, NULL, NULL], NULL) +``` +```text ┌─indexOf([1, 3, NULL, NULL], NULL)─┐ │ 3 │ @@ -189,9 +190,10 @@ Returns the number of elements in the array equal to x. 
Equivalent to arrayCount Example: -``` +```sql SELECT countEqual([1, 2, NULL, NULL], NULL) - +``` +```text ┌─countEqual([1, 2, NULL, NULL], NULL)─┐ │ 2 │ └──────────────────────────────────────┘ @@ -293,7 +295,7 @@ This is necessary when using ARRAY JOIN with a nested data structure and further Removes the last item from the array. -``` +```sql arrayPopBack(array) ``` @@ -316,7 +318,7 @@ SELECT arrayPopBack([1, 2, 3]) AS res Removes the first item from the array. -``` +```sql arrayPopFront(array) ``` @@ -339,7 +341,7 @@ SELECT arrayPopFront([1, 2, 3]) AS res Adds one item to the end of the array. -``` +```sql arrayPushBack(array, single_value) ``` @@ -363,7 +365,7 @@ SELECT arrayPushBack(['a'], 'b') AS res Adds one element to the beginning of the array. -``` +```sql arrayPushFront(array, single_value) ``` @@ -387,7 +389,7 @@ SELECT arrayPushBack(['b'], 'a') AS res Changes the length of the array. -``` +```sql arrayResize(array, size[, extender]) ``` @@ -405,17 +407,19 @@ An array of length `size`. **Examples of calls** -``` +```sql SELECT arrayResize([1], 3) - +``` +```text ┌─arrayResize([1], 3)─┐ │ [1,0,0] │ └─────────────────────┘ ``` -``` +```sql SELECT arrayResize([1], 3, NULL) - +``` +```text ┌─arrayResize([1], 3, NULL)─┐ │ [1,NULL,NULL] │ └───────────────────────────┘ @@ -425,7 +429,7 @@ SELECT arrayResize([1], 3, NULL) Returns a slice of the array. -``` +```sql arraySlice(array, offset[, length]) ``` @@ -653,7 +657,7 @@ Takes an array, returns an array with the difference between all pairs of neighb SELECT arrayDifference([1, 2, 3, 4]) ``` -``` +```text ┌─arrayDifference([1, 2, 3, 4])─┐ │ [0,1,1,1] │ └───────────────────────────────┘ @@ -667,7 +671,7 @@ Takes an array, returns an array containing the different elements in all the ar SELECT arrayDistinct([1, 2, 2, 3, 1]) ``` -``` +```text ┌─arrayDistinct([1, 2, 2, 3, 1])─┐ │ [1,2,3] │ └────────────────────────────────┘ @@ -687,7 +691,7 @@ SELECT arrayIntersect([1, 2], [1, 3], [1, 4]) AS intersect ``` -``` +```text ┌─no_intersect─┬─intersect─┐ │ [] │ [1] │ └──────────────┴───────────┘ diff --git a/docs/en/query_language/functions/array_join.md b/docs/en/query_language/functions/array_join.md index ede5b5e9d41..25dfc626581 100644 --- a/docs/en/query_language/functions/array_join.md +++ b/docs/en/query_language/functions/array_join.md @@ -19,7 +19,7 @@ Example: SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src ``` -``` +```text ┌─dst─┬─\'Hello\'─┬─src─────┐ │ 1 │ Hello │ [1,2,3] │ │ 2 │ Hello │ [1,2,3] │ diff --git a/docs/en/query_language/functions/bitmap_functions.md b/docs/en/query_language/functions/bitmap_functions.md index 20cfefa4f1e..f50097b08cb 100644 --- a/docs/en/query_language/functions/bitmap_functions.md +++ b/docs/en/query_language/functions/bitmap_functions.md @@ -13,7 +13,7 @@ For more information on RoaringBitmap, see: [CRoaring](https://github.com/Roarin Build a bitmap from unsigned integer array. -``` +```sql bitmapBuild(array) ``` @@ -36,7 +36,7 @@ SELECT bitmapBuild([1, 2, 3, 4, 5]) AS res, toTypeName(res) Convert bitmap to integer array. -``` +```sql bitmapToArray(bitmap) ``` @@ -50,7 +50,7 @@ bitmapToArray(bitmap) SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res ``` -``` +```text ┌─res─────────┐ │ [1,2,3,4,5] │ └─────────────┘ @@ -60,7 +60,7 @@ SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res Return subset in specified range (not include the range_end). 
-``` +```sql bitmapSubsetInRange(bitmap, range_start, range_end) ``` @@ -72,11 +72,11 @@ bitmapSubsetInRange(bitmap, range_start, range_end) **Example** -``` sql +```sql SELECT bitmapToArray(bitmapSubsetInRange(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,100,200,500]), toUInt32(30), toUInt32(200))) AS res ``` -``` +```text ┌─res───────────────┐ │ [30,31,32,33,100] │ └───────────────────┘ @@ -112,7 +112,7 @@ SELECT bitmapToArray(bitmapSubsetLimit(bitmapBuild([0,1,2,3,4,5,6,7,8,9,10,11,12 Checks whether the bitmap contains an element. -``` +```sql bitmapContains(haystack, needle) ``` @@ -143,7 +143,7 @@ SELECT bitmapContains(bitmapBuild([1,5,7,9]), toUInt32(9)) AS res Checks whether two bitmaps have intersection by some elements. -``` +```sql bitmapHasAny(bitmap1, bitmap2) ``` @@ -160,11 +160,11 @@ If you are sure that `bitmap2` contains strictly one element, consider using the **Example** -``` sql +```sql SELECT bitmapHasAny(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res ``` -``` +```text ┌─res─┐ │ 1 │ └─────┘ @@ -175,7 +175,7 @@ SELECT bitmapHasAny(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res Analogous to `hasAll(array, array)` returns 1 if the first bitmap contains all the elements of the second one, 0 otherwise. If the second argument is an empty bitmap then returns 1. -``` +```sql bitmapHasAll(bitmap,bitmap) ``` @@ -185,11 +185,11 @@ bitmapHasAll(bitmap,bitmap) **Example** -``` sql +```sql SELECT bitmapHasAll(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res ``` -``` +```text ┌─res─┐ │ 0 │ └─────┘ @@ -200,7 +200,7 @@ SELECT bitmapHasAll(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res Two bitmap and calculation, the result is a new bitmap. -``` +```sql bitmapAnd(bitmap,bitmap) ``` @@ -214,7 +214,7 @@ bitmapAnd(bitmap,bitmap) SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res ``` -``` +```text ┌─res─┐ │ [3] │ └─────┘ @@ -225,7 +225,7 @@ SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS re Two bitmap or calculation, the result is a new bitmap. -``` +```sql bitmapOr(bitmap,bitmap) ``` @@ -235,11 +235,11 @@ bitmapOr(bitmap,bitmap) **Example** -``` sql +```sql SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res ``` -``` +```text ┌─res─────────┐ │ [1,2,3,4,5] │ └─────────────┘ @@ -249,7 +249,7 @@ SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res Two bitmap xor calculation, the result is a new bitmap. -``` +```sql bitmapXor(bitmap,bitmap) ``` @@ -259,11 +259,11 @@ bitmapXor(bitmap,bitmap) **Example** -``` sql +```sql SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res ``` -``` +```text ┌─res───────┐ │ [1,2,4,5] │ └───────────┘ @@ -273,7 +273,7 @@ SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS re Two bitmap andnot calculation, the result is a new bitmap. -``` +```sql bitmapAndnot(bitmap,bitmap) ``` @@ -283,11 +283,11 @@ bitmapAndnot(bitmap,bitmap) **Example** -``` sql +```sql SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res ``` -``` +```text ┌─res───┐ │ [1,2] │ └───────┘ @@ -298,7 +298,7 @@ SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS Retrun bitmap cardinality of type UInt64. 
-``` +```sql bitmapCardinality(bitmap) ``` @@ -308,11 +308,11 @@ bitmapCardinality(bitmap) **Example** -``` sql +```sql SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res ``` -``` +```text ┌─res─┐ │ 5 │ └─────┘ @@ -373,7 +373,7 @@ SELECT bitmapMax(bitmapBuild([1, 2, 3, 4, 5])) AS res Two bitmap and calculation, return cardinality of type UInt64. -``` +```sql bitmapAndCardinality(bitmap,bitmap) ``` @@ -383,11 +383,11 @@ bitmapAndCardinality(bitmap,bitmap) **Example** -``` sql +```sql SELECT bitmapAndCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` -``` +```text ┌─res─┐ │ 1 │ └─────┘ @@ -398,7 +398,7 @@ SELECT bitmapAndCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; Two bitmap or calculation, return cardinality of type UInt64. -``` +```sql bitmapOrCardinality(bitmap,bitmap) ``` @@ -408,11 +408,11 @@ bitmapOrCardinality(bitmap,bitmap) **Example** -``` sql +```sql SELECT bitmapOrCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` -``` +```text ┌─res─┐ │ 5 │ └─────┘ @@ -422,7 +422,7 @@ SELECT bitmapOrCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; Two bitmap xor calculation, return cardinality of type UInt64. -``` +```sql bitmapXorCardinality(bitmap,bitmap) ``` @@ -432,11 +432,11 @@ bitmapXorCardinality(bitmap,bitmap) **Example** -``` sql +```sql SELECT bitmapXorCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` -``` +```text ┌─res─┐ │ 4 │ └─────┘ @@ -447,7 +447,7 @@ SELECT bitmapXorCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; Two bitmap andnot calculation, return cardinality of type UInt64. -``` +```sql bitmapAndnotCardinality(bitmap,bitmap) ``` @@ -457,11 +457,11 @@ bitmapAndnotCardinality(bitmap,bitmap) **Example** -``` sql +```sql SELECT bitmapAndnotCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` -``` +```text ┌─res─┐ │ 2 │ └─────┘ diff --git a/docs/en/query_language/functions/conditional_functions.md b/docs/en/query_language/functions/conditional_functions.md index b5c7ce583cc..074df25303f 100644 --- a/docs/en/query_language/functions/conditional_functions.md +++ b/docs/en/query_language/functions/conditional_functions.md @@ -11,7 +11,7 @@ Returns `then` if `cond != 0`, or `else` if `cond = 0`. Allows you to write the [CASE](../operators.md#operator_case) operator more compactly in the query. -``` +```sql multiIf(cond_1, then_1, cond_2, then_2...else) ``` @@ -31,7 +31,7 @@ The function returns one of the values `then_N` or `else`, depending on the cond Take the table -``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -40,7 +40,7 @@ Take the table Run the query `SELECT multiIf(isNull(y) x, y < 3, y, NULL) FROM t_null`. Result: -``` +```text ┌─multiIf(isNull(y), x, less(y, 3), y, NULL)─┐ │ 1 │ │ ᴺᵁᴸᴸ │ diff --git a/docs/en/query_language/functions/date_time_functions.md b/docs/en/query_language/functions/date_time_functions.md index e8716d8c542..c22457d3fcc 100644 --- a/docs/en/query_language/functions/date_time_functions.md +++ b/docs/en/query_language/functions/date_time_functions.md @@ -4,7 +4,7 @@ Support for time zones All functions for working with the date and time that have a logical use for the time zone can accept a second optional time zone argument. Example: Asia/Yekaterinburg. In this case, they use the specified time zone instead of the local (default) one. 
-``` sql +```sql SELECT toDateTime('2016-06-15 23:00:00') AS time, toDate(time) AS date_local, @@ -12,7 +12,7 @@ SELECT toString(time, 'US/Samoa') AS time_samoa ``` -``` +```text ┌────────────────time─┬─date_local─┬─date_yekat─┬─time_samoa──────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-16 │ 2016-06-15 09:00:00 │ └─────────────────────┴────────────┴────────────┴─────────────────────┘ @@ -201,7 +201,7 @@ For mode values with a meaning of “with 4 or more days this year,” weeks are For mode values with a meaning of “contains January 1”, the week contains January 1 is week 1. It doesn't matter how many days in the new year the week contained, even if it contained only one day. -``` +```sql toWeek(date, [, mode][, Timezone]) ``` **Parameters** @@ -212,11 +212,11 @@ toWeek(date, [, mode][, Timezone]) **Example** -``` sql +```sql SELECT toDate('2016-12-27') AS date, toWeek(date) AS week0, toWeek(date,1) AS week1, toWeek(date,9) AS week9; ``` -``` +```text ┌───────date─┬─week0─┬─week1─┬─week9─┐ │ 2016-12-27 │ 52 │ 52 │ 1 │ └────────────┴───────┴───────┴───────┘ @@ -231,11 +231,11 @@ The mode argument works exactly like the mode argument to toWeek(). For the sing **Example** -``` sql +```sql SELECT toDate('2016-12-27') AS date, toYearWeek(date) AS yearWeek0, toYearWeek(date,1) AS yearWeek1, toYearWeek(date,9) AS yearWeek9; ``` -``` +```text ┌───────date─┬─yearWeek0─┬─yearWeek1─┬─yearWeek9─┐ │ 2016-12-27 │ 201652 │ 201652 │ 201701 │ └────────────┴───────────┴───────────┴───────────┘ @@ -286,7 +286,7 @@ SELECT addYears(date_time, 1) AS add_years_with_date_time ``` -``` +```text ┌─add_years_with_date─┬─add_years_with_date_time─┐ │ 2019-01-01 │ 2019-01-01 00:00:00 │ └─────────────────────┴──────────────────────────┘ @@ -305,7 +305,7 @@ SELECT subtractYears(date_time, 1) AS subtract_years_with_date_time ``` -``` +```text ┌─subtract_years_with_date─┬─subtract_years_with_date_time─┐ │ 2018-01-01 │ 2018-01-01 00:00:00 │ └──────────────────────────┴───────────────────────────────┘ diff --git a/docs/en/query_language/functions/ext_dict_functions.md b/docs/en/query_language/functions/ext_dict_functions.md index 95ddb0eaef6..5af21d6014a 100644 --- a/docs/en/query_language/functions/ext_dict_functions.md +++ b/docs/en/query_language/functions/ext_dict_functions.md @@ -6,7 +6,7 @@ For information on connecting and configuring external dictionaries, see [Extern Retrieves a value from an external dictionary. -``` +```sql dictGet('dict_name', 'attr_name', id_expr) dictGetOrDefault('dict_name', 'attr_name', id_expr, default_value_expr) ``` @@ -95,7 +95,7 @@ LIMIT 3 Checks whether a key is present in a dictionary. -``` +```sql dictHas('dict_name', id_expr) ``` @@ -115,7 +115,7 @@ Type: `UInt8`. For the hierarchical dictionary, returns an array of dictionary keys starting from the passed `id_expr` and continuing along the chain of parent elements. -``` +```sql dictGetHierarchy('dict_name', id_expr) ``` @@ -134,7 +134,7 @@ Type: Array(UInt64). Checks the ancestor of a key through the whole hierarchical chain in the dictionary. -``` +```sql dictIsIn('dict_name', child_id_expr, ancestor_id_expr) ``` @@ -169,7 +169,7 @@ All these functions have the `OrDefault` modification. 
For example, `dictGetDate Syntax: -``` +```sql dictGet[Type]('dict_name', 'attr_name', id_expr) dictGet[Type]OrDefault('dict_name', 'attr_name', id_expr, default_value_expr) ``` diff --git a/docs/en/query_language/functions/functions_for_nulls.md b/docs/en/query_language/functions/functions_for_nulls.md index 41fec479d0d..a5116096947 100644 --- a/docs/en/query_language/functions/functions_for_nulls.md +++ b/docs/en/query_language/functions/functions_for_nulls.md @@ -4,7 +4,7 @@ Checks whether the argument is [NULL](../syntax.md#null). -``` +```sql isNull(x) ``` @@ -21,7 +21,7 @@ isNull(x) Input table -``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -30,25 +30,21 @@ Input table Query +```sql +SELECT x FROM t_null WHERE isNull(y) ``` -:) SELECT x FROM t_null WHERE isNull(y) - -SELECT x -FROM t_null -WHERE isNull(y) - +```text ┌─x─┐ │ 1 │ └───┘ -1 rows in set. Elapsed: 0.010 sec. ``` ## isNotNull Checks whether the argument is [NULL](../syntax.md#null). -``` +```sql isNotNull(x) ``` @@ -65,7 +61,7 @@ isNotNull(x) Input table -``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -74,25 +70,21 @@ Input table Query +```sql +SELECT x FROM t_null WHERE isNotNull(y) ``` -:) SELECT x FROM t_null WHERE isNotNull(y) - -SELECT x -FROM t_null -WHERE isNotNull(y) - +```text ┌─x─┐ │ 2 │ └───┘ -1 rows in set. Elapsed: 0.010 sec. ``` ## coalesce Checks from left to right whether `NULL` arguments were passed and returns the first non-`NULL` argument. -``` +```sql coalesce(x,...) ``` @@ -109,7 +101,7 @@ coalesce(x,...) Consider a list of contacts that may specify multiple ways to contact a customer. -``` +```text ┌─name─────┬─mail─┬─phone─────┬──icq─┐ │ client 1 │ ᴺᵁᴸᴸ │ 123-45-67 │ 123 │ │ client 2 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ @@ -120,25 +112,22 @@ The `mail` and `phone` fields are of type String, but the `icq` field is `UInt32 Get the first available contact method for the customer from the contact list: +```sql +SELECT coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook ``` -:) SELECT coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook - -SELECT coalesce(mail, phone, CAST(icq, 'Nullable(String)')) -FROM aBook - +```text ┌─name─────┬─coalesce(mail, phone, CAST(icq, 'Nullable(String)'))─┐ │ client 1 │ 123-45-67 │ │ client 2 │ ᴺᵁᴸᴸ │ └──────────┴──────────────────────────────────────────────────────┘ -2 rows in set. Elapsed: 0.006 sec. ``` ## ifNull Returns an alternative value if the main argument is `NULL`. -``` +```sql ifNull(x,alt) ``` @@ -154,17 +143,19 @@ ifNull(x,alt) **Example** -``` +```sql SELECT ifNull('a', 'b') - +``` +```text ┌─ifNull('a', 'b')─┐ │ a │ └──────────────────┘ ``` -``` +```sql SELECT ifNull(NULL, 'b') - +``` +```text ┌─ifNull(NULL, 'b')─┐ │ b │ └───────────────────┘ @@ -174,7 +165,7 @@ SELECT ifNull(NULL, 'b') Returns `NULL` if the arguments are equal. -``` +```sql nullIf(x, y) ``` @@ -189,17 +180,19 @@ nullIf(x, y) **Example** -``` +```sql SELECT nullIf(1, 1) - +``` +```text ┌─nullIf(1, 1)─┐ │ ᴺᵁᴸᴸ │ └──────────────┘ ``` -``` +```sql SELECT nullIf(1, 2) - +``` +```text ┌─nullIf(1, 2)─┐ │ 1 │ └──────────────┘ @@ -209,7 +202,7 @@ SELECT nullIf(1, 2) Results in a value of type [Nullable](../../data_types/nullable.md) for a non- `Nullable`, if the value is not `NULL`. -``` +```sql assumeNotNull(x) ``` @@ -226,15 +219,16 @@ assumeNotNull(x) Consider the `t_null` table. 
-``` +```sql SHOW CREATE TABLE t_null - +``` +```text ┌─statement─────────────────────────────────────────────────────────────────┐ │ CREATE TABLE default.t_null ( x Int8, y Nullable(Int8)) ENGINE = TinyLog │ └───────────────────────────────────────────────────────────────────────────┘ ``` -``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -243,18 +237,20 @@ SHOW CREATE TABLE t_null Apply the `assumeNotNull` function to the `y` column. -``` +```sql SELECT assumeNotNull(y) FROM t_null - +``` +```text ┌─assumeNotNull(y)─┐ │ 0 │ │ 3 │ └──────────────────┘ ``` -``` +```sql SELECT toTypeName(assumeNotNull(y)) FROM t_null - +``` +```text ┌─toTypeName(assumeNotNull(y))─┐ │ Int8 │ │ Int8 │ @@ -265,7 +261,7 @@ SELECT toTypeName(assumeNotNull(y)) FROM t_null Converts the argument type to `Nullable`. -``` +```sql toNullable(x) ``` @@ -279,15 +275,18 @@ toNullable(x) **Example** -``` +```sql SELECT toTypeName(10) - +``` +```text ┌─toTypeName(10)─┐ │ UInt8 │ └────────────────┘ - +``` +```sql SELECT toTypeName(toNullable(10)) - +``` +```text ┌─toTypeName(toNullable(10))─┐ │ Nullable(UInt8) │ └────────────────────────────┘ diff --git a/docs/en/query_language/functions/geo.md b/docs/en/query_language/functions/geo.md index 79b8390d59f..49cdcecc9f9 100644 --- a/docs/en/query_language/functions/geo.md +++ b/docs/en/query_language/functions/geo.md @@ -4,7 +4,7 @@ Calculate the distance between two points on the Earth's surface using [the great-circle formula](https://en.wikipedia.org/wiki/Great-circle_distance). -``` +```sql greatCircleDistance(lon1Deg, lat1Deg, lon2Deg, lat2Deg) ``` @@ -25,11 +25,11 @@ Generates an exception when the input parameter values fall outside of the range **Example** -``` sql +```sql SELECT greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673) ``` -``` +```text ┌─greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673)─┐ │ 14132374.194975413 │ └───────────────────────────────────────────────────────────────────┘ @@ -40,7 +40,7 @@ SELECT greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673) Checks whether the point belongs to at least one of the ellipses. Coordinates are geometric in the Cartesian coordinate system. -``` +```sql pointInEllipses(x, y, x₀, y₀, a₀, b₀,...,xₙ, yₙ, aₙ, bₙ) ``` @@ -58,11 +58,11 @@ The input parameters must be `2+4⋅n`, where `n` is the number of ellipses. **Example** -``` sql +```sql SELECT pointInEllipses(10., 10., 10., 9.1, 1., 0.9999) ``` -``` +```text ┌─pointInEllipses(10., 10., 10., 9.1, 1., 0.9999)─┐ │ 1 │ └─────────────────────────────────────────────────┘ @@ -72,7 +72,7 @@ SELECT pointInEllipses(10., 10., 10., 9.1, 1., 0.9999) Checks whether the point belongs to the polygon on the plane. -``` +```sql pointInPolygon((x, y), [(a, b), (c, d) ...], ...) ``` @@ -89,11 +89,11 @@ If the point is on the polygon boundary, the function may return either 0 or 1. **Example** -``` sql +```sql SELECT pointInPolygon((3., 3.), [(6, 0), (8, 4), (5, 8), (0, 2)]) AS res ``` -``` +```text ┌─res─┐ │ 1 │ └─────┘ @@ -102,7 +102,7 @@ SELECT pointInPolygon((3., 3.), [(6, 0), (8, 4), (5, 8), (0, 2)]) AS res ## geohashEncode Encodes latitude and longitude as a geohash-string, please see (http://geohash.org/, https://en.wikipedia.org/wiki/Geohash). 
-``` +```sql geohashEncode(longitude, latitude, [precision]) ``` @@ -118,11 +118,11 @@ geohashEncode(longitude, latitude, [precision]) **Example** -``` sql +```sql SELECT geohashEncode(-5.60302734375, 42.593994140625, 0) AS res ``` -``` +```text ┌─res──────────┐ │ ezs42d000000 │ └──────────────┘ @@ -142,11 +142,11 @@ Decodes any geohash-encoded string into longitude and latitude. **Example** -``` sql +```sql SELECT geohashDecode('ezs42') AS res ``` -``` +```text ┌─res─────────────────────────────┐ │ (-5.60302734375,42.60498046875) │ └─────────────────────────────────┘ @@ -156,7 +156,7 @@ SELECT geohashDecode('ezs42') AS res Calculates [H3](https://uber.github.io/h3/#/documentation/overview/introduction) point index `(lon, lat)` with specified resolution. -``` +```sql geoToH3(lon, lat, resolution) ``` @@ -175,10 +175,10 @@ Type: [UInt64](../../data_types/int_uint.md). **Example** -``` sql +```sql SELECT geoToH3(37.79506683, 55.71290588, 15) as h3Index ``` -``` +```text ┌────────────h3Index─┐ │ 644325524701193974 │ └────────────────────┘ @@ -207,10 +207,10 @@ Please note that function will throw an exception if resulting array is over 10' **Example** -``` +```sql SELECT geohashesInBox(24.48, 40.56, 24.785, 40.81, 4) AS thasos ``` -``` +```text ┌─thasos──────────────────────────────────────┐ │ ['sx1q','sx1r','sx32','sx1w','sx1x','sx38'] │ └─────────────────────────────────────────────┘ diff --git a/docs/en/query_language/functions/hash_functions.md b/docs/en/query_language/functions/hash_functions.md index b384dead609..1de34badcc2 100644 --- a/docs/en/query_language/functions/hash_functions.md +++ b/docs/en/query_language/functions/hash_functions.md @@ -6,7 +6,7 @@ Hash functions can be used for the deterministic pseudo-random shuffling of elem [Interprets](../../query_language/functions/type_conversion_functions.md#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the [MD5](https://en.wikipedia.org/wiki/MD5) hash value for each of them. Then combines hashes, takes the first 8 bytes of the hash of the resulting string, and interprets them as `UInt64` in big-endian byte order. -``` +```sql halfMD5(par1, ...) ``` @@ -42,7 +42,7 @@ If you want to get the same result as output by the md5sum utility, use lower(he Produces a 64-bit [SipHash](https://131002.net/siphash/) hash value. -``` +```sql sipHash64(par1,...) ``` @@ -68,7 +68,7 @@ A [UInt64](../../data_types/int_uint.md) data type hash value. ```sql SELECT sipHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS SipHash, toTypeName(SipHash) AS type ``` -``` +```text ┌──────────────SipHash─┬─type───┐ │ 13726873534472839665 │ UInt64 │ └──────────────────────┴────────┘ @@ -84,7 +84,7 @@ Differs from sipHash64 in that the final xor-folding state is only done up to 12 Produces a 64-bit [CityHash](https://github.com/google/cityhash) hash value. -``` +```sql cityHash64(par1,...) ``` @@ -150,7 +150,7 @@ Levels are the same as in URLHierarchy. This function is specific to Yandex.Metr Produces a 64-bit [FarmHash](https://github.com/google/farmhash) hash value. -``` +```sql farmHash64(par1, ...) ``` @@ -191,7 +191,7 @@ This is just [JavaHash](#hash_functions-javahash) with zeroed out sign bit. This Produces a 64-bit [MetroHash](http://www.jandrewrogers.com/2015/05/27/metrohash/) hash value. -``` +```sql metroHash64(par1, ...) 
``` @@ -224,7 +224,7 @@ For more information, see the link: [JumpConsistentHash](https://arxiv.org/pdf/1 Produces a [MurmurHash2](https://github.com/aappleby/smhasher) hash value. -``` +```sql murmurHash2_32(par1, ...) murmurHash2_64(par1, ...) ``` @@ -253,7 +253,7 @@ SELECT murmurHash2_64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23: Produces a [MurmurHash3](https://github.com/aappleby/smhasher) hash value. -``` +```sql murmurHash3_32(par1, ...) murmurHash3_64(par1, ...) ``` @@ -282,7 +282,7 @@ SELECT murmurHash3_32(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23: Produces a 128-bit [MurmurHash3](https://github.com/aappleby/smhasher) hash value. -``` +```sql murmurHash3_128( expr ) ``` diff --git a/docs/en/query_language/functions/higher_order_functions.md b/docs/en/query_language/functions/higher_order_functions.md index 5f401219a33..c114f912980 100644 --- a/docs/en/query_language/functions/higher_order_functions.md +++ b/docs/en/query_language/functions/higher_order_functions.md @@ -25,18 +25,20 @@ Returns an array obtained from the original application of the `func` function t Examples: -``` sql +```sql SELECT arrayMap(x -> (x + 2), [1, 2, 3]) as res; - +``` +```text ┌─res─────┐ │ [3,4,5] │ └─────────┘ ``` The following example shows how to create a tuple of elements from different arrays: -``` sql +```sql SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res - +``` +```text ┌─res─────────────────┐ │ [(1,4),(2,5),(3,6)] │ └─────────────────────┘ @@ -50,17 +52,17 @@ Returns an array containing only the elements in `arr1` for which `func` returns Examples: -``` sql +```sql SELECT arrayFilter(x -> x LIKE '%World%', ['Hello', 'abc World']) AS res ``` -``` +```text ┌─res───────────┐ │ ['abc World'] │ └───────────────┘ ``` -``` sql +```sql SELECT arrayFilter( (i, x) -> x LIKE '%World%', @@ -69,7 +71,7 @@ SELECT AS res ``` -``` +```text ┌─res─┐ │ [2] │ └─────┘ @@ -111,11 +113,11 @@ Returns an array of partial sums of elements in the source array (a running sum) Example: -``` sql +```sql SELECT arrayCumSum([1, 1, 1, 1]) AS res ``` -``` +```text ┌─res──────────┐ │ [1, 2, 3, 4] │ └──────────────┘ @@ -125,11 +127,11 @@ SELECT arrayCumSum([1, 1, 1, 1]) AS res Same as `arrayCumSum`, returns an array of partial sums of elements in the source array (a running sum). Different `arrayCumSum`, when then returned value contains a value less than zero, the value is replace with zero and the subsequent calculation is performed with zero parameters. 
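Run on the same input as the example below, a side-by-side sketch makes the difference concrete: `arrayCumSum` keeps the negative running total, while `arrayCumSumNonNegative` clamps it to zero before continuing.

```sql
SELECT
    arrayCumSum([1, 1, -4, 1]) AS cumulative,              -- [1, 2, -2, -1]
    arrayCumSumNonNegative([1, 1, -4, 1]) AS non_negative  -- [1, 2, 0, 1]
```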
For example: -``` sql +```sql SELECT arrayCumSumNonNegative([1, 1, -4, 1]) AS res ``` -``` +```text ┌─res───────┐ │ [1,2,0,1] │ └───────────┘ @@ -143,11 +145,11 @@ The [Schwartzian transform](https://en.wikipedia.org/wiki/Schwartzian_transform) Example: -``` sql +```sql SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]); ``` -``` +```text ┌─res────────────────┐ │ ['world', 'hello'] │ └────────────────────┘ @@ -161,10 +163,10 @@ Returns an array as result of sorting the elements of `arr1` in descending order Example: -``` sql +```sql SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; ``` -``` sql +```text ┌─res───────────────┐ │ ['hello','world'] │ └───────────────────┘ diff --git a/docs/en/query_language/functions/ip_address_functions.md b/docs/en/query_language/functions/ip_address_functions.md index ce58a187853..411b882d6b2 100644 --- a/docs/en/query_language/functions/ip_address_functions.md +++ b/docs/en/query_language/functions/ip_address_functions.md @@ -14,7 +14,7 @@ Similar to IPv4NumToString, but using xxx instead of the last octet. Example: -``` sql +```sql SELECT IPv4NumToStringClassC(ClientIP) AS k, count() AS c @@ -24,7 +24,7 @@ ORDER BY c DESC LIMIT 10 ``` -``` +```text ┌─k──────────────┬─────c─┐ │ 83.149.9.xxx │ 26238 │ │ 217.118.81.xxx │ 26074 │ @@ -46,17 +46,17 @@ Since using 'xxx' is highly unusual, this may be changed in the future. We recom Accepts a FixedString(16) value containing the IPv6 address in binary format. Returns a string containing this address in text format. IPv6-mapped IPv4 addresses are output in the format ::ffff:111.222.33.44. Examples: -``` sql +```sql SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr ``` -``` +```text ┌─addr─────────┐ │ 2a02:6b8::11 │ └──────────────┘ ``` -``` sql +```sql SELECT IPv6NumToString(ClientIP6 AS k), count() AS c @@ -67,7 +67,7 @@ ORDER BY c DESC LIMIT 10 ``` -``` +```text ┌─IPv6NumToString(ClientIP6)──────────────┬─────c─┐ │ 2a02:2168:aaa:bbbb::2 │ 24695 │ │ 2a02:2698:abcd:abcd:abcd:abcd:8888:5555 │ 22408 │ @@ -82,7 +82,7 @@ LIMIT 10 └─────────────────────────────────────────┴───────┘ ``` -``` sql +```sql SELECT IPv6NumToString(ClientIP6 AS k), count() AS c @@ -93,7 +93,7 @@ ORDER BY c DESC LIMIT 10 ``` -``` +```text ┌─IPv6NumToString(ClientIP6)─┬──────c─┐ │ ::ffff:94.26.111.111 │ 747440 │ │ ::ffff:37.143.222.4 │ 529483 │ @@ -117,11 +117,11 @@ HEX can be uppercase or lowercase. Takes a `UInt32` number. Interprets it as an IPv4 address in [big endian](https://en.wikipedia.org/wiki/Endianness). Returns a `FixedString(16)` value containing the IPv6 address in binary format. Examples: -``` sql +```sql SELECT IPv6NumToString(IPv4ToIPv6(IPv4StringToNum('192.168.0.1'))) AS addr ``` -``` +```text ┌─addr───────────────┐ │ ::ffff:192.168.0.1 │ └────────────────────┘ @@ -131,7 +131,7 @@ SELECT IPv6NumToString(IPv4ToIPv6(IPv4StringToNum('192.168.0.1'))) AS addr Accepts a FixedString(16) value containing the IPv6 address in binary format. Returns a string containing the address of the specified number of bits removed in text format. 
For example: -``` sql +```sql WITH IPv6StringToNum('2001:0DB8:AC10:FE01:FEED:BABE:CAFE:F00D') AS ipv6, IPv4ToIPv6(IPv4StringToNum('192.168.0.1')) AS ipv4 @@ -141,7 +141,7 @@ SELECT ``` -``` +```text ┌─cutIPv6(ipv6, 2, 0)─────────────────┬─cutIPv6(ipv4, 0, 2)─┐ │ 2001:db8:ac10:fe01:feed:babe:cafe:0 │ ::ffff:192.168.0.0 │ └─────────────────────────────────────┴─────────────────────┘ @@ -155,7 +155,7 @@ Accepts an IPv4 and an UInt8 value containing the [CIDR](https://en.wikipedia.or ```sql SELECT IPv4CIDRToRange(toIPv4('192.168.5.2'), 16) ``` -``` +```text ┌─IPv4CIDRToRange(toIPv4('192.168.5.2'), 16)─┐ │ ('192.168.0.0','192.168.255.255') │ └────────────────────────────────────────────┘ @@ -171,7 +171,7 @@ Accepts an IPv6 and an UInt8 value containing the CIDR. Return a tuple with two SELECT IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32); ``` -``` +```text ┌─IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32)─┐ │ ('2001:db8::','2001:db8:ffff:ffff:ffff:ffff:ffff:ffff') │ └────────────────────────────────────────────────────────────────────────┘ @@ -181,7 +181,7 @@ SELECT IPv6CIDRToRange(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32); An alias to `IPv4StringToNum()` that takes a string form of IPv4 address and returns value of [IPv4](../../data_types/domains/ipv4.md) type, which is binary equal to value returned by `IPv4StringToNum()`. -``` sql +```sql WITH '171.225.130.45' as IPv4_string SELECT @@ -189,13 +189,13 @@ SELECT toTypeName(toIPv4(IPv4_string)) ``` -``` +```text ┌─toTypeName(IPv4StringToNum(IPv4_string))─┬─toTypeName(toIPv4(IPv4_string))─┐ │ UInt32 │ IPv4 │ └──────────────────────────────────────────┴─────────────────────────────────┘ ``` -``` sql +```sql WITH '171.225.130.45' as IPv4_string SELECT @@ -203,7 +203,7 @@ SELECT hex(toIPv4(IPv4_string)) ``` -``` +```text ┌─hex(IPv4StringToNum(IPv4_string))─┬─hex(toIPv4(IPv4_string))─┐ │ ABE1822D │ ABE1822D │ └───────────────────────────────────┴──────────────────────────┘ @@ -213,7 +213,7 @@ SELECT An alias to `IPv6StringToNum()` that takes a string form of IPv6 address and returns value of [IPv6](../../data_types/domains/ipv6.md) type, which is binary equal to value returned by `IPv6StringToNum()`. -``` sql +```sql WITH '2001:438:ffff::407d:1bc1' as IPv6_string SELECT @@ -221,13 +221,13 @@ SELECT toTypeName(toIPv6(IPv6_string)) ``` -``` +```text ┌─toTypeName(IPv6StringToNum(IPv6_string))─┬─toTypeName(toIPv6(IPv6_string))─┐ │ FixedString(16) │ IPv6 │ └──────────────────────────────────────────┴─────────────────────────────────┘ ``` -``` sql +```sql WITH '2001:438:ffff::407d:1bc1' as IPv6_string SELECT @@ -235,7 +235,7 @@ SELECT hex(toIPv6(IPv6_string)) ``` -``` +```text ┌─hex(IPv6StringToNum(IPv6_string))─┬─hex(toIPv6(IPv6_string))─────────┐ │ 20010438FFFF000000000000407D1BC1 │ 20010438FFFF000000000000407D1BC1 │ └───────────────────────────────────┴──────────────────────────────────┘ diff --git a/docs/en/query_language/functions/json_functions.md b/docs/en/query_language/functions/json_functions.md index 21e346e9ef9..4de06ae53fb 100644 --- a/docs/en/query_language/functions/json_functions.md +++ b/docs/en/query_language/functions/json_functions.md @@ -35,7 +35,7 @@ Returns the value of a field, including separators. Examples: -``` +```sql visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"' visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}' ``` @@ -46,7 +46,7 @@ Parses the string in double quotes. The value is unescaped. 
If unescaping failed Examples: -``` +```sql visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺' visitParamExtractString('{"abc":"\\u263"}', 'abc') = '' @@ -65,9 +65,9 @@ If the value does not exist, `0` will be returned. Examples: -``` -select JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 1 -select JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4) = 0 +```sql +SELECT JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 1 +SELECT JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4) = 0 ``` `indices_or_keys` is a list of zero or more arguments each of them can be either string or integer. @@ -82,12 +82,12 @@ You may use integers to access both JSON arrays and JSON objects. So, for example: -``` -select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'a' -select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 2) = 'b' -select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -1) = 'b' -select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a' -select JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' +```sql +SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'a' +SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 2) = 'b' +SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -1) = 'b' +SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a' +SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' ``` ## JSONLength(json[, indices_or_keys]...) @@ -98,9 +98,9 @@ If the value does not exist or has a wrong type, `0` will be returned. Examples: -``` -select JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3 -select JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 +```sql +SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3 +SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 ``` ## JSONType(json[, indices_or_keys]...) @@ -111,10 +111,10 @@ If the value does not exist, `Null` will be returned. Examples: -``` -select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}') = 'Object' -select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String' -select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' +```sql +SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}') = 'Object' +SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String' +SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' ``` ## JSONExtractUInt(json[, indices_or_keys]...) @@ -128,10 +128,10 @@ If the value does not exist or has a wrong type, `0` will be returned. Examples: -``` -select JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100 -select JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200.0 -select JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 +```sql +SELECT JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100 +SELECT JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200.0 +SELECT JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 ``` ## JSONExtractString(json[, indices_or_keys]...) @@ -144,12 +144,12 @@ The value is unescaped. If unescaping failed, it returns an empty string. 
Examples: -``` -select JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'hello' -select JSONExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' -select JSONExtractString('{"abc":"\\u263a"}', 'abc') = '☺' -select JSONExtractString('{"abc":"\\u263"}', 'abc') = '' -select JSONExtractString('{"abc":"hello}', 'abc') = '' +```sql +SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'hello' +SELECT JSONExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' +SELECT JSONExtractString('{"abc":"\\u263a"}', 'abc') = '☺' +SELECT JSONExtractString('{"abc":"\\u263"}', 'abc') = '' +SELECT JSONExtractString('{"abc":"hello}', 'abc') = '' ``` ## JSONExtract(json[, indices_or_keys...], return_type) @@ -163,7 +163,7 @@ This means Examples: -``` +```sql SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(String, Array(Float64))') = ('hello',[-100,200,300]) SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(b Array(Float64), a String)') = ([-100,200,300],'hello') SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 'Array(Nullable(Int8))') = [-100, NULL, NULL] @@ -179,7 +179,7 @@ Parse key-value pairs from a JSON where the values are of the given ClickHouse d Example: -``` +```sql SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') = [('a',5),('b',7),('c',11)]; ``` @@ -191,8 +191,8 @@ If the part does not exist or has a wrong type, an empty string will be returned Example: -``` -select JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]' +```sql +SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]' ``` [Original article](https://clickhouse.yandex/docs/en/query_language/functions/json_functions/) diff --git a/docs/en/query_language/functions/math_functions.md b/docs/en/query_language/functions/math_functions.md index 31deb337fdb..d2673ddda41 100644 --- a/docs/en/query_language/functions/math_functions.md +++ b/docs/en/query_language/functions/math_functions.md @@ -48,11 +48,11 @@ If 'x' is non-negative, then erf(x / σ√2) is the probability that a random Example (three sigma rule): -``` sql +```sql SELECT erf(3 / sqrt(2)) ``` -``` +```text ┌─erf(divide(3, sqrt(2)))─┐ │ 0.9973002039367398 │ └─────────────────────────┘ diff --git a/docs/en/query_language/functions/other_functions.md b/docs/en/query_language/functions/other_functions.md index 131f961b22a..b1471b7dfe4 100644 --- a/docs/en/query_language/functions/other_functions.md +++ b/docs/en/query_language/functions/other_functions.md @@ -8,7 +8,7 @@ Returns a string with the name of the host that this function was performed on. Extracts the trailing part of a string after the last slash or backslash. This function if often used to extract the filename from a path. -``` +```sql basename( expr ) ``` @@ -60,9 +60,10 @@ This function is used by the system for implementing Pretty formats. `NULL` is represented as a string corresponding to `NULL` in `Pretty` formats. -``` +```sql SELECT visibleWidth(NULL) - +``` +```text ┌─visibleWidth(NULL)─┐ │ 4 │ └────────────────────┘ @@ -139,7 +140,7 @@ The band is drawn with accuracy to one eighth of a symbol. 
Example: -``` sql +```sql SELECT toHour(EventTime) AS h, count() AS c, @@ -149,7 +150,7 @@ GROUP BY h ORDER BY h ASC ``` -``` +```text ┌──h─┬──────c─┬─bar────────────────┐ │ 0 │ 292907 │ █████████▋ │ │ 1 │ 180563 │ ██████ │ @@ -208,7 +209,7 @@ If the 'x' value is equal to one of the elements in the 'array_from' array, it r Example: -``` sql +```sql SELECT transform(SearchEngineID, [2, 3], ['Yandex', 'Google'], 'Other') AS title, count() AS c @@ -218,7 +219,7 @@ GROUP BY title ORDER BY c DESC ``` -``` +```text ┌─title─────┬──────c─┐ │ Yandex │ 498635 │ │ Google │ 229872 │ @@ -237,7 +238,7 @@ Types: Example: -``` sql +```sql SELECT transform(domain(Referer), ['yandex.ru', 'google.ru', 'vk.com'], ['www.yandex', 'example.com']) AS s, count() AS c @@ -247,7 +248,7 @@ ORDER BY count() DESC LIMIT 10 ``` -``` +```text ┌─s──────────────┬───────c─┐ │ │ 2906259 │ │ www.yandex │ 867767 │ @@ -267,13 +268,13 @@ Accepts the size (number of bytes). Returns a rounded size with a suffix (KiB, M Example: -``` sql +```sql SELECT arrayJoin([1, 1024, 1024*1024, 192851925]) AS filesize_bytes, formatReadableSize(filesize_bytes) AS filesize ``` -``` +```text ┌─filesize_bytes─┬─filesize───┐ │ 1 │ 1.00 B │ │ 1024 │ 1.00 KiB │ @@ -325,7 +326,7 @@ If you make a subquery with ORDER BY and call the function from outside the subq If `offset` value is outside block bounds, a default value for `column` returned. If `default_value` is given, then it will be used. This function can be used to compute year-over-year metric value: -``` sql +```sql WITH toDate('2018-01-01') AS start_date SELECT toStartOfMonth(start_date + (number * 32)) AS month, @@ -335,7 +336,7 @@ SELECT FROM numbers(16) ``` -``` +```text ┌──────month─┬─money─┬─prev_year─┬─year_over_year─┐ │ 2018-01-01 │ 32 │ 0 │ 0 │ │ 2018-02-01 │ 63 │ 0 │ 0 │ @@ -367,7 +368,7 @@ If you make a subquery with ORDER BY and call the function from outside the subq Example: -``` sql +```sql SELECT EventID, EventTime, @@ -384,7 +385,7 @@ FROM ) ``` -``` +```text ┌─EventID─┬───────────EventTime─┬─delta─┐ │ 1106 │ 2016-11-24 00:00:04 │ 0 │ │ 1107 │ 2016-11-24 00:00:05 │ 1 │ @@ -396,19 +397,22 @@ FROM Please note - block size affects the result. With each new block, the `runningDifference` state is reset. -``` sql +```sql SELECT number, runningDifference(number + 1) AS diff FROM numbers(100000) WHERE diff != 1 +``` +```text ┌─number─┬─diff─┐ │ 0 │ 0 │ └────────┴──────┘ ┌─number─┬─diff─┐ │ 65536 │ 0 │ └────────┴──────┘ - +``` +```sql set max_block_size=100000 -- default value is 65536! SELECT @@ -416,6 +420,8 @@ SELECT runningDifference(number + 1) AS diff FROM numbers(100000) WHERE diff != 1 +``` +```text ┌─number─┬─diff─┐ │ 0 │ 0 │ └────────┴──────┘ @@ -441,7 +447,7 @@ Accepts a MAC address in the format AA:BB:CC:DD:EE:FF (colon-separated numbers i Returns the number of fields in [Enum](../../data_types/enum.md). -``` +```sql getSizeOfEnumType(value) ``` @@ -456,9 +462,10 @@ getSizeOfEnumType(value) **Example** -``` +```sql SELECT getSizeOfEnumType( CAST('a' AS Enum8('a' = 1, 'b' = 2) ) ) AS x - +``` +```text ┌─x─┐ │ 2 │ └───┘ @@ -468,7 +475,7 @@ SELECT getSizeOfEnumType( CAST('a' AS Enum8('a' = 1, 'b' = 2) ) ) AS x Returns the name of the class that represents the data type of the column in RAM. 
-``` +```sql toColumnTypeName(value) ``` @@ -482,21 +489,18 @@ toColumnTypeName(value) **Example of the difference between` toTypeName ' and ' toColumnTypeName`** +```sql +SELECT toTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) ``` -:) select toTypeName(cast('2018-01-01 01:02:03' AS DateTime)) - -SELECT toTypeName(CAST('2018-01-01 01:02:03', 'DateTime')) - +```text ┌─toTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ │ DateTime │ └─────────────────────────────────────────────────────┘ - -1 rows in set. Elapsed: 0.008 sec. - -:) select toColumnTypeName(cast('2018-01-01 01:02:03' AS DateTime)) - -SELECT toColumnTypeName(CAST('2018-01-01 01:02:03', 'DateTime')) - +``` +```sql +SELECT toColumnTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) +``` +```text ┌─toColumnTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ │ Const(UInt32) │ └───────────────────────────────────────────────────────────┘ @@ -508,7 +512,7 @@ The example shows that the `DateTime` data type is stored in memory as `Const(UI Outputs a detailed description of data structures in RAM -``` +```sql dumpColumnStructure(value) ``` @@ -522,9 +526,10 @@ dumpColumnStructure(value) **Example** -``` +```sql SELECT dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime')) - +``` +```text ┌─dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ │ DateTime, Const(size = 1, UInt32(size = 1)) │ └──────────────────────────────────────────────────────────────┘ @@ -536,7 +541,7 @@ Outputs the default value for the data type. Does not include default values for custom columns set by the user. -``` +```sql defaultValueOfArgumentType(expression) ``` @@ -552,26 +557,21 @@ defaultValueOfArgumentType(expression) **Example** +```sql +SELECT defaultValueOfArgumentType( CAST(1 AS Int8) ) ``` -:) SELECT defaultValueOfArgumentType( CAST(1 AS Int8) ) - -SELECT defaultValueOfArgumentType(CAST(1, 'Int8')) - +```text ┌─defaultValueOfArgumentType(CAST(1, 'Int8'))─┐ │ 0 │ └─────────────────────────────────────────────┘ - -1 rows in set. Elapsed: 0.002 sec. - -:) SELECT defaultValueOfArgumentType( CAST(1 AS Nullable(Int8) ) ) - -SELECT defaultValueOfArgumentType(CAST(1, 'Nullable(Int8)')) - +``` +```sql +SELECT defaultValueOfArgumentType( CAST(1 AS Nullable(Int8) ) ) +``` +```text ┌─defaultValueOfArgumentType(CAST(1, 'Nullable(Int8)'))─┐ │ ᴺᵁᴸᴸ │ └───────────────────────────────────────────────────────┘ - -1 rows in set. Elapsed: 0.002 sec. ``` ## indexHint @@ -588,9 +588,10 @@ The expression passed to the function is not calculated, but ClickHouse applies Here is a table with the test data for [ontime](../../getting_started/example_datasets/ontime.md). -``` +```sql SELECT count() FROM ontime - +``` +```text ┌─count()─┐ │ 4276457 │ └─────────┘ @@ -600,15 +601,10 @@ The table has indexes for the fields `(FlightDate, (Year, FlightDate))`. Create a selection by date like this: +```sql +SELECT FlightDate AS k, count() FROM ontime GROUP BY k ORDER BY k ``` -:) SELECT FlightDate AS k, count() FROM ontime GROUP BY k ORDER BY k - -SELECT - FlightDate AS k, - count() -FROM ontime -GROUP BY k -ORDER BY k ASC +```text ┌──────────k─┬─count()─┐ │ 2017-01-01 │ 13970 │ @@ -618,37 +614,24 @@ ORDER BY k ASC │ 2017-09-29 │ 16384 │ │ 2017-09-30 │ 12520 │ └────────────┴─────────┘ - -273 rows in set. Elapsed: 0.072 sec. Processed 4.28 million rows, 8.55 MB (59.00 million rows/s., 118.01 MB/s.) ``` In this selection, the index is not used and ClickHouse processed the entire table (`Processed 4.28 million rows`). 
To apply the index, select a specific date and run the following query: +```sql +SELECT FlightDate AS k, count() FROM ontime WHERE k = '2017-09-15' GROUP BY k ORDER BY k ``` -:) SELECT FlightDate AS k, count() FROM ontime WHERE k = '2017-09-15' GROUP BY k ORDER BY k - -SELECT - FlightDate AS k, - count() -FROM ontime -WHERE k = '2017-09-15' -GROUP BY k -ORDER BY k ASC - +```text ┌──────────k─┬─count()─┐ │ 2017-09-15 │ 16428 │ └────────────┴─────────┘ - -1 rows in set. Elapsed: 0.014 sec. Processed 32.74 thousand rows, 65.49 KB (2.31 million rows/s., 4.63 MB/s.) ``` The last line of output shows that by using the index, ClickHouse processed a significantly smaller number of rows (`Processed 32.74 thousand rows`). Now pass the expression `k = '2017-09-15'` to the `indexHint` function: -``` -:) SELECT FlightDate AS k, count() FROM ontime WHERE indexHint(k = '2017-09-15') GROUP BY k ORDER BY k - +```sql SELECT FlightDate AS k, count() @@ -656,15 +639,14 @@ FROM ontime WHERE indexHint(k = '2017-09-15') GROUP BY k ORDER BY k ASC - +``` +```text ┌──────────k─┬─count()─┐ │ 2017-09-14 │ 7071 │ │ 2017-09-15 │ 16428 │ │ 2017-09-16 │ 1077 │ │ 2017-09-30 │ 8167 │ └────────────┴─────────┘ - -4 rows in set. Elapsed: 0.004 sec. Processed 32.74 thousand rows, 65.49 KB (8.97 million rows/s., 17.94 MB/s.) ``` The response to the request shows that ClickHouse applied the index in the same way as the previous time (`Processed 32.74 thousand rows`). However, the resulting set of rows shows that the expression `k = '2017-09-15'` was not used when generating the result. @@ -677,7 +659,7 @@ Creates an array with a single value. Used for internal implementation of [arrayJoin](array_join.md#functions_arrayjoin). -``` +```sql replicate(x, arr) ``` @@ -692,9 +674,10 @@ replicate(x, arr) **Example** -``` +```sql SELECT replicate(1, ['a', 'b', 'c']) - +``` +```text ┌─replicate(1, ['a', 'b', 'c'])─┐ │ [1,1,1] │ └───────────────────────────────┘ @@ -704,7 +687,7 @@ SELECT replicate(1, ['a', 'b', 'c']) Returns the amount of remaining space in the filesystem where the files of the databases located. See the [path](../../operations/server_settings/settings.md#server_settings-path) server setting description. -``` +```sql filesystemAvailable() ``` @@ -756,7 +739,8 @@ custom_message - is an optional parameter: a constant string, provides an error ```sql SELECT throwIf(number = 3, 'Too many') FROM numbers(10); - +``` +```text ↙ Progress: 0.00 rows, 0.00 B (0.00 rows/s., 0.00 B/s.) Received exception from server (version 19.14.1): Code: 395. DB::Exception: Received from localhost:9000. DB::Exception: Too many. ``` @@ -767,7 +751,8 @@ Returns the same value that was used as its argument. ```sql SELECT identity(42) - +``` +```text ┌─identity(42)─┐ │ 42 │ └──────────────┘ diff --git a/docs/en/query_language/functions/rounding_functions.md b/docs/en/query_language/functions/rounding_functions.md index 2a52db6c865..ea2d899b13c 100644 --- a/docs/en/query_language/functions/rounding_functions.md +++ b/docs/en/query_language/functions/rounding_functions.md @@ -22,7 +22,7 @@ Rounds a value to a specified number of decimal places. The function returns the nearest number of the specified order. In case when given number has equal distance to surrounding numbers the function returns the number having the nearest even digit (banker's rounding). -``` +```sql round(expression [, decimal_places]) ``` @@ -42,10 +42,10 @@ The rounded number of the same type as the input number. 
**Example of use** -``` sql +```sql SELECT number / 2 AS x, round(x) FROM system.numbers LIMIT 3 ``` -``` +```text ┌───x─┬─round(divide(number, 2))─┐ │ 0 │ 0 │ │ 0.5 │ 0 │ @@ -57,7 +57,7 @@ SELECT number / 2 AS x, round(x) FROM system.numbers LIMIT 3 Rounding to the nearest number. -``` +```text round(3.2, 0) = 3 round(4.1267, 2) = 4.13 round(22,-1) = 20 @@ -67,7 +67,7 @@ round(-467,-2) = -500 Banker's rounding. -``` +```text round(3.5) = 4 round(4.5) = 4 round(3.55, 1) = 3.6 diff --git a/docs/en/query_language/functions/splitting_merging_functions.md b/docs/en/query_language/functions/splitting_merging_functions.md index 0e1cf98ee20..8c4e1ff8b45 100644 --- a/docs/en/query_language/functions/splitting_merging_functions.md +++ b/docs/en/query_language/functions/splitting_merging_functions.md @@ -20,9 +20,10 @@ Selects substrings of consecutive bytes from the ranges a-z and A-Z.Returns an a **Example:** -``` +```sql SELECT alphaTokens('abca1abc') - +``` +```text ┌─alphaTokens('abca1abc')─┐ │ ['abca','abc'] │ └─────────────────────────┘ diff --git a/docs/en/query_language/functions/string_functions.md b/docs/en/query_language/functions/string_functions.md index 1eca9c0e815..0f60749d307 100644 --- a/docs/en/query_language/functions/string_functions.md +++ b/docs/en/query_language/functions/string_functions.md @@ -64,7 +64,7 @@ Returns 1, if the set of bytes is valid UTF-8 encoded, otherwise 0. Replaces invalid UTF-8 characters by the `�` (U+FFFD) character. All running in a row invalid characters are collapsed into the one replacement character. -``` +```sql toValidUTF8( input_string ) ``` @@ -100,13 +100,16 @@ Formatting constant pattern with the string listed in the arguments. `pattern` i ```sql SELECT format('{1} {0} {1}', 'World', 'Hello') - +``` +```text ┌─format('{1} {0} {1}', 'World', 'Hello')─┐ │ Hello World Hello │ └─────────────────────────────────────────┘ - +``` +```sql SELECT format('{} {}', 'Hello', 'World') - +``` +```text ┌─format('{} {}', 'Hello', 'World')─┐ │ Hello World │ └───────────────────────────────────┘ diff --git a/docs/en/query_language/functions/string_replace_functions.md b/docs/en/query_language/functions/string_replace_functions.md index 19339dd474d..22c0a27bd41 100644 --- a/docs/en/query_language/functions/string_replace_functions.md +++ b/docs/en/query_language/functions/string_replace_functions.md @@ -19,7 +19,7 @@ Also keep in mind that a string literal requires an extra escape. Example 1. Converting the date to American format: -``` sql +```sql SELECT DISTINCT EventDate, replaceRegexpOne(toString(EventDate), '(\\d{4})-(\\d{2})-(\\d{2})', '\\2/\\3/\\1') AS res @@ -28,7 +28,7 @@ LIMIT 7 FORMAT TabSeparated ``` -``` +```text 2014-03-17 03/17/2014 2014-03-18 03/18/2014 2014-03-19 03/19/2014 @@ -40,11 +40,11 @@ FORMAT TabSeparated Example 2. Copying a string ten times: -``` sql +```sql SELECT replaceRegexpOne('Hello, World!', '.*', '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0') AS res ``` -``` +```text ┌─res────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World! │ └────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ @@ -54,11 +54,11 @@ SELECT replaceRegexpOne('Hello, World!', '.*', '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0') This does the same thing, but replaces all the occurrences. 
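In addition to the example below, one common use (sketched here with an arbitrary input string) is collapsing runs of whitespace; note the doubled backslashes required inside the string literal:

```sql
SELECT replaceRegexpAll('Hello,   World  !', '\\s+', ' ') AS res
```

```text
┌─res────────────┐
│ Hello, World ! │
└────────────────┘
```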
Example: -``` sql +```sql SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res ``` -``` +```text ┌─res────────────────────────┐ │ HHeelllloo,, WWoorrlldd!! │ └────────────────────────────┘ @@ -67,11 +67,11 @@ SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res As an exception, if a regular expression worked on an empty substring, the replacement is not made more than once. Example: -``` sql +```sql SELECT replaceRegexpAll('Hello, World!', '^', 'here: ') AS res ``` -``` +```text ┌─res─────────────────┐ │ here: Hello, World! │ └─────────────────────┘ diff --git a/docs/en/query_language/functions/type_conversion_functions.md b/docs/en/query_language/functions/type_conversion_functions.md index 3930c942462..9245ec00120 100644 --- a/docs/en/query_language/functions/type_conversion_functions.md +++ b/docs/en/query_language/functions/type_conversion_functions.md @@ -194,7 +194,7 @@ When converting dates with times to numbers or vice versa, the date with time co The date and date-with-time formats for the toDate/toDateTime functions are defined as follows: -``` +```text YYYY-MM-DD YYYY-MM-DD hh:mm:ss ``` @@ -207,13 +207,13 @@ Conversion between numeric types uses the same rules as assignments between diff Additionally, the toString function of the DateTime argument can take a second String argument containing the name of the time zone. Example: `Asia/Yekaterinburg` In this case, the time is formatted according to the specified time zone. -``` sql +```sql SELECT now() AS now_local, toString(now(), 'Asia/Yekaterinburg') AS now_yekat ``` -``` +```text ┌───────────now_local─┬─now_yekat───────────┐ │ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ └─────────────────────┴─────────────────────┘ @@ -232,21 +232,21 @@ Accepts a String or FixedString argument. Returns the String with the content tr Example: -``` sql +```sql SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut ``` -``` +```text ┌─s─────────────┬─s_cut─┐ │ foo\0\0\0\0\0 │ foo │ └───────────────┴───────┘ ``` -``` sql +```sql SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut ``` -``` +```text ┌─s──────────┬─s_cut─┐ │ foo\0bar\0 │ foo │ └────────────┴───────┘ @@ -278,7 +278,7 @@ Converts 'x' to the 't' data type. The syntax CAST(x AS t) is also supported. Example: -``` sql +```sql SELECT '2016-06-15 23:00:00' AS timestamp, CAST(timestamp AS DateTime) AS datetime, @@ -287,7 +287,7 @@ SELECT CAST(timestamp, 'FixedString(22)') AS fixed_string ``` -``` +```text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ └─────────────────────┴─────────────────────┴────────────┴─────────────────────┴───────────────────────────┘ @@ -297,16 +297,19 @@ Conversion to FixedString(N) only works for arguments of type String or FixedStr Type conversion to [Nullable](../../data_types/nullable.md) and back is supported. 
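The example below shows the forward direction; the reverse direction (from `Nullable` back to a plain type) can be sketched on a constant. This query is illustrative and assumes the value is not `NULL`:

```sql
SELECT toTypeName(CAST(CAST(1, 'Nullable(UInt16)'), 'UInt16')) AS t
```

```text
┌─t──────┐
│ UInt16 │
└────────┘
```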
Example: -``` +```sql SELECT toTypeName(x) FROM t_null - +``` +```text ┌─toTypeName(x)─┐ │ Int8 │ │ Int8 │ └───────────────┘ - +``` +```sql SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null - +``` +```text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ │ Nullable(UInt16) │ @@ -328,7 +331,7 @@ SELECT date + interval_to_week ``` -``` +```text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ └───────────────────────────┴──────────────────────────────┘ diff --git a/docs/en/query_language/functions/url_functions.md b/docs/en/query_language/functions/url_functions.md index f4ff63ce021..43b92633653 100644 --- a/docs/en/query_language/functions/url_functions.md +++ b/docs/en/query_language/functions/url_functions.md @@ -16,7 +16,7 @@ Examples of typical returned values: http, https, ftp, mailto, tel, magnet... Extracts the hostname from a URL. -``` +```sql domain(url) ``` @@ -27,7 +27,7 @@ domain(url) The URL can be specified with or without a scheme. Examples: -``` +```text svn+ssh://some.svn-hosting.com:80/repo/trunk some.svn-hosting.com:80/repo/trunk https://yandex.com/time/ @@ -35,7 +35,7 @@ https://yandex.com/time/ For these examples, the `domain` function returns the following results: -``` +```text some.svn-hosting.com some.svn-hosting.com yandex.com @@ -67,7 +67,7 @@ Returns the domain and removes no more than one 'www.' from the beginning of it, Extracts the the top-level domain from a URL. -``` +```sql topLevelDomain(url) ``` @@ -77,7 +77,7 @@ topLevelDomain(url) The URL can be specified with or without a scheme. Examples: -``` +```text svn+ssh://some.svn-hosting.com:80/repo/trunk some.svn-hosting.com:80/repo/trunk https://yandex.com/time/ @@ -151,7 +151,7 @@ Returns an array containing the URL, truncated at the end by the symbols /,? in The same as above, but without the protocol and host in the result. The / element (root) is not included. Example: the function is used to implement tree reports the URL in Yandex. Metric. -``` +```text URLPathHierarchy('https://example.com/browse/CONV-6788') = [ '/browse/', @@ -164,11 +164,11 @@ URLPathHierarchy('https://example.com/browse/CONV-6788') = Returns the decoded URL. Example: -``` sql +```sql SELECT decodeURLComponent('http://127.0.0.1:8123/?query=SELECT%201%3B') AS DecodedURL; ``` -``` +```text ┌─DecodedURL─────────────────────────────┐ │ http://127.0.0.1:8123/?query=SELECT 1; │ └────────────────────────────────────────┘ diff --git a/docs/en/query_language/functions/uuid_functions.md b/docs/en/query_language/functions/uuid_functions.md index e353ed58f86..2ab89dabe9f 100644 --- a/docs/en/query_language/functions/uuid_functions.md +++ b/docs/en/query_language/functions/uuid_functions.md @@ -18,13 +18,14 @@ The UUID type value. This example demonstrates creating a table with the UUID type column and inserting a value into the table. -``` sql -:) CREATE TABLE t_uuid (x UUID) ENGINE=TinyLog +```sql +CREATE TABLE t_uuid (x UUID) ENGINE=TinyLog -:) INSERT INTO t_uuid SELECT generateUUIDv4() - -:) SELECT * FROM t_uuid +INSERT INTO t_uuid SELECT generateUUIDv4() +SELECT * FROM t_uuid +``` +```text ┌────────────────────────────────────x─┐ │ f4bf890f-f9dc-4332-ad5c-0c18e73f28e9 │ └──────────────────────────────────────┘ @@ -44,9 +45,10 @@ The UUID type value. 
**Usage example** -``` sql -:) SELECT toUUID('61f0c404-5cb3-11e7-907b-a6006ad3dba0') AS uuid - +```sql +SELECT toUUID('61f0c404-5cb3-11e7-907b-a6006ad3dba0') AS uuid +``` +```text ┌─────────────────────────────────uuid─┐ │ 61f0c404-5cb3-11e7-907b-a6006ad3dba0 │ └──────────────────────────────────────┘ @@ -56,7 +58,7 @@ The UUID type value. Accepts a string containing 36 characters in the format `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`, and returns it as a set of bytes in a [FixedString(16)](../../data_types/fixedstring.md). -``` sql +```sql UUIDStringToNum(String) ``` @@ -66,10 +68,12 @@ FixedString(16) **Usage examples** -``` sql -:) SELECT +```sql +SELECT '612f3c40-5d3b-217e-707b-6a546a3d7b29' AS uuid, UUIDStringToNum(uuid) AS bytes +``` +```text ┌─uuid─────────────────────────────────┬─bytes────────────┐ │ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ a/<@];!~p{jTj={) │ @@ -80,7 +84,7 @@ FixedString(16) Accepts a [FixedString(16)](../../data_types/fixedstring.md) value, and returns a string containing 36 characters in text format. -``` sql +```sql UUIDNumToString(FixedString(16)) ``` @@ -90,11 +94,12 @@ String. **Usage example** -``` sql +```sql SELECT 'a/<@];!~p{jTj={)' AS bytes, UUIDNumToString(toFixedString(bytes, 16)) AS uuid - +``` +```text ┌─bytes────────────┬─uuid─────────────────────────────────┐ │ a/<@];!~p{jTj={) │ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ └──────────────────┴──────────────────────────────────────┘ diff --git a/docs/en/query_language/functions/ym_dict_functions.md b/docs/en/query_language/functions/ym_dict_functions.md index 8039c77edea..b04c16dab33 100644 --- a/docs/en/query_language/functions/ym_dict_functions.md +++ b/docs/en/query_language/functions/ym_dict_functions.md @@ -20,7 +20,7 @@ All the dictionaries are re-loaded in runtime (once every certain number of seco All functions for working with regions have an optional argument at the end – the dictionary key. It is referred to as the geobase. Example: -``` +```sql regionToCountry(RegionID) – Uses the default dictionary: /opt/geo/regions_hierarchy.txt regionToCountry(RegionID, '') – Uses the default dictionary: /opt/geo/regions_hierarchy.txt regionToCountry(RegionID, 'ua') – Uses the dictionary for the 'ua' key: /opt/geo/regions_hierarchy_ua.txt @@ -34,13 +34,13 @@ Accepts a UInt32 number – the region ID from the Yandex geobase. If this regio Converts a region to an area (type 5 in the geobase). In every other way, this function is the same as 'regionToCity'. -``` sql +```sql SELECT DISTINCT regionToName(regionToArea(toUInt32(number), 'ua')) FROM system.numbers LIMIT 15 ``` -``` +```text ┌─regionToName(regionToArea(toUInt32(number), \'ua\'))─┐ │ │ │ Moscow and Moscow region │ @@ -64,13 +64,13 @@ LIMIT 15 Converts a region to a federal district (type 4 in the geobase). In every other way, this function is the same as 'regionToCity'. -``` sql +```sql SELECT DISTINCT regionToName(regionToDistrict(toUInt32(number), 'ua')) FROM system.numbers LIMIT 15 ``` -``` +```text ┌─regionToName(regionToDistrict(toUInt32(number), \'ua\'))─┐ │ │ │ Central federal district │ diff --git a/docs/en/query_language/insert_into.md b/docs/en/query_language/insert_into.md index d954a470847..2b361fd5a18 100644 --- a/docs/en/query_language/insert_into.md +++ b/docs/en/query_language/insert_into.md @@ -5,7 +5,7 @@ Adding data. Basic query format: -``` sql +```sql INSERT INTO [db.]table [(c1, c2, c3)] VALUES (v11, v12, v13), (v21, v22, v23), ... 
``` @@ -18,13 +18,13 @@ If [strict_insert_defaults=1](../operations/settings/settings.md), columns that Data can be passed to the INSERT in any [format](../interfaces/formats.md#formats) supported by ClickHouse. The format must be specified explicitly in the query: -``` sql +```sql INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format_name data_set ``` For example, the following query format is identical to the basic version of INSERT ... VALUES: -``` sql +```sql INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ... ``` @@ -32,7 +32,7 @@ ClickHouse removes all spaces and one line feed (if there is one) before the dat Example: -``` sql +```sql INSERT INTO t FORMAT TabSeparated 11 Hello, world! 22 Qwerty @@ -46,7 +46,7 @@ If table has [constraints](create.md#constraints), their expressions will be che ### Inserting The Results of `SELECT` {#insert_query_insert-select} -``` sql +```sql INSERT INTO [db.]table [(c1, c2, c3)] SELECT ... ``` diff --git a/docs/en/query_language/misc.md b/docs/en/query_language/misc.md index 337049d6624..3a2fa03100e 100644 --- a/docs/en/query_language/misc.md +++ b/docs/en/query_language/misc.md @@ -10,7 +10,7 @@ After executing an ATTACH query, the server will know about the existence of the If the table was previously detached (``DETACH``), meaning that its structure is known, you can use shorthand without defining the structure. -``` sql +```sql ATTACH TABLE [IF NOT EXISTS] [db.]name [ON CLUSTER cluster] ``` @@ -20,7 +20,7 @@ This query is used when starting the server. The server stores table metadata as Checks if the data in the table is corrupted. -``` sql +```sql CHECK TABLE [db.]name ``` @@ -56,7 +56,7 @@ If the table is corrupted, you can copy the non-corrupted data to another table. ## DESCRIBE TABLE {#misc-describe-table} -``` sql +```sql DESC|DESCRIBE TABLE [db.]table [INTO OUTFILE filename] [FORMAT format] ``` @@ -74,7 +74,7 @@ Nested data structures are output in "expanded" format. Each column is shown sep Deletes information about the 'name' table from the server. The server stops knowing about the table's existence. -``` sql +```sql DETACH TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] ``` @@ -87,14 +87,14 @@ There is no `DETACH DATABASE` query. This query has two types: `DROP DATABASE` and `DROP TABLE`. -``` sql +```sql DROP DATABASE [IF EXISTS] db [ON CLUSTER cluster] ``` Deletes all tables inside the 'db' database, then deletes the 'db' database itself. If `IF EXISTS` is specified, it doesn't return an error if the database doesn't exist. -``` sql +```sql DROP [TEMPORARY] TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] ``` @@ -103,7 +103,7 @@ If `IF EXISTS` is specified, it doesn't return an error if the table doesn't exi ## EXISTS -``` sql +```sql EXISTS [TEMPORARY] TABLE [db.]name [INTO OUTFILE filename] [FORMAT format] ``` @@ -111,7 +111,7 @@ Returns a single `UInt8`-type column, which contains the single value `0` if the ## KILL QUERY -``` sql +```sql KILL QUERY [ON CLUSTER cluster] WHERE [SYNC|ASYNC|TEST] @@ -123,7 +123,7 @@ The queries to terminate are selected from the system.processes table using the Examples: -``` sql +```sql -- Forcibly terminates all queries with the specified query_id: KILL QUERY WHERE query_id='2-857d-4a57-9ee0-327da5d60a90' @@ -173,7 +173,7 @@ Changes already made by the mutation are not rolled back. 
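A brief sketch of such a cancellation, assuming `KILL MUTATION` accepts a `WHERE` filter over the `system.mutations` table in the same way `KILL QUERY` filters `system.processes` (the database, table and mutation_id values below are placeholders, not taken from this document):

```sql
-- Cancel and remove all mutations of a single (placeholder) table:
KILL MUTATION WHERE database = 'default' AND table = 'table'

-- Cancel one specific mutation by its (placeholder) id:
KILL MUTATION WHERE database = 'default' AND table = 'table' AND mutation_id = 'mutation_3.txt'
```
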
## OPTIMIZE {#misc_operations-optimize} -``` sql +```sql OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition] [FINAL] ``` @@ -192,7 +192,7 @@ When `OPTIMIZE` is used with [ReplicatedMergeTree](../operations/table_engines/r Renames one or more tables. -``` sql +```sql RENAME TABLE [db11.]name11 TO [db12.]name12, [db21.]name21 TO [db22.]name22, ... [ON CLUSTER cluster] ``` @@ -216,7 +216,7 @@ For more information, see [Settings](../operations/settings/settings.md). ## SHOW CREATE TABLE -``` sql +```sql SHOW CREATE [TEMPORARY] TABLE [db.]table [INTO OUTFILE filename] [FORMAT format] ``` @@ -224,7 +224,7 @@ Returns a single `String`-type 'statement' column, which contains a single value ## SHOW DATABASES {#show-databases} -``` sql +```sql SHOW DATABASES [INTO OUTFILE filename] [FORMAT format] ``` @@ -235,7 +235,7 @@ See also the section "Formats". ## SHOW PROCESSLIST -``` sql +```sql SHOW PROCESSLIST [INTO OUTFILE filename] [FORMAT format] ``` @@ -262,12 +262,12 @@ This query is nearly identical to: `SELECT * FROM system.processes`. The differe Tip (execute in the console): ```bash -watch -n1 "clickhouse-client --query='SHOW PROCESSLIST'" +$ watch -n1 "clickhouse-client --query='SHOW PROCESSLIST'" ``` ## SHOW TABLES -``` sql +```sql SHOW [TEMPORARY] TABLES [FROM db] [LIKE 'pattern'] [INTO OUTFILE filename] [FORMAT format] ``` @@ -282,7 +282,7 @@ See also the section "LIKE operator". ## TRUNCATE -``` sql +```sql TRUNCATE TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] ``` @@ -292,7 +292,7 @@ The `TRUNCATE` query is not supported for [View](../operations/table_engines/vie ## USE -``` sql +```sql USE db ``` diff --git a/docs/en/query_language/operators.md b/docs/en/query_language/operators.md index 4e79c9e805f..534336ca0a9 100644 --- a/docs/en/query_language/operators.md +++ b/docs/en/query_language/operators.md @@ -67,7 +67,7 @@ Groups of operators are listed in order of priority (the higher it is in the lis ## Operator for Working With Dates and Times {#operators-datetime} -``` sql +```sql EXTRACT(part FROM date); ``` @@ -88,7 +88,7 @@ The `date` parameter specifies the date or the time to process. Either [Date](.. Examples: -``` sql +```sql SELECT EXTRACT(DAY FROM toDate('2017-06-15')); SELECT EXTRACT(MONTH FROM toDate('2017-06-15')); SELECT EXTRACT(YEAR FROM toDate('2017-06-15')); @@ -96,7 +96,7 @@ SELECT EXTRACT(YEAR FROM toDate('2017-06-15')); In the following example we create a table and insert into it a value with the `DateTime` type. -``` sql +```sql CREATE TABLE test.Orders ( OrderId UInt64, @@ -106,10 +106,10 @@ CREATE TABLE test.Orders ENGINE = Log; ``` -``` sql +```sql INSERT INTO test.Orders VALUES (1, 'Jarlsberg Cheese', toDateTime('2008-10-11 13:23:44')); ``` -``` sql +```sql SELECT toYear(OrderDate) AS OrderYear, toMonth(OrderDate) AS OrderMonth, @@ -118,6 +118,8 @@ SELECT toMinute(OrderDate) AS OrderMinute, toSecond(OrderDate) AS OrderSecond FROM test.Orders; +``` +```text ┌─OrderYear─┬─OrderMonth─┬─OrderDay─┬─OrderHour─┬─OrderMinute─┬─OrderSecond─┐ │ 2008 │ 10 │ 11 │ 13 │ 23 │ 44 │ @@ -148,7 +150,7 @@ The conditional operator calculates the values of b and c, then checks whether c ## Conditional Expression {#operator_case} -``` sql +```sql CASE [x] WHEN a THEN b [WHEN ... THEN ...] @@ -198,18 +200,13 @@ ClickHouse supports the `IS NULL` and `IS NOT NULL` operators. - `0` otherwise. - For other values, the `IS NULL` operator always returns `0`. 
-```bash -:) SELECT x+100 FROM t_null WHERE y IS NULL - -SELECT x + 100 -FROM t_null -WHERE isNull(y) - +```sql +SELECT x+100 FROM t_null WHERE y IS NULL +``` +```text ┌─plus(x, 100)─┐ │ 101 │ └──────────────┘ - -1 rows in set. Elapsed: 0.002 sec. ``` @@ -220,18 +217,13 @@ WHERE isNull(y) - `1` otherwise. - For other values, the `IS NOT NULL` operator always returns `1`. -```bash -:) SELECT * FROM t_null WHERE y IS NOT NULL - -SELECT * -FROM t_null -WHERE isNotNull(y) - +```sql +SELECT * FROM t_null WHERE y IS NOT NULL +``` +```text ┌─x─┬─y─┐ │ 2 │ 3 │ └───┴───┘ - -1 rows in set. Elapsed: 0.002 sec. ``` [Original article](https://clickhouse.yandex/docs/en/query_language/operators/) diff --git a/docs/en/query_language/select.md b/docs/en/query_language/select.md index afa2df73c3d..5310b6dfa12 100644 --- a/docs/en/query_language/select.md +++ b/docs/en/query_language/select.md @@ -2,7 +2,7 @@ `SELECT` performs data retrieval. -``` sql +```sql [WITH expr_list|(subquery)] SELECT [DISTINCT] expr_list [FROM [db.]table | (subquery) | table_function] [FINAL] @@ -35,7 +35,7 @@ This section provides support for Common Table Expressions ([CTE](https://en.wik Results of WITH clause expressions can be used inside SELECT clause. Example 1: Using constant expression as "variable" -``` +```sql WITH '2019-08-01 15:23:00' as ts_upper_bound SELECT * FROM hits @@ -45,7 +45,7 @@ WHERE ``` Example 2: Evicting sum(bytes) expression result from SELECT clause column list -``` +```sql WITH sum(bytes) as s SELECT formatReadableSize(s), @@ -56,7 +56,7 @@ ORDER BY s ``` Example 3: Using results of scalar subquery -``` +```sql /* this example would return TOP 10 of most huge tables */ WITH ( @@ -75,7 +75,7 @@ LIMIT 10 Example 4: Re-using expression in subquery As a workaround for current limitation for expression usage in subqueries, you may duplicate it. -``` +```sql WITH ['hello'] AS hello SELECT hello, @@ -85,7 +85,8 @@ FROM WITH ['hello'] AS hello SELECT hello ) - +``` +```text ┌─hello─────┬─hello─────┐ │ ['hello'] │ ['hello'] │ └───────────┴───────────┘ @@ -149,7 +150,7 @@ Here `k` is the number from 0 to 1 (both fractional and decimal notations are su In a `SAMPLE k` clause, the sample is taken from the `k` fraction of data. The example is shown below: -``` sql +```sql SELECT Title, count() * 10 AS PageViews @@ -177,27 +178,27 @@ The `_sample_factor` column contains relative coefficients that are calculated d Let's consider the table `visits`, which contains the statistics about site visits. The first example shows how to calculate the number of page views: -``` sql +```sql SELECT sum(PageViews * _sample_factor) FROM visits SAMPLE 10000000 -``` +``` The next example shows how to calculate the total number of visits: -``` sql +```sql SELECT sum(_sample_factor) FROM visits SAMPLE 10000000 -``` +``` The example below shows how to calculate the average session duration. Note that you don't need to use the relative coefficient to calculate the average values. -``` sql +```sql SELECT avg(Duration) FROM visits SAMPLE 10000000 -``` +``` #### SAMPLE k OFFSET m {#select-sample-offset} @@ -205,7 +206,7 @@ Here `k` and `m` are numbers from 0 to 1. Examples are shown below. **Example 1** -``` sql +```sql SAMPLE 1/10 ``` @@ -215,7 +216,7 @@ In this example, the sample is 1/10th of all data: **Example 2** -``` sql +```sql SAMPLE 1/10 OFFSET 1/2 ``` @@ -227,7 +228,7 @@ Here, a sample of 10% is taken from the second half of the data. Allows executing `JOIN` with an array or nested data structure. 
The intent is similar to the [arrayJoin](functions/array_join.md#functions_arrayjoin) function, but its functionality is broader. -``` sql +```sql SELECT FROM [LEFT] ARRAY JOIN @@ -246,7 +247,7 @@ Supported types of `ARRAY JOIN` are listed below: The examples below demonstrate the usage of the `ARRAY JOIN` and `LEFT ARRAY JOIN` clauses. Let's create a table with an [Array](../data_types/array.md) type column and insert values into it: -``` sql +```sql CREATE TABLE arrays_test ( s String, @@ -256,7 +257,7 @@ CREATE TABLE arrays_test INSERT INTO arrays_test VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []); ``` -``` +```text ┌─s───────────┬─arr─────┐ │ Hello │ [1,2] │ │ World │ [3,4,5] │ @@ -266,12 +267,12 @@ VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []); The example below uses the `ARRAY JOIN` clause: -``` sql +```sql SELECT s, arr FROM arrays_test ARRAY JOIN arr; ``` -``` +```text ┌─s─────┬─arr─┐ │ Hello │ 1 │ │ Hello │ 2 │ @@ -283,12 +284,12 @@ ARRAY JOIN arr; The next example uses the `LEFT ARRAY JOIN` clause: -``` sql +```sql SELECT s, arr FROM arrays_test LEFT ARRAY JOIN arr; ``` -``` +```text ┌─s───────────┬─arr─┐ │ Hello │ 1 │ │ Hello │ 2 │ @@ -303,13 +304,13 @@ LEFT ARRAY JOIN arr; An alias can be specified for an array in the `ARRAY JOIN` clause. In this case, an array item can be accessed by this alias, but the array itself is accessed by the original name. Example: -``` sql +```sql SELECT s, arr, a FROM arrays_test ARRAY JOIN arr AS a; ``` -``` +```text ┌─s─────┬─arr─────┬─a─┐ │ Hello │ [1,2] │ 1 │ │ Hello │ [1,2] │ 2 │ @@ -321,13 +322,13 @@ ARRAY JOIN arr AS a; Using aliases, you can perform `ARRAY JOIN` with an external array. For example: -``` sql +```sql SELECT s, arr_external FROM arrays_test ARRAY JOIN [1, 2, 3] AS arr_external; ``` -``` +```text ┌─s───────────┬─arr_external─┐ │ Hello │ 1 │ │ Hello │ 2 │ @@ -343,13 +344,13 @@ ARRAY JOIN [1, 2, 3] AS arr_external; Multiple arrays can be comma-separated in the `ARRAY JOIN` clause. In this case, `JOIN` is performed with them simultaneously (the direct sum, not the cartesian product). Note that all the arrays must have the same size. Example: -``` sql +```sql SELECT s, arr, a, num, mapped FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS mapped; ``` -``` +```text ┌─s─────┬─arr─────┬─a─┬─num─┬─mapped─┐ │ Hello │ [1,2] │ 1 │ 1 │ 2 │ │ Hello │ [1,2] │ 2 │ 2 │ 3 │ @@ -361,13 +362,13 @@ ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS ma The example below uses the [arrayEnumerate](functions/array_functions.md#array_functions-arrayenumerate) function: -``` sql +```sql SELECT s, arr, a, num, arrayEnumerate(arr) FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num; ``` -``` +```text ┌─s─────┬─arr─────┬─a─┬─num─┬─arrayEnumerate(arr)─┐ │ Hello │ [1,2] │ 1 │ 1 │ [1,2] │ │ Hello │ [1,2] │ 2 │ 2 │ [1,2] │ @@ -381,7 +382,7 @@ ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num; `ARRAY `JOIN`` also works with [nested data structures](../data_types/nested_data_structures/nested.md). 
Example: -``` sql +```sql CREATE TABLE nested_test ( s String, @@ -394,7 +395,7 @@ INSERT INTO nested_test VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [], []); ``` -``` +```text ┌─s───────┬─nest.x──┬─nest.y─────┐ │ Hello │ [1,2] │ [10,20] │ │ World │ [3,4,5] │ [30,40,50] │ @@ -402,13 +403,13 @@ VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [] └─────────┴─────────┴────────────┘ ``` -``` sql +```sql SELECT s, `nest.x`, `nest.y` FROM nested_test ARRAY JOIN nest; ``` -``` +```text ┌─s─────┬─nest.x─┬─nest.y─┐ │ Hello │ 1 │ 10 │ │ Hello │ 2 │ 20 │ @@ -420,13 +421,13 @@ ARRAY JOIN nest; When specifying names of nested data structures in `ARRAY JOIN`, the meaning is the same as `ARRAY JOIN` with all the array elements that it consists of. Examples are listed below: -``` sql +```sql SELECT s, `nest.x`, `nest.y` FROM nested_test ARRAY JOIN `nest.x`, `nest.y`; ``` -``` +```text ┌─s─────┬─nest.x─┬─nest.y─┐ │ Hello │ 1 │ 10 │ │ Hello │ 2 │ 20 │ @@ -438,13 +439,13 @@ ARRAY JOIN `nest.x`, `nest.y`; This variation also makes sense: -``` sql +```sql SELECT s, `nest.x`, `nest.y` FROM nested_test ARRAY JOIN `nest.x`; ``` -``` +```text ┌─s─────┬─nest.x─┬─nest.y─────┐ │ Hello │ 1 │ [10,20] │ │ Hello │ 2 │ [10,20] │ @@ -456,13 +457,13 @@ ARRAY JOIN `nest.x`; An alias may be used for a nested data structure, in order to select either the `JOIN` result or the source array. Example: -``` sql +```sql SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y` FROM nested_test ARRAY JOIN nest AS n; ``` -``` +```text ┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┐ │ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ │ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ @@ -474,13 +475,13 @@ ARRAY JOIN nest AS n; Example of using the [arrayEnumerate](functions/array_functions.md#array_functions-arrayenumerate) function: -``` sql +```sql SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`, num FROM nested_test ARRAY JOIN nest AS n, arrayEnumerate(`nest.x`) AS num; ``` -``` +```text ┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┬─num─┐ │ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ 1 │ │ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ 2 │ @@ -497,7 +498,7 @@ Joins the data in the normal [SQL JOIN](https://en.wikipedia.org/wiki/Join_(SQL) !!! info "Note" Not related to [ARRAY JOIN](#select-array-join-clause). -``` sql +```sql SELECT FROM [GLOBAL] [ANY|ALL] [INNER|LEFT|RIGHT|FULL|CROSS] [OUTER] JOIN @@ -524,13 +525,13 @@ If a query contains the `WHERE` clause, ClickHouse tries to pushdown filters fro We recommend the `JOIN ON` or `JOIN USING` syntax for creating queries. For example: -``` +```sql SELECT * FROM t1 JOIN t2 ON t1.a = t2.a JOIN t3 ON t1.a = t3.a ``` You can use comma-separated lists of tables in the `FROM` clause. This works only with the [allow_experimental_cross_to_join_conversion = 1](../operations/settings/settings.md#settings-allow_experimental_cross_to_join_conversion) setting. 
For example: -``` +```sql SELECT * FROM t1, t2, t3 WHERE t1.a = t2.a AND t1.a = t3.a ``` @@ -576,7 +577,7 @@ You can use the following types of syntax: For example, consider the following tables: -``` +```text table_1 table_2 event | ev_time | user_id event | ev_time | user_id @@ -610,7 +611,7 @@ When running a `JOIN`, there is no optimization of the order of execution in rel Example: -``` sql +```sql SELECT CounterID, hits, @@ -634,7 +635,7 @@ ORDER BY hits DESC LIMIT 10 ``` -``` +```text ┌─CounterID─┬───hits─┬─visits─┐ │ 1143050 │ 523264 │ 13665 │ │ 731962 │ 475698 │ 102716 │ @@ -724,7 +725,7 @@ If a query contains only table columns inside aggregate functions, the GROUP BY Example: -``` sql +```sql SELECT count(), median(FetchTiming > 60 ? 60 : FetchTiming), @@ -738,7 +739,7 @@ As opposed to MySQL (and conforming to standard SQL), you can't get some value o Example: -``` sql +```sql SELECT domainWithoutWWW(URL) AS domain, count(), @@ -761,7 +762,7 @@ Here's an example to show what this means. Assume you have this table: -``` +```text ┌─x─┬────y─┐ │ 1 │ 2 │ │ 2 │ ᴺᵁᴸᴸ │ @@ -773,7 +774,7 @@ Assume you have this table: The query `SELECT sum(x), y FROM t_null_big GROUP BY y` results in: -``` +```text ┌─sum(x)─┬────y─┐ │ 4 │ 2 │ │ 3 │ 3 │ @@ -877,7 +878,7 @@ The `SELECT * FROM limit_by ORDER BY id, val LIMIT 2 OFFSET 1 BY id` query retur The following query returns the top 5 referrers for each `domain, device_type` pair with a maximum of 100 rows in total (`LIMIT n BY + LIMIT`). -``` sql +```sql SELECT domainWithoutWWW(URL) AS domain, domainWithoutWWW(REFERRER_URL) AS referrer, @@ -918,7 +919,7 @@ Example: For the table -``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 2 │ @@ -935,7 +936,7 @@ For the table Run the query `SELECT * FROM t_null_nan ORDER BY y NULLS FIRST` to get: -``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 7 │ ᴺᵁᴸᴸ │ @@ -1031,7 +1032,7 @@ If there isn't an `ORDER BY` clause that explicitly sorts results, the result ma You can use UNION ALL to combine any number of queries. Example: -``` sql +```sql SELECT CounterID, 1 AS table, toInt64(count()) AS c FROM test.hits GROUP BY CounterID @@ -1078,7 +1079,7 @@ The left side of the operator is either a single column or a tuple. Examples: -``` sql +```sql SELECT UserID IN (123, 456) FROM ... SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... ``` @@ -1096,7 +1097,7 @@ If the right side of the operator is a table name that has the Set engine (a pre The subquery may specify more than one column for filtering tuples. Example: -``` sql +```sql SELECT (CounterID, UserID) IN (SELECT CounterID, UserID FROM ...) FROM ... ``` @@ -1105,7 +1106,7 @@ The columns to the left and right of the IN operator should have the same type. The IN operator and subquery may occur in any part of the query, including in aggregate functions and lambda functions. 
Example: -``` sql +```sql SELECT EventDate, avg(UserID IN @@ -1119,7 +1120,7 @@ GROUP BY EventDate ORDER BY EventDate ASC ``` -``` +```text ┌──EventDate─┬────ratio─┐ │ 2014-03-17 │ 1 │ │ 2014-03-18 │ 0.807696 │ @@ -1140,7 +1141,7 @@ During request processing, the IN operator assumes that the result of an operati Here is an example with the `t_null` table: -``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -1149,7 +1150,7 @@ Here is an example with the `t_null` table: Running the query `SELECT x FROM t_null WHERE y IN (NULL,3)` gives you the following result: -``` +```text ┌─x─┐ │ 2 │ └───┘ @@ -1157,10 +1158,11 @@ Running the query `SELECT x FROM t_null WHERE y IN (NULL,3)` gives you the follo You can see that the row in which `y = NULL` is thrown out of the query results. This is because ClickHouse can't decide whether `NULL` is included in the `(NULL,3)` set, returns `0` as the result of the operation, and `SELECT` excludes this row from the final output. -``` +```sql SELECT y IN (NULL, 3) FROM t_null - +``` +```text ┌─in(y, tuple(NULL, 3))─┐ │ 0 │ │ 1 │ @@ -1189,13 +1191,13 @@ For a query to the **distributed_table**, the query will be sent to all the remo For example, the query -``` sql +```sql SELECT uniq(UserID) FROM distributed_table ``` will be sent to all remote servers as -``` sql +```sql SELECT uniq(UserID) FROM local_table ``` @@ -1203,7 +1205,7 @@ and run on each of them in parallel, until it reaches the stage where intermedia Now let's examine a query with IN: -``` sql +```sql SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34) ``` @@ -1211,7 +1213,7 @@ SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID I This query will be sent to all remote servers as -``` sql +```sql SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34) ``` @@ -1221,19 +1223,19 @@ This will work correctly and optimally if you are prepared for this case and hav To correct how the query works when data is spread randomly across the cluster servers, you could specify **distributed_table** inside a subquery. The query would look like this: -``` sql +```sql SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) ``` This query will be sent to all remote servers as -``` sql +```sql SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) ``` The subquery will begin running on each remote server. Since the subquery uses a distributed table, the subquery that is on each remote server will be resent to every remote server as -``` sql +```sql SELECT UserID FROM local_table WHERE CounterID = 34 ``` @@ -1241,19 +1243,19 @@ For example, if you have a cluster of 100 servers, executing the entire query wi In such cases, you should always use GLOBAL IN instead of IN. Let's look at how it works for the query -``` sql +```sql SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID GLOBAL IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) ``` The requestor server will run the subquery -``` sql +```sql SELECT UserID FROM distributed_table WHERE CounterID = 34 ``` and the result will be put in a temporary table in RAM. 
Then the request will be sent to each remote server as -``` sql +```sql SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID GLOBAL IN _data1 ``` diff --git a/docs/en/query_language/syntax.md b/docs/en/query_language/syntax.md index 9c6e69f51bd..a67a0983961 100644 --- a/docs/en/query_language/syntax.md +++ b/docs/en/query_language/syntax.md @@ -4,7 +4,7 @@ There are two types of parsers in the system: the full SQL parser (a recursive d In all cases except the `INSERT` query, only the full SQL parser is used. The `INSERT` query uses both parsers: -``` sql +```sql INSERT INTO t VALUES (1, 'Hello, world'), (2, 'abc'), (3, 'def') ``` @@ -112,7 +112,7 @@ Data types and table engines in the `CREATE` query are written the same way as i An alias is a user-defined name for an expression in a query. -``` +```sql expr AS alias ``` @@ -140,7 +140,7 @@ If an alias is defined for the result columns in the `SELECT` clause of a subque Be careful with aliases that are the same as column or table names. Let's consider the following example: -``` +```sql CREATE TABLE t ( a Int, @@ -149,12 +149,13 @@ CREATE TABLE t ENGINE = TinyLog() ``` -``` +```sql SELECT argMax(a, b), sum(b) AS b FROM t - +``` +```text Received exception from server (version 18.14.17): Code: 184. DB::Exception: Received from localhost:9000, 127.0.0.1. DB::Exception: Aggregate function sum(b) is found inside another aggregate function in query. ``` diff --git a/docs/en/query_language/table_functions/file.md b/docs/en/query_language/table_functions/file.md index 0cb1f0d36bf..2e8bdc82017 100644 --- a/docs/en/query_language/table_functions/file.md +++ b/docs/en/query_language/table_functions/file.md @@ -3,7 +3,7 @@ Creates a table from a file. -``` +```sql file(path, format, structure) ``` @@ -39,14 +39,14 @@ FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 2 ``` -``` +```text ┌─column1─┬─column2─┬─column3─┐ │ 1 │ 2 │ 3 │ │ 3 │ 2 │ 1 │ └─────────┴─────────┴─────────┘ ``` -``` sql +```sql -- getting the first 10 lines of a table that contains 3 columns of UInt32 type from a CSV file SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 10 ``` diff --git a/docs/en/query_language/table_functions/hdfs.md b/docs/en/query_language/table_functions/hdfs.md index cce9b308101..1db9f21fefd 100644 --- a/docs/en/query_language/table_functions/hdfs.md +++ b/docs/en/query_language/table_functions/hdfs.md @@ -3,7 +3,7 @@ Creates a table from a file in HDFS. 
-``` +```sql hdfs(URI, format, structure) ``` @@ -27,7 +27,7 @@ FROM hdfs('hdfs://hdfs1:9000/test', 'TSV', 'column1 UInt32, column2 UInt32, colu LIMIT 2 ``` -``` +```text ┌─column1─┬─column2─┬─column3─┐ │ 1 │ 2 │ 3 │ │ 3 │ 2 │ 1 │ diff --git a/docs/en/query_language/table_functions/input.md b/docs/en/query_language/table_functions/input.md index 27cbaa4abbd..408ff7939ac 100644 --- a/docs/en/query_language/table_functions/input.md +++ b/docs/en/query_language/table_functions/input.md @@ -23,13 +23,13 @@ and data in `data.csv` has a different structure `(col1 String, col2 Date, col3 data from the `data.csv` into the `test` table with simultaneous conversion looks like this: ```bash -cat data.csv | clickhouse-client --query="INSERT INTO test SELECT lower(col1), col3 * col3 FROM input('col1 String, col2 Date, col3 Int32') FORMAT CSV"; +$ cat data.csv | clickhouse-client --query="INSERT INTO test SELECT lower(col1), col3 * col3 FROM input('col1 String, col2 Date, col3 Int32') FORMAT CSV"; ``` - If `data.csv` contains data of the same structure `test_structure` as the table `test` then these two queries are equal: ```bash -cat data.csv | clickhouse-client --query="INSERT INTO test FORMAT CSV" -cat data.csv | clickhouse-client --query="INSERT INTO test SELECT * FROM input('test_structure') FORMAT CSV" +$ cat data.csv | clickhouse-client --query="INSERT INTO test FORMAT CSV" +$ cat data.csv | clickhouse-client --query="INSERT INTO test SELECT * FROM input('test_structure') FORMAT CSV" ``` [Original article](https://clickhouse.yandex/docs/en/query_language/table_functions/input/) diff --git a/docs/en/query_language/table_functions/jdbc.md b/docs/en/query_language/table_functions/jdbc.md index c7108a5af9f..03cd1710e6b 100644 --- a/docs/en/query_language/table_functions/jdbc.md +++ b/docs/en/query_language/table_functions/jdbc.md @@ -8,15 +8,15 @@ It supports Nullable types (based on DDL of remote table that is queried). **Examples** -``` sql +```sql SELECT * FROM jdbc('jdbc:mysql://localhost:3306/?user=root&password=root', 'schema', 'table') ``` -``` sql +```sql SELECT * FROM jdbc('mysql://localhost:3306/?user=root&password=root', 'schema', 'table') ``` -``` sql +```sql SELECT * FROM jdbc('datasource://mysql-local', 'schema', 'table') ``` diff --git a/docs/en/query_language/table_functions/mysql.md b/docs/en/query_language/table_functions/mysql.md index f1d4773a6e8..4d643ba286d 100644 --- a/docs/en/query_language/table_functions/mysql.md +++ b/docs/en/query_language/table_functions/mysql.md @@ -2,7 +2,7 @@ Allows `SELECT` queries to be performed on data that is stored on a remote MySQL server. -``` +```sql mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']); ``` @@ -32,7 +32,7 @@ A table object with the same columns as the original MySQL table. 
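Before the full walkthrough below, a minimal call sketch based on the signature above (host, database, table and credentials are placeholders, not taken from the example that follows):

```sql
SELECT * FROM mysql('localhost:3306', 'test', 'test', 'mysql_user', 'mysql_password')
```
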
Table in MySQL: -``` +```text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, diff --git a/docs/en/query_language/table_functions/numbers.md b/docs/en/query_language/table_functions/numbers.md index e8b025d922e..f3f64cf6399 100644 --- a/docs/en/query_language/table_functions/numbers.md +++ b/docs/en/query_language/table_functions/numbers.md @@ -7,7 +7,7 @@ Similar to the `system.numbers` table, it can be used for testing and generating The following queries are equivalent: -``` sql +```sql SELECT * FROM numbers(10); SELECT * FROM numbers(0, 10); SELECT * FROM system.numbers LIMIT 10; @@ -15,7 +15,7 @@ SELECT * FROM system.numbers LIMIT 10; Examples: -``` sql +```sql -- Generate a sequence of dates from 2010-01-01 to 2010-12-31 select toDate('2010-01-01') + number as d FROM numbers(365); ``` diff --git a/docs/en/query_language/table_functions/odbc.md b/docs/en/query_language/table_functions/odbc.md index 0c3204d4ca5..65036a6219d 100644 --- a/docs/en/query_language/table_functions/odbc.md +++ b/docs/en/query_language/table_functions/odbc.md @@ -2,7 +2,7 @@ Returns table that is connected via [ODBC](https://en.wikipedia.org/wiki/Open_Database_Connectivity). -``` +```sql odbc(connection_settings, external_database, external_table) ``` @@ -26,15 +26,17 @@ Ensure that unixODBC and MySQL Connector are installed. By default (if installed from packages), ClickHouse starts as user `clickhouse`. Thus you need to create and configure this user in the MySQL server. +```bash +$ sudo mysql ``` -sudo mysql +```sql mysql> CREATE USER 'clickhouse'@'localhost' IDENTIFIED BY 'clickhouse'; mysql> GRANT ALL PRIVILEGES ON *.* TO 'clickhouse'@'clickhouse' WITH GRANT OPTION; ``` Then configure the connection in `/etc/odbc.ini`. -``` +```bash $ cat /etc/odbc.ini [mysqlconn] DRIVER = /usr/local/lib/libmyodbc5w.so @@ -47,8 +49,8 @@ PASSWORD = clickhouse You can check the connection using the `isql` utility from the unixODBC installation. -``` -isql -v mysqlconn +```bash +$ isql -v mysqlconn +---------------------------------------+ | Connected! | | | @@ -57,7 +59,7 @@ isql -v mysqlconn Table in MySQL: -``` +```text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, diff --git a/docs/en/query_language/table_functions/remote.md b/docs/en/query_language/table_functions/remote.md index 3b9263d959c..0543d17cade 100644 --- a/docs/en/query_language/table_functions/remote.md +++ b/docs/en/query_language/table_functions/remote.md @@ -5,7 +5,7 @@ Allows you to access remote servers without creating a `Distributed` table. Signatures: -``` sql +```sql remote('addresses_expr', db, table[, 'user'[, 'password']]) remote('addresses_expr', db.table[, 'user'[, 'password']]) ``` @@ -17,7 +17,7 @@ remote('addresses_expr', db.table[, 'user'[, 'password']]) Examples: -``` +```text example01-01-1 example01-01-1:9000 localhost @@ -30,19 +30,19 @@ Multiple addresses can be comma-separated. In this case, ClickHouse will use dis Example: -``` +```text example01-01-1,example01-02-1 ``` Part of the expression can be specified in curly brackets. The previous example can be written as follows: -``` +```text example01-0{1,2}-1 ``` Curly brackets can contain a range of numbers separated by two dots (non-negative integers). In this case, the range is expanded to a set of values that generate shard addresses. If the first number starts with zero, the values are formed with the same zero alignment. 
The previous example can be written as follows: -``` +```text example01-{01..02}-1 ``` @@ -52,7 +52,7 @@ Addresses and parts of addresses in curly brackets can be separated by the pipe Example: -``` +```text example01-{01..02}-{1|2} ``` diff --git a/docs/en/query_language/table_functions/url.md b/docs/en/query_language/table_functions/url.md index edeabdc6902..005a23b6bdc 100644 --- a/docs/en/query_language/table_functions/url.md +++ b/docs/en/query_language/table_functions/url.md @@ -12,7 +12,7 @@ structure - table structure in `'UserID UInt64, Name String'` format. Determines **Example** -``` sql +```sql -- getting the first 3 lines of a table that contains columns of String and UInt32 type from HTTP-server which answers in CSV format. SELECT * FROM url('http://127.0.0.1:12345/', CSV, 'column1 String, column2 UInt32') LIMIT 3 ``` diff --git a/docs/ru/data_types/array.md b/docs/ru/data_types/array.md index 890e314be99..c6cc6af4b13 100644 --- a/docs/ru/data_types/array.md +++ b/docs/ru/data_types/array.md @@ -8,41 +8,32 @@ Массив можно создать с помощью функции: -``` +```sql array(T) ``` Также можно использовать квадратные скобки -``` +```sql [] ``` Пример создания массива: +```sql +SELECT array(1, 2) AS x, toTypeName(x) ``` -:) SELECT array(1, 2) AS x, toTypeName(x) - -SELECT - [1, 2] AS x, - toTypeName(x) - +```text ┌─x─────┬─toTypeName(array(1, 2))─┐ │ [1,2] │ Array(UInt8) │ └───────┴─────────────────────────┘ - -1 rows in set. Elapsed: 0.002 sec. - -:) SELECT [1, 2] AS x, toTypeName(x) - -SELECT - [1, 2] AS x, - toTypeName(x) - +``` +```sql +SELECT [1, 2] AS x, toTypeName(x) +``` +```text ┌─x─────┬─toTypeName([1, 2])─┐ │ [1,2] │ Array(UInt8) │ └───────┴────────────────────┘ - -1 rows in set. Elapsed: 0.002 sec. ``` ## Особенности работы с типами данных @@ -53,31 +44,23 @@ SELECT Примеры автоматического определения типа данных: +```sql +SELECT array(1, 2, NULL) AS x, toTypeName(x) ``` -:) SELECT array(1, 2, NULL) AS x, toTypeName(x) - -SELECT - [1, 2, NULL] AS x, - toTypeName(x) - +```text ┌─x──────────┬─toTypeName(array(1, 2, NULL))─┐ │ [1,2,NULL] │ Array(Nullable(UInt8)) │ └────────────┴───────────────────────────────┘ - -1 rows in set. Elapsed: 0.002 sec. ``` Если попытаться создать массив из несовместимых типов данных, то ClickHouse выбросит исключение: +```sql +SELECT array(1, 'a') ``` -:) SELECT array(1, 'a') - -SELECT [1, 'a'] - +```text Received exception from server (version 1.1.54388): Code: 386. DB::Exception: Received from localhost:9000, 127.0.0.1. DB::Exception: There is no supertype for types UInt8, String because some of them are String/FixedString and some of them are not. - -0 rows in set. Elapsed: 0.246 sec. ``` [Оригинальная статья](https://clickhouse.yandex/docs/ru/data_types/array/) diff --git a/docs/ru/data_types/decimal.md b/docs/ru/data_types/decimal.md index 1dc2679213c..110eb84f31f 100644 --- a/docs/ru/data_types/decimal.md +++ b/docs/ru/data_types/decimal.md @@ -50,35 +50,35 @@ ## Проверка переполнений При выполнении операций над типом Decimal могут происходить целочисленные переполнения. Лишняя дробная часть отбрасывается (не округляется). Лишняя целочисленная часть приводит к исключению. -``` +```sql SELECT toDecimal32(2, 4) AS x, x / 3 ``` -``` +```text ┌──────x─┬─divide(toDecimal32(2, 4), 3)─┐ │ 2.0000 │ 0.6666 │ └────────┴──────────────────────────────┘ ``` -``` +```sql SELECT toDecimal32(4.2, 8) AS x, x * x ``` -``` +```text DB::Exception: Scale is out of bounds. 
``` -``` +```sql SELECT toDecimal32(4.2, 8) AS x, 6 * x ``` -``` +```text DB::Exception: Decimal math overflow. ``` Проверка переполнения приводит к замедлению операций. При уверенности, что типа результата хватит для его записи проверку переполнения можно отключить настройкой decimal_check_overflow. В этом случае при переполнении вернется неверное значение: -``` +```sql SET decimal_check_overflow = 0; SELECT toDecimal32(4.2, 8) AS x, 6 * x ``` -``` +```text ┌──────────x─┬─multiply(6, toDecimal32(4.2, 8))─┐ │ 4.20000000 │ -17.74967296 │ └────────────┴──────────────────────────────────┘ @@ -86,10 +86,10 @@ SELECT toDecimal32(4.2, 8) AS x, 6 * x Переполнения происходят не только на арифметических операциях, но и на операциях сравнения. Отключать проверку стоит только при полной уверенности в корректности результата: -``` +```sql SELECT toDecimal32(1, 8) < 100 ``` -``` +```text DB::Exception: Can't compare. ``` diff --git a/docs/ru/data_types/domains/ipv4.md b/docs/ru/data_types/domains/ipv4.md index 03f8f5900aa..8d8b5bf9fc5 100644 --- a/docs/ru/data_types/domains/ipv4.md +++ b/docs/ru/data_types/domains/ipv4.md @@ -10,7 +10,7 @@ CREATE TABLE hits (url String, from IPv4) ENGINE = MergeTree() ORDER BY url; DESCRIBE TABLE hits; ``` -``` +```text ┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ │ url │ String │ │ │ │ │ │ from │ IPv4 │ │ │ │ │ @@ -31,7 +31,7 @@ INSERT INTO hits (url, from) VALUES ('https://wikipedia.org', '116.253.40.133')( SELECT * FROM hits; ``` -``` +```text ┌─url────────────────────────────────┬───────────from─┐ │ https://clickhouse.yandex/docs/en/ │ 116.106.34.242 │ │ https://wikipedia.org │ 116.253.40.133 │ @@ -45,7 +45,7 @@ SELECT * FROM hits; SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; ``` -``` +```text ┌─toTypeName(from)─┬─hex(from)─┐ │ IPv4 │ B7F7E83A │ └──────────────────┴───────────┘ @@ -58,7 +58,7 @@ SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; SELECT toTypeName(s), IPv4NumToString(from) AS s FROM hits LIMIT 1; ``` -``` +```text ┌─toTypeName(IPv4NumToString(from))─┬─s──────────────┐ │ String │ 183.247.232.58 │ └───────────────────────────────────┴────────────────┘ @@ -70,7 +70,7 @@ SELECT toTypeName(s), IPv4NumToString(from) AS s FROM hits LIMIT 1; SELECT toTypeName(i), CAST(from AS UInt32) AS i FROM hits LIMIT 1; ``` -``` +```text ┌─toTypeName(CAST(from, 'UInt32'))─┬──────────i─┐ │ UInt32 │ 3086477370 │ └──────────────────────────────────┴────────────┘ diff --git a/docs/ru/data_types/domains/ipv6.md b/docs/ru/data_types/domains/ipv6.md index e3f1c190060..796f8ef4040 100644 --- a/docs/ru/data_types/domains/ipv6.md +++ b/docs/ru/data_types/domains/ipv6.md @@ -10,7 +10,7 @@ CREATE TABLE hits (url String, from IPv6) ENGINE = MergeTree() ORDER BY url; DESCRIBE TABLE hits; ``` -``` +```text ┌─name─┬─type───┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┐ │ url │ String │ │ │ │ │ │ from │ IPv6 │ │ │ │ │ @@ -31,7 +31,7 @@ INSERT INTO hits (url, from) VALUES ('https://wikipedia.org', '2a02:aa08:e000:31 SELECT * FROM hits; ``` -``` +```text ┌─url────────────────────────────────┬─from──────────────────────────┐ │ https://clickhouse.yandex │ 2001:44c8:129:2632:33:0:252:2 │ │ https://clickhouse.yandex/docs/en/ │ 2a02:e980:1e::1 │ @@ -45,7 +45,7 @@ SELECT * FROM hits; SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; ``` -``` +```text ┌─toTypeName(from)─┬─hex(from)────────────────────────┐ │ IPv6 │ 200144C8012926320033000002520002 │ └──────────────────┴──────────────────────────────────┘ @@ -58,7 +58,7 @@ 
SELECT toTypeName(from), hex(from) FROM hits LIMIT 1; SELECT toTypeName(s), IPv6NumToString(from) AS s FROM hits LIMIT 1; ``` -``` +```text ┌─toTypeName(IPv6NumToString(from))─┬─s─────────────────────────────┐ │ String │ 2001:44c8:129:2632:33:0:252:2 │ └───────────────────────────────────┴───────────────────────────────┘ @@ -70,7 +70,7 @@ SELECT toTypeName(s), IPv6NumToString(from) AS s FROM hits LIMIT 1; SELECT toTypeName(i), CAST(from AS FixedString(16)) AS i FROM hits LIMIT 1; ``` -``` +```text ┌─toTypeName(CAST(from, 'FixedString(16)'))─┬─i───────┐ │ FixedString(16) │ ��� │ └───────────────────────────────────────────┴─────────┘ diff --git a/docs/ru/data_types/enum.md b/docs/ru/data_types/enum.md index 8d32cce1648..764df79eb74 100644 --- a/docs/ru/data_types/enum.md +++ b/docs/ru/data_types/enum.md @@ -26,19 +26,15 @@ ENGINE = TinyLog В столбец `x` можно сохранять только значения, перечисленные при определении типа, т.е. `'hello'` или `'world'`. Если вы попытаетесь сохранить любое другое значение, ClickHouse сгенерирует исключение. ClickHouse автоматически выберет размерность 8-bit для этого `Enum`. ```sql -:) INSERT INTO t_enum VALUES ('hello'), ('world'), ('hello') - -INSERT INTO t_enum VALUES - +INSERT INTO t_enum VALUES ('hello'), ('world'), ('hello') +``` +```text Ok. - -3 rows in set. Elapsed: 0.002 sec. - -:) insert into t_enum values('a') - -INSERT INTO t_enum VALUES - - +``` +```sql +insert into t_enum values('a') +``` +```text Exception on client: Code: 49. DB::Exception: Unknown element 'a' for type Enum('hello' = 1, 'world' = 2) ``` @@ -47,7 +43,8 @@ Code: 49. DB::Exception: Unknown element 'a' for type Enum('hello' = 1, 'world' ```sql SELECT * FROM t_enum - +``` +```text ┌─x─────┐ │ hello │ │ world │ @@ -59,7 +56,8 @@ SELECT * FROM t_enum ```sql SELECT CAST(x AS Int8) FROM t_enum - +``` +```text ┌─CAST(x, 'Int8')─┐ │ 1 │ │ 2 │ @@ -71,7 +69,8 @@ SELECT CAST(x AS Int8) FROM t_enum ```sql SELECT toTypeName(CAST('a', 'Enum(\'a\' = 1, \'b\' = 2)')) - +``` +```text ┌─toTypeName(CAST('a', 'Enum(\'a\' = 1, \'b\' = 2)'))─┐ │ Enum8('a' = 1, 'b' = 2) │ └─────────────────────────────────────────────────────┘ @@ -85,7 +84,7 @@ SELECT toTypeName(CAST('a', 'Enum(\'a\' = 1, \'b\' = 2)')) `Enum` может быть передан в тип [Nullable](nullable.md). Таким образом, если создать таблицу запросом -``` +```sql CREATE TABLE t_enum_nullable ( x Nullable( Enum8('hello' = 1, 'world' = 2) ) @@ -95,7 +94,7 @@ ENGINE = TinyLog , то в ней можно будет хранить не только `'hello'` и `'world'`, но и `NULL`. -``` +```sql INSERT INTO t_enum_nullable Values('hello'),('world'),(NULL) ``` diff --git a/docs/ru/data_types/fixedstring.md b/docs/ru/data_types/fixedstring.md index 1634a95bc67..64b8dbf6409 100644 --- a/docs/ru/data_types/fixedstring.md +++ b/docs/ru/data_types/fixedstring.md @@ -4,7 +4,7 @@ Чтобы объявить столбец типа `FixedString`, используйте следующий синтаксис: -``` +```sql FixedString(N) ``` @@ -30,7 +30,7 @@ Рассмотрим следующую таблицу с единственным столбцом типа `FixedString(2)`: -``` +```text ┌─name──┐ │ b │ └───────┘ @@ -38,15 +38,15 @@ Запрос `SELECT * FROM FixedStringTable WHERE a = 'b'` не возвращает необходимых данных. Необходимо дополнить шаблон фильтра нулевыми байтами. -``` +```sql SELECT * FROM FixedStringTable WHERE a = 'b\0' - +``` +```text ┌─a─┐ │ b │ └───┘ -1 rows in set. Elapsed: 0.002 sec. ``` Это поведение отличается от поведения MySQL для типа `CHAR`, где строки дополняются пробелами, а пробелы перед выводом вырезаются. 
diff --git a/docs/ru/data_types/float.md b/docs/ru/data_types/float.md index 395d53cda7d..ce5132dcb9c 100644 --- a/docs/ru/data_types/float.md +++ b/docs/ru/data_types/float.md @@ -13,10 +13,10 @@ - При вычислениях с числами с плавающей запятой возможна ошибка округления. -``` sql +```sql SELECT 1 - 0.9 ``` -``` +```text ┌───────minus(1, 0.9)─┐ │ 0.09999999999999998 │ └─────────────────────┘ @@ -32,11 +32,11 @@ SELECT 1 - 0.9 - `Inf` - бесконечность. -``` sql +```sql SELECT 0.5 / 0 ``` -``` +```text ┌─divide(0.5, 0)─┐ │ inf │ └────────────────┘ @@ -44,11 +44,11 @@ SELECT 0.5 / 0 - `-Inf` - отрицательная бесконечность; -``` sql +```sql SELECT -0.5 / 0 ``` -``` +```text ┌─divide(-0.5, 0)─┐ │ -inf │ └─────────────────┘ @@ -56,10 +56,10 @@ SELECT -0.5 / 0 - `NaN` - не число. -``` +```sql SELECT 0 / 0 ``` -``` +```text ┌─divide(0, 0)─┐ │ nan │ └──────────────┘ diff --git a/docs/ru/data_types/nested_data_structures/aggregatefunction.md b/docs/ru/data_types/nested_data_structures/aggregatefunction.md index 6e4cabc098a..b772aaf935b 100644 --- a/docs/ru/data_types/nested_data_structures/aggregatefunction.md +++ b/docs/ru/data_types/nested_data_structures/aggregatefunction.md @@ -33,7 +33,7 @@ CREATE TABLE t **Примеры функций** -``` +```sql uniqState(UserID) quantilesState(0.5, 0.9)(SendTiming) ``` diff --git a/docs/ru/data_types/nested_data_structures/nested.md b/docs/ru/data_types/nested_data_structures/nested.md index 06a62801e6e..58c7c48da3e 100644 --- a/docs/ru/data_types/nested_data_structures/nested.md +++ b/docs/ru/data_types/nested_data_structures/nested.md @@ -4,7 +4,7 @@ Пример: -``` sql +```sql CREATE TABLE test.visits ( CounterID UInt32, @@ -35,7 +35,7 @@ CREATE TABLE test.visits Пример: -``` sql +```sql SELECT Goals.ID, Goals.EventTime @@ -44,7 +44,7 @@ WHERE CounterID = 101500 AND length(Goals.ID) < 5 LIMIT 10 ``` -``` +```text ┌─Goals.ID───────────────────────┬─Goals.EventTime───────────────────────────────────────────────────────────────────────────┐ │ [1073752,591325,591325] │ ['2014-03-17 16:38:10','2014-03-17 16:38:48','2014-03-17 16:42:27'] │ │ [1073752] │ ['2014-03-17 00:28:25'] │ @@ -63,7 +63,7 @@ LIMIT 10 Единственное место, где в запросе SELECT можно указать имя целой вложенной структуры данных, а не отдельных столбцов - секция ARRAY JOIN. Подробнее см. раздел "Секция ARRAY JOIN". Пример: -``` sql +```sql SELECT Goal.ID, Goal.EventTime @@ -73,7 +73,7 @@ WHERE CounterID = 101500 AND length(Goals.ID) < 5 LIMIT 10 ``` -``` +```text ┌─Goal.ID─┬──────Goal.EventTime─┐ │ 1073752 │ 2014-03-17 16:38:10 │ │ 591325 │ 2014-03-17 16:38:48 │ diff --git a/docs/ru/data_types/nullable.md b/docs/ru/data_types/nullable.md index f9c3715519b..a953f6bb2b7 100644 --- a/docs/ru/data_types/nullable.md +++ b/docs/ru/data_types/nullable.md @@ -24,40 +24,21 @@ ## Пример использования +```sql +CREATE TABLE t_null(x Int8, y Nullable(Int8)) ENGINE TinyLog ``` -:) CREATE TABLE t_null(x Int8, y Nullable(Int8)) ENGINE TinyLog - -CREATE TABLE t_null -( - x Int8, - y Nullable(Int8) -) -ENGINE = TinyLog - -Ok. - -0 rows in set. Elapsed: 0.012 sec. - -:) INSERT INTO t_null VALUES (1, NULL), (2, 3) - -INSERT INTO t_null VALUES - -Ok. - -1 rows in set. Elapsed: 0.007 sec. - -:) SELECT x + y from t_null - -SELECT x + y -FROM t_null - +```sql +INSERT INTO t_null VALUES (1, NULL), (2, 3) +``` +```sql +SELECT x + y from t_null +``` +```text ┌─plus(x, y)─┐ │ ᴺᵁᴸᴸ │ │ 5 │ └────────────┘ -2 rows in set. Elapsed: 0.144 sec. 
- ``` [Оригинальная статья](https://clickhouse.yandex/docs/ru/data_types/nullable/) diff --git a/docs/ru/data_types/special_data_types/nothing.md b/docs/ru/data_types/special_data_types/nothing.md index 19f879e2242..65b90ca8c89 100644 --- a/docs/ru/data_types/special_data_types/nothing.md +++ b/docs/ru/data_types/special_data_types/nothing.md @@ -5,16 +5,13 @@ Невозможно создать значение типа `Nothing`, поэтому он используется там, где значение не подразумевается. Например, `NULL` записывается как `Nullable(Nothing)` ([Nullable](../../data_types/nullable.md) — это тип данных, позволяющий хранить `NULL` в таблицах). Также тип `Nothing` используется для обозначения пустых массивов: -```bash -:) SELECT toTypeName(Array()) - -SELECT toTypeName([]) - +```sql +SELECT toTypeName(Array()) +``` +```text ┌─toTypeName(array())─┐ │ Array(Nothing) │ └─────────────────────┘ - -1 rows in set. Elapsed: 0.062 sec. ``` [Оригинальная статья](https://clickhouse.yandex/docs/ru/data_types/special_data_types/nothing/) diff --git a/docs/ru/data_types/tuple.md b/docs/ru/data_types/tuple.md index baf06908087..7b6d11ca168 100644 --- a/docs/ru/data_types/tuple.md +++ b/docs/ru/data_types/tuple.md @@ -11,24 +11,19 @@ Кортеж можно создать с помощью функции -``` +```sql tuple(T1, T2, ...) ``` Пример создания кортежа: +```sql +SELECT tuple(1,'a') AS x, toTypeName(x) ``` -:) SELECT tuple(1,'a') AS x, toTypeName(x) - -SELECT - (1, 'a') AS x, - toTypeName(x) - +```text ┌─x───────┬─toTypeName(tuple(1, 'a'))─┐ │ (1,'a') │ Tuple(UInt8, String) │ └─────────┴───────────────────────────┘ - -1 rows in set. Elapsed: 0.021 sec. ``` ## Особенности работы с типами данных @@ -37,18 +32,13 @@ SELECT Пример автоматического определения типа данных: -``` +```sql SELECT tuple(1,NULL) AS x, toTypeName(x) - -SELECT - (1, NULL) AS x, - toTypeName(x) - +``` +```text ┌─x────────┬─toTypeName(tuple(1, NULL))──────┐ │ (1,NULL) │ Tuple(UInt8, Nullable(Nothing)) │ └──────────┴─────────────────────────────────┘ - -1 rows in set. Elapsed: 0.002 sec. ``` [Оригинальная статья](https://clickhouse.yandex/docs/ru/data_types/tuple/) diff --git a/docs/ru/data_types/uuid.md b/docs/ru/data_types/uuid.md index 19696f51f65..8c583bf72bf 100644 --- a/docs/ru/data_types/uuid.md +++ b/docs/ru/data_types/uuid.md @@ -4,13 +4,13 @@ Пример UUID значения представлен ниже: -``` +```text 61f0c404-5cb3-11e7-907b-a6006ad3dba0 ``` Если при вставке новой записи значение для UUID-колонки не указано, UUID идентификатор будет заполнен нулями: -``` +```text 00000000-0000-0000-0000-000000000000 ``` @@ -26,15 +26,16 @@ Этот пример демонстрирует, как создать таблицу с UUID-колонкой и добавить в нее сгенерированный UUID. -``` sql -:) CREATE TABLE t_uuid (x UUID, y String) ENGINE=TinyLog - -:) INSERT INTO t_uuid SELECT generateUUIDv4(), 'Example 1' - -:) SELECT * FROM t_uuid +```sql +CREATE TABLE t_uuid (x UUID, y String) ENGINE=TinyLog ``` - +```sql +INSERT INTO t_uuid SELECT generateUUIDv4(), 'Example 1' ``` +```sql +SELECT * FROM t_uuid +``` +```text ┌────────────────────────────────────x─┬─y─────────┐ │ 417ddc5d-e556-4d27-95dd-a34d84e46a50 │ Example 1 │ └──────────────────────────────────────┴───────────┘ @@ -44,13 +45,13 @@ В этом примере, при добавлении записи в таблицу значение для UUID-колонки не задано. UUID будет заполнен нулями. 
-``` sql -:) INSERT INTO t_uuid (y) VALUES ('Example 2') - -:) SELECT * FROM t_uuid +```sql +INSERT INTO t_uuid (y) VALUES ('Example 2') ``` - +```sql +SELECT * FROM t_uuid ``` +```text ┌────────────────────────────────────x─┬─y─────────┐ │ 417ddc5d-e556-4d27-95dd-a34d84e46a50 │ Example 1 │ │ 00000000-0000-0000-0000-000000000000 │ Example 2 │ diff --git a/docs/ru/database_engines/mysql.md b/docs/ru/database_engines/mysql.md index acfb71d839c..28922638744 100644 --- a/docs/ru/database_engines/mysql.md +++ b/docs/ru/database_engines/mysql.md @@ -51,8 +51,7 @@ ENGINE = MySQL('host:port', 'database', 'user', 'password') ## Примеры использования Таблица в MySQL: - -``` +```text mysql> USE test; Database changed @@ -73,7 +72,6 @@ mysql> select * from mysql_table; +--------+-------+ 1 row in set (0,00 sec) ``` - База данных в ClickHouse, позволяющая обмениваться данными с сервером MySQL: ```sql diff --git a/docs/ru/development/style.md b/docs/ru/development/style.md index 0920f35e817..d72a11d1d49 100644 --- a/docs/ru/development/style.md +++ b/docs/ru/development/style.md @@ -401,13 +401,13 @@ enum class CompressionMethod **15.** Все имена - по английски. Транслит с русского использовать нельзя. -``` +```text не Stroka ``` **16.** Сокращения (из нескольких букв разных слов) в именах можно использовать только если они являются общепринятыми (если для сокращения можно найти расшифровку в английской википедии или сделав поисковый запрос). -``` +```text `AST`, `SQL`. Не `NVDH` (что-то неведомое) diff --git a/docs/ru/faq/general.md b/docs/ru/faq/general.md index cc388b4ef5d..010926d2cf9 100644 --- a/docs/ru/faq/general.md +++ b/docs/ru/faq/general.md @@ -17,7 +17,7 @@ **Пример** -``` +```sql NLS_LANG=RUSSIAN_RUSSIA.UTF8 ``` diff --git a/docs/ru/getting_started/example_datasets/amplab_benchmark.md b/docs/ru/getting_started/example_datasets/amplab_benchmark.md index 87b8de2be43..f1ef7230ed4 100644 --- a/docs/ru/getting_started/example_datasets/amplab_benchmark.md +++ b/docs/ru/getting_started/example_datasets/amplab_benchmark.md @@ -8,16 +8,16 @@ Выполните следующее в консоли: ```bash -sudo apt-get install s3cmd -mkdir tiny; cd tiny; -s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ . -cd .. -mkdir 1node; cd 1node; -s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ . -cd .. -mkdir 5nodes; cd 5nodes; -s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ . -cd .. +$ sudo apt-get install s3cmd +$ mkdir tiny; cd tiny; +$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/tiny/ . +$ cd .. +$ mkdir 1node; cd 1node; +$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/1node/ . +$ cd .. +$ mkdir 5nodes; cd 5nodes; +$ s3cmd sync s3://big-data-benchmark/pavlo/text-deflate/5nodes/ . +$ cd .. 
``` Выполните следующие запросы к ClickHouse: @@ -87,12 +87,12 @@ CREATE TABLE uservisits_5nodes_on_single Возвращаемся в консоль: ```bash -for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done -for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done -for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done -for i in 1node/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done -for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done -for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done +$ for i in tiny/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_tiny FORMAT CSV"; done +$ for i in tiny/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_tiny FORMAT CSV"; done +$ for i in 1node/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_1node FORMAT CSV"; done +$ for i in 1node/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_1node FORMAT CSV"; done +$ for i in 5nodes/rankings/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO rankings_5nodes_on_single FORMAT CSV"; done +$ for i in 5nodes/uservisits/*.deflate; do echo $i; zlib-flate -uncompress < $i | clickhouse-client --host=example-perftest01j --query="INSERT INTO uservisits_5nodes_on_single FORMAT CSV"; done ``` Запросы для получения выборок данных: diff --git a/docs/ru/getting_started/example_datasets/criteo.md b/docs/ru/getting_started/example_datasets/criteo.md index 32fc8e234dc..5afb8046a57 100644 --- a/docs/ru/getting_started/example_datasets/criteo.md +++ b/docs/ru/getting_started/example_datasets/criteo.md @@ -4,19 +4,19 @@ Создайте таблицу для импорта лога: -``` sql +```sql CREATE TABLE criteo_log (date Date, clicked UInt8, int1 Int32, int2 Int32, int3 Int32, int4 Int32, int5 Int32, int6 Int32, int7 Int32, int8 Int32, int9 Int32, int10 Int32, int11 Int32, int12 Int32, int13 Int32, cat1 String, cat2 String, cat3 String, cat4 String, cat5 String, cat6 String, cat7 String, cat8 String, cat9 String, cat10 String, cat11 String, cat12 String, cat13 String, cat14 String, cat15 String, cat16 String, cat17 String, cat18 String, cat19 String, cat20 String, cat21 String, cat22 String, cat23 String, cat24 String, cat25 String, cat26 String) ENGINE = Log ``` Загрузите данные: ```bash -for i in {00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done +$ for i in 
{00..23}; do echo $i; zcat datasets/criteo/day_${i#0}.gz | sed -r 's/^/2000-01-'${i/00/24}'\t/' | clickhouse-client --host=example-perftest01j --query="INSERT INTO criteo_log FORMAT TabSeparated"; done ``` Создайте таблицу для сконвертированных данных: -``` sql +```sql CREATE TABLE criteo ( date Date, @@ -65,7 +65,7 @@ CREATE TABLE criteo Преобразуем данные из сырого лога и положим во вторую таблицу: -``` sql +```sql INSERT INTO criteo SELECT date, clicked, int1, int2, int3, int4, int5, int6, int7, int8, int9, int10, int11, int12, int13, reinterpretAsUInt32(unhex(cat1)) AS icat1, reinterpretAsUInt32(unhex(cat2)) AS icat2, reinterpretAsUInt32(unhex(cat3)) AS icat3, reinterpretAsUInt32(unhex(cat4)) AS icat4, reinterpretAsUInt32(unhex(cat5)) AS icat5, reinterpretAsUInt32(unhex(cat6)) AS icat6, reinterpretAsUInt32(unhex(cat7)) AS icat7, reinterpretAsUInt32(unhex(cat8)) AS icat8, reinterpretAsUInt32(unhex(cat9)) AS icat9, reinterpretAsUInt32(unhex(cat10)) AS icat10, reinterpretAsUInt32(unhex(cat11)) AS icat11, reinterpretAsUInt32(unhex(cat12)) AS icat12, reinterpretAsUInt32(unhex(cat13)) AS icat13, reinterpretAsUInt32(unhex(cat14)) AS icat14, reinterpretAsUInt32(unhex(cat15)) AS icat15, reinterpretAsUInt32(unhex(cat16)) AS icat16, reinterpretAsUInt32(unhex(cat17)) AS icat17, reinterpretAsUInt32(unhex(cat18)) AS icat18, reinterpretAsUInt32(unhex(cat19)) AS icat19, reinterpretAsUInt32(unhex(cat20)) AS icat20, reinterpretAsUInt32(unhex(cat21)) AS icat21, reinterpretAsUInt32(unhex(cat22)) AS icat22, reinterpretAsUInt32(unhex(cat23)) AS icat23, reinterpretAsUInt32(unhex(cat24)) AS icat24, reinterpretAsUInt32(unhex(cat25)) AS icat25, reinterpretAsUInt32(unhex(cat26)) AS icat26 FROM criteo_log; DROP TABLE criteo_log; diff --git a/docs/ru/getting_started/example_datasets/metrica.md b/docs/ru/getting_started/example_datasets/metrica.md index 38c346d2b74..aade4d0f38d 100644 --- a/docs/ru/getting_started/example_datasets/metrica.md +++ b/docs/ru/getting_started/example_datasets/metrica.md @@ -4,47 +4,47 @@ ## Получение таблиц из партиций **Скачивание и импортирование партиций hits:** ```bash -curl -O https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_v1.tar -tar xvf hits_v1.tar -C /var/lib/clickhouse # путь к папке с данными ClickHouse -# убедитесь, что установлены корректные права доступа на файлы -sudo service clickhouse-server restart -clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" +$ curl -O https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_v1.tar +$ tar xvf hits_v1.tar -C /var/lib/clickhouse # путь к папке с данными ClickHouse +$ # убедитесь, что установлены корректные права доступа на файлы +$ sudo service clickhouse-server restart +$ clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1" ``` **Скачивание и импортирование партиций visits:** ```bash -curl -O https://clickhouse-datasets.s3.yandex.net/visits/partitions/visits_v1.tar -tar xvf visits_v1.tar -C /var/lib/clickhouse # путь к папке с данными ClickHouse -# убедитесь, что установлены корректные права доступа на файлы -sudo service clickhouse-server restart -clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" +$ curl -O https://clickhouse-datasets.s3.yandex.net/visits/partitions/visits_v1.tar +$ tar xvf visits_v1.tar -C /var/lib/clickhouse # путь к папке с данными ClickHouse +$ # убедитесь, что установлены корректные права доступа на файлы +$ sudo service clickhouse-server restart +$ clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" ``` ## Получение таблиц 
из сжатых tsv-файлов **Скачивание и импортирование hits из сжатого tsv-файла** ```bash -curl https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv -# теперь создадим таблицу -clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" -clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192" -# импортируем данные -cat hits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.hits_v1 FORMAT TSV" --max_insert_block_size=100000 -# опционально можно оптимизировать таблицу -clickhouse-client --query "OPTIMIZE TABLE datasets.hits_v1 FINAL" -clickhouse-client 
--query "SELECT COUNT(*) FROM datasets.hits_v1" +$ curl https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv +$ # теперь создадим таблицу +$ clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" +$ clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192" +$ # импортируем данные +$ cat hits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.hits_v1 FORMAT TSV" --max_insert_block_size=100000 +$ # опционально можно оптимизировать таблицу +$ clickhouse-client --query "OPTIMIZE TABLE datasets.hits_v1 FINAL" +$ clickhouse-client --query "SELECT 
COUNT(*) FROM datasets.hits_v1" ``` **Скачивание и импортирование visits из сжатого tsv-файла** ```bash -curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv -# теперь создадим таблицу -clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" -clickhouse-client --query "CREATE TABLE datasets.visits_v1 ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), Goals Nested(ID UInt32, Serial UInt32, EventTime DateTime, Price Int64, OrderID String, CurrencyID UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, TraficSource Nested(ID Int8, SearchEngineID UInt16, AdvEngineID UInt8, PlaceID UInt16, SocialSourceNetworkID UInt8, Domain String, SearchPhrase String, SocialSourcePage String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, 
URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), Market Nested(Type UInt8, GoalID UInt32, OrderID String, OrderPrice Int64, PP UInt32, DirectPlaceID UInt32, DirectOrderID UInt32, DirectBannerID UInt32, GoodID String, GoodName String, GoodQuantity Int32, GoodPrice Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign)" -# импортируем данные -cat visits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.visits_v1 FORMAT TSV" --max_insert_block_size=100000 -# опционально можно оптимизировать таблицу -clickhouse-client --query "OPTIMIZE TABLE datasets.visits_v1 FINAL" -clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" +$ curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv +$ # теперь создадим таблицу +$ clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets" +$ clickhouse-client --query "CREATE TABLE datasets.visits_v1 ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), Goals Nested(ID UInt32, Serial UInt32, EventTime DateTime, Price Int64, OrderID String, CurrencyID UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, 
ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, TraficSource Nested(ID Int8, SearchEngineID UInt16, AdvEngineID UInt8, PlaceID UInt16, SocialSourceNetworkID UInt8, Domain String, SearchPhrase String, SocialSourcePage String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), Market Nested(Type UInt8, GoalID UInt32, OrderID String, OrderPrice Int64, PP UInt32, DirectPlaceID UInt32, DirectOrderID UInt32, DirectBannerID UInt32, GoodID String, GoodName String, GoodQuantity Int32, GoodPrice Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign)" +$ # импортируем данные +$ cat visits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.visits_v1 FORMAT TSV" --max_insert_block_size=100000 +$ # опционально можно оптимизировать таблицу +$ clickhouse-client --query "OPTIMIZE TABLE datasets.visits_v1 FINAL" +$ clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" ``` ## Запросы diff --git a/docs/ru/getting_started/example_datasets/nyc_taxi.md b/docs/ru/getting_started/example_datasets/nyc_taxi.md index 0f1bf99c5cb..1b23636c5c8 100644 --- a/docs/ru/getting_started/example_datasets/nyc_taxi.md +++ b/docs/ru/getting_started/example_datasets/nyc_taxi.md @@ -29,9 +29,9 @@ mv data/yellow_tripdata_2010-03.csv_ data/yellow_tripdata_2010-03.csv Проверить количество загруженных строк можно следующим образом: -``` -time psql nyc-taxi-data -c "SELECT count(*) FROM trips;" -## count +```bash +$ time psql nyc-taxi-data -c "SELECT count(*) FROM trips;" +## Count 1298979494 (1 row) @@ -44,7 +44,7 @@ real 7m9.164s Экспорт данных из PostgreSQL: -``` sql +```sql COPY ( SELECT trips.id, @@ -119,7 +119,7 @@ COPY Создание временной таблицы в ClickHouse: -``` sql +```sql CREATE TABLE trips ( trip_id UInt32, @@ -178,8 +178,8 @@ dropoff_puma Nullable(String) Она нужна для преобразование полей к более правильным типам данных и, если возможно, чтобы избавиться от NULL'ов. 
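Editor's aside (not part of the patched page): a minimal sketch of the kind of conversion this staging table enables, using only columns visible in the `CREATE TABLE trips` statement above — NULLs are substituted so that Nullable columns come out as plain types.

```bash
# Illustrative only: read from the Nullable staging table, replacing NULLs with a default
# so the selected columns are returned as concrete (non-Nullable) types.
clickhouse-client --query="
    SELECT
        trip_id,
        coalesce(dropoff_puma, '') AS dropoff_puma   -- Nullable(String) -> String
    FROM trips
    LIMIT 5"
```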
-``` -time clickhouse-client --query="INSERT INTO trips FORMAT TabSeparated" < trips.tsv +```bash +$ time clickhouse-client --query="INSERT INTO trips FORMAT TabSeparated" < trips.tsv real 75m56.214s ``` @@ -196,7 +196,7 @@ real 75m56.214s Создадим и заполним итоговую таблицу: -``` +```sql CREATE TABLE trips_mergetree ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) AS SELECT @@ -263,13 +263,10 @@ FROM trips Таблица заняла 126 Гб дискового пространства. +```sql +SELECT formatReadableSize(sum(bytes)) FROM system.parts WHERE table = 'trips_mergetree' AND active ``` -:) SELECT formatReadableSize(sum(bytes)) FROM system.parts WHERE table = 'trips_mergetree' AND active - -SELECT formatReadableSize(sum(bytes)) -FROM system.parts -WHERE (table = 'trips_mergetree') AND active - +```text ┌─formatReadableSize(sum(bytes))─┐ │ 126.18 GiB │ └────────────────────────────────┘ @@ -280,11 +277,11 @@ WHERE (table = 'trips_mergetree') AND active ## Скачивание готовых партиций ```bash -curl -O https://clickhouse-datasets.s3.yandex.net/trips_mergetree/partitions/trips_mergetree.tar -tar xvf trips_mergetree.tar -C /var/lib/clickhouse # путь к папке с данными ClickHouse -# убедитесь, что установлены корректные права доступа на файлы -sudo service clickhouse-server restart -clickhouse-client --query "SELECT COUNT(*) FROM datasets.trips_mergetree" +$ curl -O https://clickhouse-datasets.s3.yandex.net/trips_mergetree/partitions/trips_mergetree.tar +$ tar xvf trips_mergetree.tar -C /var/lib/clickhouse # путь к папке с данными ClickHouse +$ # убедитесь, что установлены корректные права доступа на файлы +$ sudo service clickhouse-server restart +$ clickhouse-client --query "SELECT COUNT(*) FROM datasets.trips_mergetree" ``` !!!info @@ -303,7 +300,7 @@ SELECT cab_type, count(*) FROM trips_mergetree GROUP BY cab_type Q2: -``` sql +```sql SELECT passenger_count, avg(total_amount) FROM trips_mergetree GROUP BY passenger_count ``` @@ -311,7 +308,7 @@ SELECT passenger_count, avg(total_amount) FROM trips_mergetree GROUP BY passenge Q3: -``` sql +```sql SELECT passenger_count, toYear(pickup_date) AS year, count(*) FROM trips_mergetree GROUP BY passenger_count, year ``` @@ -319,7 +316,7 @@ SELECT passenger_count, toYear(pickup_date) AS year, count(*) FROM trips_mergetr Q4: -``` sql +```sql SELECT passenger_count, toYear(pickup_date) AS year, round(trip_distance) AS distance, count(*) FROM trips_mergetree GROUP BY passenger_count, year, distance @@ -341,19 +338,19 @@ ORDER BY year, count(*) DESC На каждом сервере: -``` +```sql CREATE TABLE default.trips_mergetree_third ( trip_id UInt32, vendor_id Enum8('1' = 1, '2' = 2, 'CMT' = 3, 'VTS' = 4, 'DDS' = 5, 'B02512' = 10, 'B02598' = 11, 'B02617' = 12, 'B02682' = 13, 'B02764' = 14), pickup_date Date, pickup_datetime DateTime, dropoff_date Date, dropoff_datetime DateTime, store_and_fwd_flag UInt8, rate_code_id UInt8, pickup_longitude Float64, pickup_latitude Float64, dropoff_longitude Float64, dropoff_latitude Float64, passenger_count UInt8, trip_distance Float64, fare_amount Float32, extra Float32, mta_tax Float32, tip_amount Float32, tolls_amount Float32, ehail_fee Float32, improvement_surcharge Float32, total_amount Float32, payment_type_ Enum8('UNK' = 0, 'CSH' = 1, 'CRE' = 2, 'NOC' = 3, 'DIS' = 4), trip_type UInt8, pickup FixedString(25), dropoff FixedString(25), cab_type Enum8('yellow' = 1, 'green' = 2, 'uber' = 3), pickup_nyct2010_gid UInt8, pickup_ctlabel Float32, pickup_borocode UInt8, pickup_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 
4, 'Staten Island' = 5), pickup_ct2010 FixedString(6), pickup_boroct2010 FixedString(7), pickup_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), pickup_ntacode FixedString(4), pickup_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), pickup_puma UInt16, dropoff_nyct2010_gid UInt8, dropoff_ctlabel Float32, dropoff_borocode UInt8, dropoff_boroname Enum8('' = 0, 'Manhattan' = 1, 'Bronx' = 2, 'Brooklyn' = 3, 'Queens' = 4, 'Staten Island' = 5), dropoff_ct2010 FixedString(6), dropoff_boroct2010 FixedString(7), dropoff_cdeligibil Enum8(' ' = 0, 'E' = 1, 'I' = 2), dropoff_ntacode FixedString(4), dropoff_ntaname Enum16('' = 0, 'Airport' = 1, 'Allerton-Pelham Gardens' = 2, 'Annadale-Huguenot-Prince\'s Bay-Eltingville' = 3, 'Arden Heights' = 4, 'Astoria' = 5, 'Auburndale' = 6, 'Baisley Park' = 7, 'Bath Beach' = 8, 'Battery Park City-Lower Manhattan' = 9, 'Bay Ridge' = 10, 'Bayside-Bayside Hills' = 11, 'Bedford' = 12, 'Bedford Park-Fordham North' = 13, 'Bellerose' = 14, 'Belmont' = 15, 'Bensonhurst East' = 16, 'Bensonhurst West' = 17, 'Borough Park' = 18, 'Breezy Point-Belle Harbor-Rockaway Park-Broad Channel' = 19, 'Briarwood-Jamaica Hills' = 20, 'Brighton Beach' = 21, 'Bronxdale' = 22, 'Brooklyn Heights-Cobble Hill' = 23, 'Brownsville' = 24, 'Bushwick North' = 25, 'Bushwick South' = 26, 'Cambria Heights' = 27, 'Canarsie' = 28, 'Carroll Gardens-Columbia Street-Red Hook' = 29, 'Central Harlem North-Polo Grounds' = 30, 'Central Harlem South' = 31, 'Charleston-Richmond Valley-Tottenville' = 32, 'Chinatown' = 33, 'Claremont-Bathgate' = 34, 'Clinton' = 35, 'Clinton Hill' = 36, 'Co-op City' = 37, 'College Point' = 38, 'Corona' = 39, 'Crotona Park East' = 40, 'Crown Heights North' = 41, 'Crown Heights South' = 42, 'Cypress Hills-City Line' = 43, 'DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill' = 44, 'Douglas Manor-Douglaston-Little Neck' = 45, 'Dyker Heights' = 46, 'East Concourse-Concourse Village' = 47, 'East Elmhurst' = 48, 'East Flatbush-Farragut' = 49, 'East Flushing' = 50, 'East Harlem North' = 51, 'East Harlem South' = 52, 'East New York' = 53, 'East New York (Pennsylvania Ave)' = 54, 'East Tremont' = 55, 'East Village' = 56, 'East Williamsburg' = 57, 'Eastchester-Edenwald-Baychester' = 58, 'Elmhurst' = 59, 'Elmhurst-Maspeth' = 60, 'Erasmus' = 61, 'Far Rockaway-Bayswater' = 62, 'Flatbush' = 63, 'Flatlands' = 64, 'Flushing' = 65, 'Fordham South' = 66, 'Forest Hills' = 67, 'Fort Greene' = 68, 'Fresh Meadows-Utopia' = 69, 'Ft. Totten-Bay Terrace-Clearview' = 70, 'Georgetown-Marine Park-Bergen Beach-Mill Basin' = 71, 'Glen Oaks-Floral Park-New Hyde Park' = 72, 'Glendale' = 73, 'Gramercy' = 74, 'Grasmere-Arrochar-Ft. 
Wadsworth' = 75, 'Gravesend' = 76, 'Great Kills' = 77, 'Greenpoint' = 78, 'Grymes Hill-Clifton-Fox Hills' = 79, 'Hamilton Heights' = 80, 'Hammels-Arverne-Edgemere' = 81, 'Highbridge' = 82, 'Hollis' = 83, 'Homecrest' = 84, 'Hudson Yards-Chelsea-Flatiron-Union Square' = 85, 'Hunters Point-Sunnyside-West Maspeth' = 86, 'Hunts Point' = 87, 'Jackson Heights' = 88, 'Jamaica' = 89, 'Jamaica Estates-Holliswood' = 90, 'Kensington-Ocean Parkway' = 91, 'Kew Gardens' = 92, 'Kew Gardens Hills' = 93, 'Kingsbridge Heights' = 94, 'Laurelton' = 95, 'Lenox Hill-Roosevelt Island' = 96, 'Lincoln Square' = 97, 'Lindenwood-Howard Beach' = 98, 'Longwood' = 99, 'Lower East Side' = 100, 'Madison' = 101, 'Manhattanville' = 102, 'Marble Hill-Inwood' = 103, 'Mariner\'s Harbor-Arlington-Port Ivory-Graniteville' = 104, 'Maspeth' = 105, 'Melrose South-Mott Haven North' = 106, 'Middle Village' = 107, 'Midtown-Midtown South' = 108, 'Midwood' = 109, 'Morningside Heights' = 110, 'Morrisania-Melrose' = 111, 'Mott Haven-Port Morris' = 112, 'Mount Hope' = 113, 'Murray Hill' = 114, 'Murray Hill-Kips Bay' = 115, 'New Brighton-Silver Lake' = 116, 'New Dorp-Midland Beach' = 117, 'New Springville-Bloomfield-Travis' = 118, 'North Corona' = 119, 'North Riverdale-Fieldston-Riverdale' = 120, 'North Side-South Side' = 121, 'Norwood' = 122, 'Oakland Gardens' = 123, 'Oakwood-Oakwood Beach' = 124, 'Ocean Hill' = 125, 'Ocean Parkway South' = 126, 'Old Astoria' = 127, 'Old Town-Dongan Hills-South Beach' = 128, 'Ozone Park' = 129, 'Park Slope-Gowanus' = 130, 'Parkchester' = 131, 'Pelham Bay-Country Club-City Island' = 132, 'Pelham Parkway' = 133, 'Pomonok-Flushing Heights-Hillcrest' = 134, 'Port Richmond' = 135, 'Prospect Heights' = 136, 'Prospect Lefferts Gardens-Wingate' = 137, 'Queens Village' = 138, 'Queensboro Hill' = 139, 'Queensbridge-Ravenswood-Long Island City' = 140, 'Rego Park' = 141, 'Richmond Hill' = 142, 'Ridgewood' = 143, 'Rikers Island' = 144, 'Rosedale' = 145, 'Rossville-Woodrow' = 146, 'Rugby-Remsen Village' = 147, 'Schuylerville-Throgs Neck-Edgewater Park' = 148, 'Seagate-Coney Island' = 149, 'Sheepshead Bay-Gerritsen Beach-Manhattan Beach' = 150, 'SoHo-TriBeCa-Civic Center-Little Italy' = 151, 'Soundview-Bruckner' = 152, 'Soundview-Castle Hill-Clason Point-Harding Park' = 153, 'South Jamaica' = 154, 'South Ozone Park' = 155, 'Springfield Gardens North' = 156, 'Springfield Gardens South-Brookville' = 157, 'Spuyten Duyvil-Kingsbridge' = 158, 'St. Albans' = 159, 'Stapleton-Rosebank' = 160, 'Starrett City' = 161, 'Steinway' = 162, 'Stuyvesant Heights' = 163, 'Stuyvesant Town-Cooper Village' = 164, 'Sunset Park East' = 165, 'Sunset Park West' = 166, 'Todt Hill-Emerson Hill-Heartland Village-Lighthouse Hill' = 167, 'Turtle Bay-East Midtown' = 168, 'University Heights-Morris Heights' = 169, 'Upper East Side-Carnegie Hill' = 170, 'Upper West Side' = 171, 'Van Cortlandt Village' = 172, 'Van Nest-Morris Park-Westchester Square' = 173, 'Washington Heights North' = 174, 'Washington Heights South' = 175, 'West Brighton' = 176, 'West Concourse' = 177, 'West Farms-Bronx River' = 178, 'West New Brighton-New Brighton-St. 
George' = 179, 'West Village' = 180, 'Westchester-Unionport' = 181, 'Westerleigh' = 182, 'Whitestone' = 183, 'Williamsbridge-Olinville' = 184, 'Williamsburg' = 185, 'Windsor Terrace' = 186, 'Woodhaven' = 187, 'Woodlawn-Wakefield' = 188, 'Woodside' = 189, 'Yorkville' = 190, 'park-cemetery-etc-Bronx' = 191, 'park-cemetery-etc-Brooklyn' = 192, 'park-cemetery-etc-Manhattan' = 193, 'park-cemetery-etc-Queens' = 194, 'park-cemetery-etc-Staten Island' = 195), dropoff_puma UInt16) ENGINE = MergeTree(pickup_date, pickup_datetime, 8192) ``` На исходном сервере: -``` sql +```sql CREATE TABLE trips_mergetree_x3 AS trips_mergetree_third ENGINE = Distributed(perftest, default, trips_mergetree_third, rand()) ``` Следующим запрос перераспределит данные: -``` sql +```sql INSERT INTO trips_mergetree_x3 SELECT * FROM trips_mergetree ``` diff --git a/docs/ru/getting_started/example_datasets/ontime.md b/docs/ru/getting_started/example_datasets/ontime.md index 32f222b0630..cfaf959464c 100644 --- a/docs/ru/getting_started/example_datasets/ontime.md +++ b/docs/ru/getting_started/example_datasets/ontime.md @@ -24,7 +24,7 @@ done Создание таблицы: -``` sql +```sql CREATE TABLE `ontime` ( `Year` UInt16, `Quarter` UInt8, @@ -141,17 +141,17 @@ CREATE TABLE `ontime` ( Загрузка данных: ```bash -for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done +$ for i in *.zip; do echo $i; unzip -cq $i '*.csv' | sed 's/\.00//g' | clickhouse-client --host=example-perftest01j --query="INSERT INTO ontime FORMAT CSVWithNames"; done ``` ## Скачивание готовых партиций ```bash -curl -O https://clickhouse-datasets.s3.yandex.net/ontime/partitions/ontime.tar -tar xvf ontime.tar -C /var/lib/clickhouse # путь к папке с данными ClickHouse -# убедитесь, что установлены корректные права доступа на файлы -sudo service clickhouse-server restart -clickhouse-client --query "SELECT COUNT(*) FROM datasets.ontime" +$ curl -O https://clickhouse-datasets.s3.yandex.net/ontime/partitions/ontime.tar +$ tar xvf ontime.tar -C /var/lib/clickhouse # путь к папке с данными ClickHouse +$ # убедитесь, что установлены корректные права доступа на файлы +$ sudo service clickhouse-server restart +$ clickhouse-client --query "SELECT COUNT(*) FROM datasets.ontime" ``` !!!info @@ -162,7 +162,7 @@ clickhouse-client --query "SELECT COUNT(*) FROM datasets.ontime" Q0. -``` sql +```sql SELECT avg(c1) FROM ( @@ -174,7 +174,7 @@ FROM Q1. Количество полетов в день с 2000 по 2008 года -``` sql +```sql SELECT DayOfWeek, count(*) AS c FROM ontime WHERE Year>=2000 AND Year<=2008 @@ -184,7 +184,7 @@ ORDER BY c DESC; Q2. Количество полетов, задержанных более чем на 10 минут, с группировкой по дням неделе, за 2000-2008 года -``` sql +```sql SELECT DayOfWeek, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year>=2000 AND Year<=2008 @@ -194,7 +194,7 @@ ORDER BY c DESC; Q3. Количество задержек по аэропортам за 2000-2008 -``` sql +```sql SELECT Origin, count(*) AS c FROM ontime WHERE DepDelay>10 AND Year>=2000 AND Year<=2008 @@ -205,7 +205,7 @@ LIMIT 10; Q4. Количество задержек по перевозчикам за 2007 год -``` sql +```sql SELECT Carrier, count(*) FROM ontime WHERE DepDelay>10 AND Year=2007 @@ -215,7 +215,7 @@ ORDER BY count(*) DESC; Q5. 
Процент задержек по перевозчикам за 2007 год -``` sql +```sql SELECT Carrier, c, c2, c*100/c2 as c3 FROM ( @@ -241,7 +241,7 @@ ORDER BY c3 DESC; Более оптимальная версия того же запроса: -``` sql +```sql SELECT Carrier, avg(DepDelay>10)*100 AS c3 FROM ontime WHERE Year=2007 @@ -251,7 +251,7 @@ ORDER BY Carrier Q6. Предыдущий запрос за более широкий диапазон лет, 2000-2008 -``` sql +```sql SELECT Carrier, c, c2, c*100/c2 as c3 FROM ( @@ -277,7 +277,7 @@ ORDER BY c3 DESC; Более оптимальная версия того же запроса: -``` sql +```sql SELECT Carrier, avg(DepDelay>10)*100 AS c3 FROM ontime WHERE Year>=2000 AND Year<=2008 @@ -287,7 +287,7 @@ ORDER BY Carrier; Q7. Процент полетов, задержанных на более 10 минут, в разбивке по годам -``` sql +```sql SELECT Year, c1/c2 FROM ( @@ -311,7 +311,7 @@ ORDER BY Year; Более оптимальная версия того же запроса: -``` sql +```sql SELECT Year, avg(DepDelay>10) FROM ontime GROUP BY Year @@ -320,7 +320,7 @@ ORDER BY Year; Q8. Самые популярные направления по количеству напрямую соединенных городов для различных диапазонов лет -``` sql +```sql SELECT DestCityName, uniqExact(OriginCityName) AS u F ROM ontime WHERE Year>=2000 and Year<=2010 @@ -331,7 +331,7 @@ LIMIT 10; Q9. -``` sql +```sql SELECT Year, count(*) AS c1 FROM ontime GROUP BY Year; @@ -339,7 +339,7 @@ GROUP BY Year; Q10. -``` sql +```sql SELECT min(Year), max(Year), Carrier, count(*) AS cnt, sum(ArrDelayMinutes>30) AS flights_delayed, @@ -357,7 +357,7 @@ LIMIT 1000; Бонус: -``` sql +```sql SELECT avg(cnt) FROM ( diff --git a/docs/ru/getting_started/example_datasets/star_schema.md b/docs/ru/getting_started/example_datasets/star_schema.md index 545eaeea6a6..2e66ced7149 100644 --- a/docs/ru/getting_started/example_datasets/star_schema.md +++ b/docs/ru/getting_started/example_datasets/star_schema.md @@ -2,25 +2,25 @@ Compiling dbgen: -``` -git clone git@github.com:vadimtk/ssb-dbgen.git -cd ssb-dbgen -make +```bash +$ git clone git@github.com:vadimtk/ssb-dbgen.git +$ cd ssb-dbgen +$ make ``` Generating data: -``` -./dbgen -s 1000 -T c -./dbgen -s 1000 -T l -./dbgen -s 1000 -T p -./dbgen -s 1000 -T s -./dbgen -s 1000 -T d +```bash +$ ./dbgen -s 1000 -T c +$ ./dbgen -s 1000 -T l +$ ./dbgen -s 1000 -T p +$ ./dbgen -s 1000 -T s +$ ./dbgen -s 1000 -T d ``` Creating tables in ClickHouse: -``` +```sql CREATE TABLE customer ( C_CUSTKEY UInt32, @@ -85,16 +85,16 @@ ENGINE = MergeTree ORDER BY S_SUPPKEY; Inserting data: -``` -clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl -clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl -clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl -clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl +```bash +$ clickhouse-client --query "INSERT INTO customer FORMAT CSV" < customer.tbl +$ clickhouse-client --query "INSERT INTO part FORMAT CSV" < part.tbl +$ clickhouse-client --query "INSERT INTO supplier FORMAT CSV" < supplier.tbl +$ clickhouse-client --query "INSERT INTO lineorder FORMAT CSV" < lineorder.tbl ``` Converting "star schema" to denormalized "flat schema": -``` +```sql SET max_memory_usage = 20000000000, allow_experimental_multiple_joins_emulation = 1; CREATE TABLE lineorder_flat @@ -112,44 +112,56 @@ ALTER TABLE lineorder_flat DROP COLUMN C_CUSTKEY, DROP COLUMN S_SUPPKEY, DROP CO Running the queries: -``` Q1.1 +```sql SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYear(LO_ORDERDATE) = 1993 AND LO_DISCOUNT BETWEEN 1 AND 3 AND LO_QUANTITY < 25; - +``` Q1.2 
+```sql SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toYYYYMM(LO_ORDERDATE) = 199401 AND LO_DISCOUNT BETWEEN 4 AND 6 AND LO_QUANTITY BETWEEN 26 AND 35; - +``` Q1.3 +```sql SELECT sum(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue FROM lineorder_flat WHERE toISOWeek(LO_ORDERDATE) = 6 AND toYear(LO_ORDERDATE) = 1994 AND LO_DISCOUNT BETWEEN 5 AND 7 AND LO_QUANTITY BETWEEN 26 AND 35; - +``` Q2.1 +```sql SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_CATEGORY = 'MFGR#12' AND S_REGION = 'AMERICA' GROUP BY year, P_BRAND ORDER BY year, P_BRAND; - +``` Q2.2 +```sql SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_BRAND BETWEEN 'MFGR#2221' AND 'MFGR#2228' AND S_REGION = 'ASIA' GROUP BY year, P_BRAND ORDER BY year, P_BRAND; - +``` Q2.3 +```sql SELECT sum(LO_REVENUE), toYear(LO_ORDERDATE) AS year, P_BRAND FROM lineorder_flat WHERE P_BRAND = 'MFGR#2239' AND S_REGION = 'EUROPE' GROUP BY year, P_BRAND ORDER BY year, P_BRAND; - +``` Q3.1 +```sql SELECT C_NATION, S_NATION, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE C_REGION = 'ASIA' AND S_REGION = 'ASIA' AND year >= 1992 AND year <= 1997 GROUP BY C_NATION, S_NATION, year ORDER BY year asc, revenue desc; - +``` Q3.2 +```sql SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE C_NATION = 'UNITED STATES' AND S_NATION = 'UNITED STATES' AND year >= 1992 AND year <= 1997 GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc; - +``` Q3.3 +```sql SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND year >= 1992 AND year <= 1997 GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc; - +``` Q3.4 +```sql SELECT C_CITY, S_CITY, toYear(LO_ORDERDATE) AS year, sum(LO_REVENUE) AS revenue FROM lineorder_flat WHERE (C_CITY = 'UNITED KI1' OR C_CITY = 'UNITED KI5') AND (S_CITY = 'UNITED KI1' OR S_CITY = 'UNITED KI5') AND toYYYYMM(LO_ORDERDATE) = '199712' GROUP BY C_CITY, S_CITY, year ORDER BY year asc, revenue desc; - +``` Q4.1 +```sql SELECT toYear(LO_ORDERDATE) AS year, C_NATION, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') GROUP BY year, C_NATION ORDER BY year, C_NATION; - +``` Q4.2 +```sql SELECT toYear(LO_ORDERDATE) AS year, S_NATION, P_CATEGORY, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE C_REGION = 'AMERICA' AND S_REGION = 'AMERICA' AND (year = 1997 OR year = 1998) AND (P_MFGR = 'MFGR#1' OR P_MFGR = 'MFGR#2') GROUP BY year, S_NATION, P_CATEGORY ORDER BY year, S_NATION, P_CATEGORY; - +``` Q4.3 +```sql SELECT toYear(LO_ORDERDATE) AS year, S_CITY, P_BRAND, sum(LO_REVENUE - LO_SUPPLYCOST) AS profit FROM lineorder_flat WHERE S_NATION = 'UNITED STATES' AND (year = 1997 OR year = 1998) AND P_CATEGORY = 'MFGR#14' GROUP BY year, S_CITY, P_BRAND ORDER BY year, S_CITY, P_BRAND; ``` diff --git a/docs/ru/getting_started/example_datasets/wikistat.md b/docs/ru/getting_started/example_datasets/wikistat.md index ed8037ffc8e..12469694c53 100644 --- a/docs/ru/getting_started/example_datasets/wikistat.md +++ b/docs/ru/getting_started/example_datasets/wikistat.md @@ -4,7 +4,7 @@ Создание таблицы: -``` sql +```sql CREATE TABLE wikistat ( date Date, @@ -20,9 +20,9 @@ CREATE TABLE 
wikistat Загрузка данных: ```bash -for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt -cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done -ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done +$ for i in {2007..2016}; do for j in {01..12}; do echo $i-$j >&2; curl -sSL "http://dumps.wikimedia.org/other/pagecounts-raw/$i/$i-$j/" | grep -oE 'pagecounts-[0-9]+-[0-9]+\.gz'; done; done | sort | uniq | tee links.txt +$ cat links.txt | while read link; do wget http://dumps.wikimedia.org/other/pagecounts-raw/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1/')/$(echo $link | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})[0-9]{2}-[0-9]+\.gz/\1-\2/')/$link; done +$ ls -1 /opt/wikistat/ | grep gz | while read i; do echo $i; gzip -cd /opt/wikistat/$i | ./wikistat-loader --time="$(echo -n $i | sed -r 's/pagecounts-([0-9]{4})([0-9]{2})([0-9]{2})-([0-9]{2})([0-9]{2})([0-9]{2})\.gz/\1-\2-\3 \4-00-00/')" | clickhouse-client --query="INSERT INTO wikistat FORMAT TabSeparated"; done ``` [Оригинальная статья](https://clickhouse.yandex/docs/ru/getting_started/example_datasets/wikistat/) diff --git a/docs/ru/getting_started/index.md b/docs/ru/getting_started/index.md index e3fb2ab0985..822955df9eb 100644 --- a/docs/ru/getting_started/index.md +++ b/docs/ru/getting_started/index.md @@ -18,8 +18,8 @@ $ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not Чтобы установить официальные пакеты, пропишите репозиторий Яндекса в `/etc/apt/sources.list` или в отдельный файл `/etc/apt/sources.list.d/clickhouse.list`: -``` -deb http://repo.yandex.ru/clickhouse/deb/stable/ main/ +```bash +$ deb http://repo.yandex.ru/clickhouse/deb/stable/ main/ ``` Если вы хотите использовать наиболее свежую тестовую, замените `stable` на `testing` (не рекомендуется для production окружений). @@ -27,10 +27,10 @@ deb http://repo.yandex.ru/clickhouse/deb/stable/ main/ Затем для самой установки пакетов выполните: ```bash -sudo apt-get install dirmngr # optional -sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 # optional -sudo apt-get update -sudo apt-get install clickhouse-client clickhouse-server +$ sudo apt-get install dirmngr # optional +$ sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 # optional +$ sudo apt-get update +$ sudo apt-get install clickhouse-client clickhouse-server ``` Также эти пакеты можно скачать и установить вручную отсюда: . 
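Editor's note on the hunk above: the `deb http://repo.yandex.ru/clickhouse/deb/stable/ main/` line is an APT source entry rather than a shell command, so the added `$` prompt is slightly misleading there. A hedged sketch of one way to register it, using the `/etc/apt/sources.list.d/clickhouse.list` file already named in the surrounding text:

```bash
# Write the repository entry into the dedicated sources file, then refresh the package index.
echo "deb http://repo.yandex.ru/clickhouse/deb/stable/ main/" | sudo tee /etc/apt/sources.list.d/clickhouse.list
sudo apt-get update
```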
@@ -41,9 +41,9 @@ sudo apt-get install clickhouse-client clickhouse-server Сначала нужно подключить официальный репозиторий: ```bash -sudo yum install yum-utils -sudo rpm --import https://repo.yandex.ru/clickhouse/CLICKHOUSE-KEY.GPG -sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/x86_64 +$ sudo yum install yum-utils +$ sudo rpm --import https://repo.yandex.ru/clickhouse/CLICKHOUSE-KEY.GPG +$ sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/x86_64 ``` Для использования наиболее свежих версий нужно заменить `stable` на `testing` (рекомендуется для тестовых окружений). @@ -52,7 +52,7 @@ Then run these commands to actually install packages: Для, собственно, установки пакетов необходимо выполнить следующие команды: ```bash -sudo yum install clickhouse-server clickhouse-client +$ sudo yum install clickhouse-server clickhouse-client ``` Также есть возможность установить пакеты вручную, скачав отсюда: . @@ -67,14 +67,14 @@ sudo yum install clickhouse-server clickhouse-client Можно скомпилировать пакеты и установить их, либо использовать программы без установки пакетов. Также при ручой сборке можно отключить необходимость поддержки набора инструкций SSE 4.2 или собрать под процессоры архитектуры AArch64. -``` +```text Client: dbms/programs/clickhouse-client Server: dbms/programs/clickhouse-server ``` Для работы собранного вручную сервера необходимо создать директории для данных и метаданных, а также сделать их `chown` для желаемого пользователя. Пути к этим директориям могут быть изменены в конфигурационном файле сервера (src/dbms/programs/server/config.xml), по умолчанию используются следующие: -``` +```text /opt/clickhouse/data/default/ /opt/clickhouse/metadata/default/ ``` @@ -85,7 +85,7 @@ Server: dbms/programs/clickhouse-server Для запуска сервера в качестве демона, выполните: -``` bash +```bash $ sudo service clickhouse-server start ``` @@ -95,7 +95,7 @@ $ sudo service clickhouse-server start Также можно запустить сервер вручную из консоли: -``` bash +```bash $ clickhouse-server --config-file=/etc/clickhouse-server/config.xml ``` @@ -104,7 +104,7 @@ $ clickhouse-server --config-file=/etc/clickhouse-server/config.xml После запуска сервера, соединиться с ним можно с помощью клиента командной строки: -``` bash +```bash $ clickhouse-client ``` @@ -116,23 +116,19 @@ $ clickhouse-client Пример проверки работоспособности системы: -``` bash +```bash $ ./clickhouse-client ClickHouse client version 0.0.18749. Connecting to localhost:9000. Connected to ClickHouse server version 0.0.18749. - -:) SELECT 1 - +``` +```sql SELECT 1 - +``` +```text ┌─1─┐ │ 1 │ └───┘ - -1 rows in set. Elapsed: 0.003 sec. - -:) ``` **Поздравляем, система работает!** diff --git a/docs/ru/index.md b/docs/ru/index.md index 2db293a36ff..fbb80bfb8a9 100644 --- a/docs/ru/index.md +++ b/docs/ru/index.md @@ -79,22 +79,16 @@ ClickHouse - столбцовая система управления базам Например, для запроса "посчитать количество записей для каждой рекламной системы", требуется прочитать один столбец "идентификатор рекламной системы", который занимает 1 байт в несжатом виде. Если большинство переходов было не с рекламных систем, то можно рассчитывать хотя бы на десятикратное сжатие этого столбца. При использовании быстрого алгоритма сжатия, возможно разжатие данных со скоростью более нескольких гигабайт несжатых данных в секунду. То есть, такой запрос может выполняться со скоростью около нескольких миллиардов строк в секунду на одном сервере. 
На практике, такая скорость действительно достигается.
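A quick back-of-the-envelope check of that claim, added by the editor as a sketch; the 1 byte per value, ~10x compression and a-few-GB/s decompression figures are the assumptions already stated in the paragraph above:

```bash
# 10^9 rows at 1 byte per value  => ~1 GB of uncompressed column data;
# ~10x compression               => ~100 MB actually read from disk;
# ~3 GB/s of uncompressed data   => the scan finishes in roughly a third of a second,
# i.e. on the order of several billion rows per second on a single server.
rows=1000000000
raw_bytes=$rows                     # 1 byte per value
read_bytes=$(( raw_bytes / 10 ))    # assumed 10x compression
echo "uncompressed: ${raw_bytes} bytes, read from disk: ${read_bytes} bytes"
awk -v b="$raw_bytes" 'BEGIN { printf "scan time at 3 GB/s: %.2f s\n", b / 3e9 }'
```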
Пример -``` +```bash $ clickhouse-client ClickHouse client version 0.0.52053. Connecting to localhost:9000. Connected to ClickHouse server version 0.0.52053. - -:) SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 - -SELECT - CounterID, - count() -FROM hits -GROUP BY CounterID -ORDER BY count() DESC -LIMIT 20 - +``` +```sql +SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 +``` +```text ┌─CounterID─┬──count()─┐ │ 114208 │ 56057344 │ │ 115080 │ 51619590 │ @@ -117,10 +111,6 @@ LIMIT 20 │ 115079 │ 8837972 │ │ 337234 │ 8205961 │ └───────────┴──────────┘ - -20 rows in set. Elapsed: 0.153 sec. Processed 1.00 billion rows, 4.00 GB (6.53 billion rows/s., 26.10 GB/s.) - -:) ```
diff --git a/docs/ru/interfaces/cli.md b/docs/ru/interfaces/cli.md index 59980109240..a2b624c6f21 100644 --- a/docs/ru/interfaces/cli.md +++ b/docs/ru/interfaces/cli.md @@ -22,14 +22,14 @@ Connected to ClickHouse server version 0.0.26176. Примеры использования клиента для вставки данных: ```bash -echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; +$ echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; -cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; +$ cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; 3, 'some text', '2016-08-14 00:00:00' 4, 'some more text', '2016-08-14 00:00:01' _EOF -cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; +$ cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; ``` В batch режиме в качестве формата данных по умолчанию используется формат TabSeparated. Формат может быть указан в секции FORMAT запроса. @@ -79,7 +79,7 @@ clickhouse-client --param_parName="[1, 2]" -q "SELECT * FROM table WHERE a = {p Отформатируйте запрос обычным способом. Представьте значения, которые вы хотите передать из параметров приложения в запрос в следующем формате: -``` +```sql {:} ``` @@ -89,7 +89,7 @@ clickhouse-client --param_parName="[1, 2]" -q "SELECT * FROM table WHERE a = {p #### Пример ```bash -clickhouse-client --param_tuple_in_tuple="(10, ('dt', 10))" -q "SELECT * FROM table WHERE val = {tuple_in_tuple:Tuple(UInt8, Tuple(String, UInt8))}" +$ clickhouse-client --param_tuple_in_tuple="(10, ('dt', 10))" -q "SELECT * FROM table WHERE val = {tuple_in_tuple:Tuple(UInt8, Tuple(String, UInt8))}" ``` ## Конфигурирование {#interfaces_cli_configuration} diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 9acf2d67e4a..c76c68a7685 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -50,7 +50,7 @@ ClickHouse может принимать (`INSERT`) и отдавать (`SELECT SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT TabSeparated`` ``` -``` +```text 2014-03-17 1406958 2014-03-18 1383658 2014-03-19 1405797 @@ -82,7 +82,7 @@ SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORD Строки выводятся с экранированием спецсимволов с помощью обратного слеша. При выводе, используются следующие escape-последовательности: `\b`, `\f`, `\r`, `\n`, `\t`, `\0`, `\'`, `\\`. Парсер также поддерживает последовательности `\a`, `\v`, и `\xHH` (последовательности hex escape) и любые последовательности вида `\c`, где `c` — любой символ (такие последовательности преобразуются в `c`). Таким образом, при чтении поддерживаются форматы, где перевод строки может быть записан как `\n` и как `\` и перевод строки. 
Например, строка `Hello world`, где между словами вместо пробела стоит перевод строки, может быть считана в любом из следующих вариантов: -``` +```text Hello\nworld Hello\ @@ -211,7 +211,7 @@ format_schema_rows_between_delimiter = '\n ' ``` Пример ввода: -``` +```text Some header Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1 Page views: 6, User id: 4324182021466249494, Useless field: world, Duration: 185, Sign: 1 @@ -239,7 +239,7 @@ format_schema_rows_between_delimiter = ',' Похож на TabSeparated, но выводит значения в формате name=value. Имена экранируются так же, как строки в формате TabSeparated и, дополнительно, экранируется также символ =. -``` +```text SearchPhrase= count()=8267016 SearchPhrase=интерьер ванной комнаты count()=2166 SearchPhrase=яндекс count()=1655 @@ -258,7 +258,7 @@ SearchPhrase=баку count()=1000 SELECT * FROM t_null FORMAT TSKV ``` -``` +```text x=1 y=\N ``` @@ -274,8 +274,8 @@ x=1 y=\N При форматировании, строки выводятся в двойных кавычках. Двойная кавычка внутри строки выводится как две двойные кавычки подряд. Других правил экранирования нет. Даты и даты-с-временем выводятся в двойных кавычках. Числа выводятся без кавычек. Значения разделяются символом-разделителем, по умолчанию — `,`. Символ-разделитель определяется настройкой [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter). Строки разделяются unix переводом строки (LF). Массивы сериализуются в CSV следующим образом: сначала массив сериализуется в строку, как в формате TabSeparated, а затем полученная строка выводится в CSV в двойных кавычках. Кортежи в формате CSV сериализуются, как отдельные столбцы (то есть, теряется их вложенность в кортеж). -``` -clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv +```bash +$ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv ``` *По умолчанию — `,`. См. настройку [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter) для дополнительной информации. 
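Editor-added illustration of the array and tuple CSV rules described a few lines above (not part of the patch); the expected output simply follows those rules:

```bash
# The array is serialized to its text form and wrapped in double quotes;
# the tuple is written as separate CSV columns, losing its nesting.
clickhouse-client --query="SELECT [1, 2, 3] AS arr, ('hello', 42) AS t FORMAT CSV"
# Expected: "[1,2,3]","hello",42
```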
@@ -460,7 +460,7 @@ ClickHouse заменяет опущенные значения значения Рассмотрим следующую таблицу: -``` +```sql CREATE TABLE IF NOT EXISTS example_table ( x UInt32, @@ -478,7 +478,7 @@ CREATE TABLE IF NOT EXISTS example_table Рассмотрим в качестве примера таблицу `UserActivity`: -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ -1 │ │ 4324182021466249494 │ 6 │ 185 │ 1 │ @@ -487,7 +487,7 @@ CREATE TABLE IF NOT EXISTS example_table Запрос `SELECT * FROM UserActivity FORMAT JSONEachRow` возвращает: -``` +```text {"UserID":"4324182021466249494","PageViews":5,"Duration":146,"Sign":-1} {"UserID":"4324182021466249494","PageViews":6,"Duration":185,"Sign":1} ``` @@ -578,7 +578,7 @@ SELECT * FROM json_each_row_nested ```sql SELECT * FROM t_null ``` -``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ └───┴──────┘ @@ -590,7 +590,7 @@ SELECT * FROM t_null SELECT 'String with \'quotes\' and \t character' AS Escaping_test ``` -``` +```text ┌─Escaping_test────────────────────────┐ │ String with 'quotes' and character │ └──────────────────────────────────────┘ @@ -605,7 +605,7 @@ SELECT 'String with \'quotes\' and \t character' AS Escaping_test SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT PrettyCompact ``` -``` +```text ┌──EventDate─┬───────c─┐ │ 2014-03-17 │ 1406958 │ │ 2014-03-18 │ 1383658 │ @@ -644,7 +644,7 @@ Extremes: Пример: ```bash -watch -n1 "clickhouse-client --query='SELECT event, value FROM system.events FORMAT PrettyCompactNoEscapes'" +$ watch -n1 "clickhouse-client --query='SELECT event, value FROM system.events FORMAT PrettyCompactNoEscapes'" ``` Для отображения в браузере, вы можете использовать HTTP интерфейс. @@ -703,7 +703,7 @@ Array представлены как длина в формате varint (unsig ```sql SELECT * FROM t_null FORMAT Vertical ``` -``` +```text Row 1: ────── x: 1 @@ -716,7 +716,7 @@ y: ᴺᵁᴸᴸ SELECT 'string with \'quotes\' and \t with some special \n characters' AS test FORMAT Vertical ``` -``` +```text Row 1: ────── test: string with 'quotes' and with some special @@ -806,12 +806,12 @@ Cap'n Proto - формат бинарных сообщений, похож на Сообщения Cap'n Proto строго типизированы и не самоописывающиеся, т.е. нуждаются во внешнем описании схемы. Схема применяется "на лету" и кешируется между запросами. 
```bash -cat capnproto_messages.bin | clickhouse-client --query "INSERT INTO test.hits FORMAT CapnProto SETTINGS format_schema='schema:Message'" +$ cat capnproto_messages.bin | clickhouse-client --query "INSERT INTO test.hits FORMAT CapnProto SETTINGS format_schema='schema:Message'" ``` Где `schema.capnp` выглядит следующим образом: -``` +```capnp struct Message { SearchPhrase @0 :Text; c @1 :Uint64; @@ -838,12 +838,12 @@ SELECT * FROM test.table FORMAT Protobuf SETTINGS format_schema = 'schemafile:Me или ```bash -cat protobuf_messages.bin | clickhouse-client --query "INSERT INTO test.table FORMAT Protobuf SETTINGS format_schema='schemafile:MessageType'" +$ cat protobuf_messages.bin | clickhouse-client --query "INSERT INTO test.table FORMAT Protobuf SETTINGS format_schema='schemafile:MessageType'" ``` Где файл `schemafile.proto` может выглядеть так: -``` +```capnp syntax = "proto3"; message MessageType { @@ -860,7 +860,7 @@ message MessageType { Вложенные сообщения поддерживаются, например, для поля `z` в таком сообщении -``` +```capnp message MessageType { message XType { message YType { @@ -877,7 +877,7 @@ ClickHouse попытается найти столбец с именем `x.y.z Значения по умолчанию, определённые в схеме `proto2`, например, -``` +```capnp syntax = "proto2"; message MessageType { @@ -926,14 +926,14 @@ ClickHouse поддерживает настраиваемую точность Чтобы вставить в ClickHouse данные из файла в формате Parquet, выполните команду следующего вида: -``` -cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet" +```bash +$ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet" ``` Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата Parquet, используйте команду следующего вида: -``` -clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_file.pq} +```bash +$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_file.pq} ``` Для обмена данными с экосистемой Hadoop можно использовать движки таблиц [HDFS](../operations/table_engines/hdfs.md) и `URL`. diff --git a/docs/ru/interfaces/http.md b/docs/ru/interfaces/http.md index 77eb984e8f4..c7c32a46a4c 100644 --- a/docs/ru/interfaces/http.md +++ b/docs/ru/interfaces/http.md @@ -76,31 +76,31 @@ $ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @ Создаём таблицу: ```bash -echo 'CREATE TABLE t (a UInt8) ENGINE = Memory' | curl 'http://localhost:8123/' --data-binary @- +$ echo 'CREATE TABLE t (a UInt8) ENGINE = Memory' | curl 'http://localhost:8123/' --data-binary @- ``` Используем привычный запрос INSERT для вставки данных: ```bash -echo 'INSERT INTO t VALUES (1),(2),(3)' | curl 'http://localhost:8123/' --data-binary @- +$ echo 'INSERT INTO t VALUES (1),(2),(3)' | curl 'http://localhost:8123/' --data-binary @- ``` Данные можно отправить отдельно от запроса: ```bash -echo '(4),(5),(6)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20VALUES' --data-binary @- +$ echo '(4),(5),(6)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20VALUES' --data-binary @- ``` Можно указать любой формат для данных. 
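As a hedged illustration of that point (reusing the single-column table `t` created above), an insert in CSV format over HTTP might look like this:

```bash
# CSV variant of the same insert; the value 13 is illustrative
$ echo '13' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20CSV' --data-binary @-
```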
Формат Values - то же, что используется при записи INSERT INTO t VALUES: ```bash -echo '(7),(8),(9)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20Values' --data-binary @- +$ echo '(7),(8),(9)' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20Values' --data-binary @- ``` Можно вставить данные из tab-separated дампа, указав соответствующий формат: ```bash -echo -ne '10\n11\n12\n' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20TabSeparated' --data-binary @- +$ echo -ne '10\n11\n12\n' | curl 'http://localhost:8123/?query=INSERT%20INTO%20t%20FORMAT%20TabSeparated' --data-binary @- ``` Прочитаем содержимое таблицы. Данные выводятся в произвольном порядке из-за параллельной обработки запроса: @@ -124,7 +124,7 @@ $ curl 'http://localhost:8123/?query=SELECT%20a%20FROM%20t' Удаляем таблицу. ```bash -echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @- +$ echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @- ``` Для запросов, которые не возвращают таблицу с данными, в случае успеха, выдаётся пустое тело ответа. @@ -141,11 +141,11 @@ echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @- Примеры отправки данных со сжатием: ```bash -#Отправка данных на сервер: -curl -vsS "http://localhost:8123/?enable_http_compression=1" -d 'SELECT number FROM system.numbers LIMIT 10' -H 'Accept-Encoding: gzip' +$ #Отправка данных на сервер: +$ curl -vsS "http://localhost:8123/?enable_http_compression=1" -d 'SELECT number FROM system.numbers LIMIT 10' -H 'Accept-Encoding: gzip' -#Отправка данных клиенту: -echo "SELECT 1" | gzip -c | curl -sS --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/' +$ #Отправка данных клиенту: +$ echo "SELECT 1" | gzip -c | curl -sS --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/' ``` !!! note "Примечание" @@ -174,13 +174,13 @@ $ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?data 1. С использованием HTTP Basic Authentification. Пример: ```bash -echo 'SELECT 1' | curl 'http://user:password@localhost:8123/' -d @- +$ echo 'SELECT 1' | curl 'http://user:password@localhost:8123/' -d @- ``` 2. В параметрах URL user и password. Пример: ```bash -echo 'SELECT 1' | curl 'http://localhost:8123/?user=user&password=password' -d @- +$ echo 'SELECT 1' | curl 'http://localhost:8123/?user=user&password=password' -d @- ``` Если пользователь не задан,то используется `default`. Если пароль не задан, то используется пустой пароль. @@ -208,7 +208,7 @@ $ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:812 Прогресс выполнения запроса можно отслеживать с помощью заголовков ответа `X-ClickHouse-Progress`. Для этого включите [send_progress_in_http_headers](../operations/settings/settings.md#settings-send_progress_in_http_headers). 
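A hedged way to reproduce these headers locally (the host and the query are illustrative) is to pass the setting as a URL parameter and watch the response headers with `curl -v`:

```bash
# -v prints response headers; the long query only exists to make progress visible
$ curl -vsS 'http://localhost:8123/?send_progress_in_http_headers=1' -d 'SELECT max(number) FROM (SELECT number FROM system.numbers LIMIT 100000000)'
```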
Пример последовательности заголовков: -``` +```text X-ClickHouse-Progress: {"read_rows":"2752512","read_bytes":"240570816","total_rows_to_read":"8880128"} X-ClickHouse-Progress: {"read_rows":"5439488","read_bytes":"482285394","total_rows_to_read":"8880128"} X-ClickHouse-Progress: {"read_rows":"8783786","read_bytes":"819092887","total_rows_to_read":"8880128"} @@ -240,7 +240,7 @@ HTTP интерфейс позволяет передать внешние да Пример: ```bash -curl -sS 'http://localhost:8123/?max_result_bytes=4000000&buffer_size=3000000&wait_end_of_query=1' -d 'SELECT toUInt8(number) FROM system.numbers LIMIT 9000000 FORMAT RowBinary' +$ curl -sS 'http://localhost:8123/?max_result_bytes=4000000&buffer_size=3000000&wait_end_of_query=1' -d 'SELECT toUInt8(number) FROM system.numbers LIMIT 9000000 FORMAT RowBinary' ``` Буферизация позволяет избежать ситуации когда код ответа и HTTP-заголовки были отправлены клиенту, после чего возникла ошибка выполнения запроса. В такой ситуации сообщение об ошибке записывается в конце тела ответа, и на стороне клиента ошибка может быть обнаружена только на этапе парсинга. @@ -252,7 +252,7 @@ curl -sS 'http://localhost:8123/?max_result_bytes=4000000&buffer_size=3000000&wa ### Пример ```bash -curl -sS "
?param_id=2&param_phrase=test" -d "SELECT * FROM table WHERE int_column = {id:UInt8} and string_column = {phrase:String}" +$ curl -sS "
?param_id=2¶m_phrase=test" -d "SELECT * FROM table WHERE int_column = {id:UInt8} and string_column = {phrase:String}" ``` [Оригинальная статья](https://clickhouse.yandex/docs/ru/interfaces/http_interface/) diff --git a/docs/ru/operations/configuration_files.md b/docs/ru/operations/configuration_files.md index 9514734678d..78b803775e1 100644 --- a/docs/ru/operations/configuration_files.md +++ b/docs/ru/operations/configuration_files.md @@ -19,8 +19,10 @@ В `config.xml` может быть указан отдельный конфиг с настройками пользователей, профилей и квот. Относительный путь к нему указывается в элементе users_config. По умолчанию - `users.xml`. Если `users_config` не указан, то настройки пользователей, профилей и квот, указываются непосредственно в `config.xml`. Для `users_config` могут также существовать переопределения в файлах из директории `users_config.d` (например, `users.d`) и подстановки. Например, можно иметь по отдельному конфигурационному файлу для каждого пользователя: -``` xml +```bash $ cat /etc/clickhouse-server/users.d/alice.xml +``` +```xml diff --git a/docs/ru/operations/settings/constraints_on_settings.md b/docs/ru/operations/settings/constraints_on_settings.md index 50c45a8356a..5a4442295d2 100644 --- a/docs/ru/operations/settings/constraints_on_settings.md +++ b/docs/ru/operations/settings/constraints_on_settings.md @@ -3,7 +3,7 @@ Ограничения на изменение настроек могут находиться внутри секции `users` файла `user.xml` и запрещают пользователю менять некоторые настройки с помощью запроса `SET`. Выглядит это следующим образом: -``` +```xml <имя_пользователя> @@ -31,7 +31,7 @@ **Пример:** Пусть файл `users.xml` содержит строки: -``` +```xml 10000000000 @@ -52,13 +52,13 @@ Каждый из следующих запросов кинет исключение: -``` +```sql SET max_memory_usage=20000000001; SET max_memory_usage=4999999999; SET force_index_by_date=1; ``` -``` +```text Code: 452, e.displayText() = DB::Exception: Setting max_memory_usage should not be greater than 20000000000. Code: 452, e.displayText() = DB::Exception: Setting max_memory_usage should not be less than 5000000000. Code: 452, e.displayText() = DB::Exception: Setting force_index_by_date should not be changed. diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 20017e88af4..3453d8e3d7e 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -175,7 +175,8 @@ ClickHouse применяет настройку в тех случаях, ко ```sql SET input_format_values_interpret_expressions = 0; INSERT INTO datetime_t VALUES (now()) - +``` +```text Exception on client: Code: 27. DB::Exception: Cannot parse input: expected ) before: now()): (at row 1) ``` @@ -183,7 +184,8 @@ Code: 27. DB::Exception: Cannot parse input: expected ) before: now()): (at row ```sql SET input_format_values_interpret_expressions = 1; INSERT INTO datetime_t VALUES (now()) - +``` +```text Ok. ``` @@ -192,7 +194,8 @@ Ok. ```sql SET input_format_values_interpret_expressions = 0; INSERT INTO datetime_t SELECT now() - +``` +```text Ok. 
``` @@ -604,7 +607,7 @@ ClickHouse поддерживает следующие алгоритмы выб ### Random (by default) {#load_balancing-random} -``` +```sql load_balancing = random ``` @@ -613,7 +616,7 @@ load_balancing = random ### Nearest Hostname {#load_balancing-nearest_hostname} -``` +```sql load_balancing = nearest_hostname ``` @@ -627,7 +630,7 @@ load_balancing = nearest_hostname ### In Order {#load_balancing-in_order} -``` +```sql load_balancing = in_order ``` @@ -636,7 +639,7 @@ load_balancing = in_order ### First or Random {#load_balancing-first_or_random} -``` +```sql load_balancing = first_or_random ``` diff --git a/docs/ru/operations/settings/settings_profiles.md b/docs/ru/operations/settings/settings_profiles.md index 212e577faaf..a120c388880 100644 --- a/docs/ru/operations/settings/settings_profiles.md +++ b/docs/ru/operations/settings/settings_profiles.md @@ -8,7 +8,7 @@ Установить профиль `web`. -``` sql +```sql SET profile = 'web' ``` diff --git a/docs/ru/operations/settings/settings_users.md b/docs/ru/operations/settings/settings_users.md index 85233f19baa..adb50a02a9d 100644 --- a/docs/ru/operations/settings/settings_users.md +++ b/docs/ru/operations/settings/settings_users.md @@ -4,7 +4,7 @@ Структура раздела `users`: -``` +```xml @@ -79,7 +79,7 @@ Чтобы открыть доступ пользователю из любой сети, укажите: -``` +```xml ::/0 ``` @@ -88,7 +88,7 @@ Чтобы открыть только локальный доступ, укажите: -``` +```xml ::1 127.0.0.1 ``` @@ -111,7 +111,7 @@ Следующая конфигурация задаёт, что пользователь `user1` в результате запросов `SELECT` может получать только те строки `table1`, в которых значение поля `id` равно 1000. -``` +```xml diff --git a/docs/ru/operations/system_tables.md b/docs/ru/operations/system_tables.md index 4c66664b921..3aa77776c44 100644 --- a/docs/ru/operations/system_tables.md +++ b/docs/ru/operations/system_tables.md @@ -47,7 +47,7 @@ SELECT * FROM system.asynchronous_metrics LIMIT 10 Содержит информацию о доступных в конфигурационном файле кластерах и серверах, которые в них входят. Столбцы: -``` +```text cluster String — имя кластера. shard_num UInt32 — номер шарда в кластере, начиная с 1. shard_weight UInt32 — относительный вес шарда при записи данных @@ -356,7 +356,7 @@ SELECT * FROM system.metrics LIMIT 10 Эта системная таблица используется для реализации запроса `SHOW PROCESSLIST`. Столбцы: -``` +```text user String - имя пользователя, который задал запрос. При распределённой обработке запроса, относится к пользователю, с помощью которого сервер-инициатор запроса отправил запрос на данный сервер, а не к имени пользователя, который задал распределённый запрос на сервер-инициатор запроса. address String - IP-адрес, с которого задан запрос. При распределённой обработке запроса, аналогично. 
@@ -480,7 +480,7 @@ WHERE table = 'visits' FORMAT Vertical ``` -``` +```text Row 1: ────── database: merge @@ -506,7 +506,7 @@ active_replicas: 2 Столбцы: -``` +```text database: имя БД table: имя таблицы engine: имя движка таблицы @@ -598,7 +598,7 @@ WHERE Столбцы: -``` +```text name String - имя настройки value String - значение настройки changed UInt8 - была ли настройка явно задана в конфиге или изменена явным образом @@ -612,7 +612,7 @@ FROM system.settings WHERE changed ``` -``` +```text ┌─name───────────────────┬─value───────┬─changed─┐ │ max_threads │ 8 │ 1 │ │ use_uncompressed_cache │ 0 │ 1 │ @@ -681,7 +681,7 @@ WHERE path = '/clickhouse/tables/01-08/visits/replicas' FORMAT Vertical ``` -``` +```text Row 1: ────── name: example01-08-1.yandex.ru diff --git a/docs/ru/operations/table_engines/aggregatingmergetree.md b/docs/ru/operations/table_engines/aggregatingmergetree.md index 97ddee14714..fa452829a5b 100644 --- a/docs/ru/operations/table_engines/aggregatingmergetree.md +++ b/docs/ru/operations/table_engines/aggregatingmergetree.md @@ -11,7 +11,7 @@ ## Создание таблицы -``` +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -60,7 +60,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Создаём материализованное представление типа `AggregatingMergeTree`, следящее за таблицей `test.visits`: -``` sql +```sql CREATE MATERIALIZED VIEW test.basic ENGINE = AggregatingMergeTree() PARTITION BY toYYYYMM(StartDate) ORDER BY (CounterID, StartDate) AS SELECT @@ -74,7 +74,7 @@ GROUP BY CounterID, StartDate; Вставляем данные в таблицу `test.visits`: -``` sql +```sql INSERT INTO test.visits ... ``` @@ -82,7 +82,7 @@ INSERT INTO test.visits ... Чтобы получить агрегированные данные, выполним запрос вида `SELECT ... GROUP BY ...` из представления `test.basic`: -``` sql +```sql SELECT StartDate, sumMerge(Visits) AS Visits, diff --git a/docs/ru/operations/table_engines/buffer.md b/docs/ru/operations/table_engines/buffer.md index abc20b6a88c..bf3c1b450fc 100644 --- a/docs/ru/operations/table_engines/buffer.md +++ b/docs/ru/operations/table_engines/buffer.md @@ -2,7 +2,7 @@ Буферизует записываемые данные в оперативке, периодически сбрасывая их в другую таблицу. При чтении, производится чтение данных одновременно из буфера и из другой таблицы. -``` +```sql Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes) ``` @@ -22,7 +22,7 @@ min_bytes, max_bytes - условие на количество байт в бу Пример: -``` sql +```sql CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10, 100, 10000, 1000000, 10000000, 100000000) ``` diff --git a/docs/ru/operations/table_engines/collapsingmergetree.md b/docs/ru/operations/table_engines/collapsingmergetree.md index 17e85bcca0c..8260ce1bc76 100644 --- a/docs/ru/operations/table_engines/collapsingmergetree.md +++ b/docs/ru/operations/table_engines/collapsingmergetree.md @@ -65,7 +65,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Например, мы хотим рассчитать, сколько страниц проверили пользователи на каком-то сайте и как долго они там находились. 
В какой-то момент времени мы пишем следующую строку с состоянием действий пользователя: -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┘ @@ -73,7 +73,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Через некоторое время мы регистрируем изменение активности пользователя и записываем его следующими двумя строками. -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ -1 │ │ 4324182021466249494 │ 6 │ 185 │ 1 │ @@ -86,7 +86,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Поскольку нам нужно только последнее состояние активности пользователя, строки -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ │ 4324182021466249494 │ 5 │ 146 │ -1 │ @@ -134,7 +134,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Исходные данные: -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ │ 4324182021466249494 │ 5 │ 146 │ -1 │ @@ -170,11 +170,11 @@ INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1),(4324182021466249494, Получение данных: -``` +```sql SELECT * FROM UAct ``` -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ -1 │ │ 4324182021466249494 │ 6 │ 185 │ 1 │ @@ -200,7 +200,7 @@ GROUP BY UserID HAVING sum(Sign) > 0 ``` -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┐ │ 4324182021466249494 │ 6 │ 185 │ └─────────────────────┴───────────┴──────────┘ @@ -212,7 +212,7 @@ HAVING sum(Sign) > 0 SELECT * FROM UAct FINAL ``` -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 6 │ 185 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┘ @@ -224,7 +224,7 @@ SELECT * FROM UAct FINAL Исходные данные: -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ │ 4324182021466249494 │ -5 │ -146 │ -1 │ @@ -253,28 +253,39 @@ insert into UAct values(4324182021466249494, -5, -146, -1); insert into UAct values(4324182021466249494, 6, 185, 1); select * from UAct final; // старайтесь не использовать final (он подходит только для тестов и маленьких таблиц) +``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 6 │ 185 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┘ - +``` +```sql SELECT UserID, sum(PageViews) AS PageViews, sum(Duration) AS Duration FROM UAct GROUP BY UserID +``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┐ │ 4324182021466249494 │ 6 │ 185 │ └─────────────────────┴───────────┴──────────┘ - +``` +```sql select count() FROM UAct +``` +```text ┌─count()─┐ │ 3 │ └─────────┘ - +``` +```sql optimize table UAct final; select * FROM UAct +``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐ │ 4324182021466249494 │ 6 │ 185 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┘ diff --git a/docs/ru/operations/table_engines/custom_partitioning_key.md b/docs/ru/operations/table_engines/custom_partitioning_key.md index 87f7ea50c63..55904d7dbe0 100644 --- a/docs/ru/operations/table_engines/custom_partitioning_key.md +++ b/docs/ru/operations/table_engines/custom_partitioning_key.md @@ -6,7 +6,7 @@ Ключ партиционирования задается при [создании таблицы](mergetree.md#table_engine-mergetree-creating-a-table), в секции `PARTITION BY expr`. 
Ключ может представлять собой произвольное выражение из столбцов таблицы. Например, чтобы задать партиционирования по месяцам, можно использовать выражение `toYYYYMM(date_column)`: -``` sql +```sql CREATE TABLE visits ( VisitDate Date, @@ -20,7 +20,7 @@ ORDER BY Hour Ключом партиционирования также может быть кортеж из выражений (аналогично [первичному ключу](mergetree.md#primary-keys-and-indexes-in-queries)). Например: -``` sql +```sql ENGINE = ReplicatedCollapsingMergeTree('/clickhouse/tables/name', 'replica1', Sign) PARTITION BY (toMonday(StartDate), EventType) ORDER BY (CounterID, StartDate, intHash32(UserID)); @@ -44,7 +44,7 @@ FROM system.parts WHERE table = 'visits' ``` -``` +```text ┌─partition─┬─name───────────┬─active─┐ │ 201901 │ 201901_1_3_1 │ 0 │ │ 201901 │ 201901_1_9_2 │ 1 │ @@ -80,7 +80,7 @@ WHERE table = 'visits' OPTIMIZE TABLE visits PARTITION 201902; ``` -``` +```text ┌─partition─┬─name───────────┬─active─┐ │ 201901 │ 201901_1_3_1 │ 0 │ │ 201901 │ 201901_1_9_2 │ 1 │ @@ -98,7 +98,7 @@ OPTIMIZE TABLE visits PARTITION 201902; Другой способ посмотреть набор кусков и партиций – зайти в директорию с данными таблицы: `/var/lib/clickhouse/data//
/`. Например: ```bash -dev:/var/lib/clickhouse/data/default/visits$ ls -l +/var/lib/clickhouse/data/default/visits$ ls -l total 40 drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 201901_1_3_1 drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201901_1_9_2 diff --git a/docs/ru/operations/table_engines/dictionary.md b/docs/ru/operations/table_engines/dictionary.md index 650ad9466a7..d24dca27a07 100644 --- a/docs/ru/operations/table_engines/dictionary.md +++ b/docs/ru/operations/table_engines/dictionary.md @@ -38,9 +38,7 @@ Запрос данных словаря: -``` sql -select name, type, key, attribute.names, attribute.types, bytes_allocated, element_count,source from system.dictionaries where name = 'products'; - +```sql SELECT name, type, @@ -53,7 +51,7 @@ SELECT FROM system.dictionaries WHERE name = 'products' ``` -``` +```text ┌─name─────┬─type─┬─key────┬─attribute.names─┬─attribute.types─┬─bytes_allocated─┬─element_count─┬─source──────────┐ │ products │ Flat │ UInt64 │ ['title'] │ ['String'] │ 23065376 │ 175032 │ ODBC: .products │ └──────────┴──────┴────────┴─────────────────┴─────────────────┴─────────────────┴───────────────┴─────────────────┘ @@ -65,45 +63,27 @@ WHERE name = 'products' Синтаксис: -``` +```sql CREATE TABLE %table_name% (%fields%) engine = Dictionary(%dictionary_name%)` ``` Пример использования: -``` sql +```sql create table products (product_id UInt64, title String) Engine = Dictionary(products); - -CREATE TABLE products -( - product_id UInt64, - title String, -) -ENGINE = Dictionary(products) -``` -``` -Ok. - -0 rows in set. Elapsed: 0.004 sec. ``` Проверим что у нас в таблице? -``` sql +```sql select * from products limit 1; - -SELECT * -FROM products -LIMIT 1 ``` -``` +```text ┌────product_id─┬─title───────────┐ │ 152689 │ Some item │ └───────────────┴─────────────────┘ - -1 rows in set. Elapsed: 0.006 sec. ``` [Оригинальная статья](https://clickhouse.yandex/docs/ru/operations/table_engines/dictionary/) diff --git a/docs/ru/operations/table_engines/distributed.md b/docs/ru/operations/table_engines/distributed.md index be3dde014cf..52743f72703 100644 --- a/docs/ru/operations/table_engines/distributed.md +++ b/docs/ru/operations/table_engines/distributed.md @@ -6,7 +6,7 @@ Движок Distributed принимает параметры: имя кластера в конфигурационном файле сервера, имя удалённой базы данных, имя удалённой таблицы, а также (не обязательно) ключ шардирования. 
Пример: -``` +```sql Distributed(logs, default, hits[, sharding_key]) ``` diff --git a/docs/ru/operations/table_engines/external_data.md b/docs/ru/operations/table_engines/external_data.md index 430f90a82ae..358ad7ff9c9 100644 --- a/docs/ru/operations/table_engines/external_data.md +++ b/docs/ru/operations/table_engines/external_data.md @@ -33,9 +33,9 @@ ClickHouse позволяет отправить на сервер данные, Примеры: ```bash -echo -ne "1\n2\n3\n" | clickhouse-client --query="SELECT count() FROM test.visits WHERE TraficSourceID IN _data" --external --file=- --types=Int8 +$ echo -ne "1\n2\n3\n" | clickhouse-client --query="SELECT count() FROM test.visits WHERE TraficSourceID IN _data" --external --file=- --types=Int8 849897 -cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, count() AS c FROM passwd GROUP BY shell ORDER BY c DESC" --external --file=- --name=passwd --structure='login String, unused String, uid UInt16, gid UInt16, comment String, home String, shell String' +$ cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, count() AS c FROM passwd GROUP BY shell ORDER BY c DESC" --external --file=- --name=passwd --structure='login String, unused String, uid UInt16, gid UInt16, comment String, home String, shell String' /bin/sh 20 /bin/false 5 /bin/bash 4 @@ -48,9 +48,9 @@ cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, coun Пример: ```bash -cat /etc/passwd | sed 's/:/\t/g' > passwd.tsv +$ cat /etc/passwd | sed 's/:/\t/g' > passwd.tsv -curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+count()+AS+c+FROM+passwd+GROUP+BY+shell+ORDER+BY+c+DESC&passwd_structure=login+String,+unused+String,+uid+UInt16,+gid+UInt16,+comment+String,+home+String,+shell+String' +$ curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+count()+AS+c+FROM+passwd+GROUP+BY+shell+ORDER+BY+c+DESC&passwd_structure=login+String,+unused+String,+uid+UInt16,+gid+UInt16,+comment+String,+home+String,+shell+String' /bin/sh 20 /bin/false 5 /bin/bash 4 diff --git a/docs/ru/operations/table_engines/file.md b/docs/ru/operations/table_engines/file.md index b67823b988a..bb8f831235f 100644 --- a/docs/ru/operations/table_engines/file.md +++ b/docs/ru/operations/table_engines/file.md @@ -10,7 +10,7 @@ ## Использование движка в сервере ClickHouse -``` +```sql File(Format) ``` @@ -29,7 +29,7 @@ File(Format) **1.** Создадим на сервере таблицу `file_engine_table`: -``` sql +```sql CREATE TABLE file_engine_table (name String, value UInt32) ENGINE=File(TabSeparated) ``` @@ -45,11 +45,11 @@ two 2 **3.** Запросим данные: -``` sql +```sql SELECT * FROM file_engine_table ``` -``` +```text ┌─name─┬─value─┐ │ one │ 1 │ │ two │ 2 │ diff --git a/docs/ru/operations/table_engines/graphitemergetree.md b/docs/ru/operations/table_engines/graphitemergetree.md index f58d4b3d047..40948512a2c 100644 --- a/docs/ru/operations/table_engines/graphitemergetree.md +++ b/docs/ru/operations/table_engines/graphitemergetree.md @@ -76,7 +76,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Структура конфигурации rollup: -``` +```text required-columns patterns ``` @@ -92,7 +92,7 @@ patterns Структура раздела `patterns`: -``` +```text pattern regexp function diff --git a/docs/ru/operations/table_engines/hdfs.md b/docs/ru/operations/table_engines/hdfs.md index 3f42c9ec447..69ec47a6282 100644 --- a/docs/ru/operations/table_engines/hdfs.md +++ b/docs/ru/operations/table_engines/hdfs.md @@ -4,7 +4,7 @@ ## Использование движка -``` +```sql ENGINE = 
HDFS(URI, format) ``` @@ -15,22 +15,22 @@ ENGINE = HDFS(URI, format) **1.** Создадим на сервере таблицу `hdfs_engine_table`: -``` sql +```sql CREATE TABLE hdfs_engine_table (name String, value UInt32) ENGINE=HDFS('hdfs://hdfs1:9000/other_storage', 'TSV') ``` **2.** Заполним файл: -``` sql +```sql INSERT INTO hdfs_engine_table VALUES ('one', 1), ('two', 2), ('three', 3) ``` **3.** Запросим данные: -``` sql +```sql SELECT * FROM hdfs_engine_table LIMIT 2 ``` -``` +```text ┌─name─┬─value─┐ │ one │ 1 │ │ two │ 2 │ diff --git a/docs/ru/operations/table_engines/jdbc.md b/docs/ru/operations/table_engines/jdbc.md index 97c52f7bfd9..08b1bef87d5 100644 --- a/docs/ru/operations/table_engines/jdbc.md +++ b/docs/ru/operations/table_engines/jdbc.md @@ -27,7 +27,7 @@ ENGINE = JDBC(dbms_uri, external_database, external_table) Создадим таблицу в на сервере MySQL с помощью консольного клиента MySQL: -``` +```text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, @@ -50,30 +50,28 @@ mysql> select * from test; Создадим таблицу на сервере ClickHouse и получим из неё данные: -``` +```sql CREATE TABLE jdbc_table ENGINE JDBC('jdbc:mysql://localhost:3306/?user=root&password=root', 'test', 'test') - -Ok. - +``` +```sql DESCRIBE TABLE jdbc_table - +``` +```text ┌─name───────────────┬─type───────────────┬─default_type─┬─default_expression─┐ │ int_id │ Int32 │ │ │ │ int_nullable │ Nullable(Int32) │ │ │ │ float │ Float32 │ │ │ │ float_nullable │ Nullable(Float32) │ │ │ └────────────────────┴────────────────────┴──────────────┴────────────────────┘ - -10 rows in set. Elapsed: 0.031 sec. - +``` +```sql SELECT * FROM jdbc_table - +``` +```text ┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ │ 1 │ ᴺᵁᴸᴸ │ 2 │ ᴺᵁᴸᴸ │ └────────┴──────────────┴───────┴────────────────┘ - -1 rows in set. Elapsed: 0.055 sec. ``` ## Смотрите также diff --git a/docs/ru/operations/table_engines/join.md b/docs/ru/operations/table_engines/join.md index 0de007b321d..75676bdfa0a 100644 --- a/docs/ru/operations/table_engines/join.md +++ b/docs/ru/operations/table_engines/join.md @@ -4,7 +4,7 @@ ## Создание таблицы -``` +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], diff --git a/docs/ru/operations/table_engines/kafka.md b/docs/ru/operations/table_engines/kafka.md index f2318d824e2..b79520cfd61 100644 --- a/docs/ru/operations/table_engines/kafka.md +++ b/docs/ru/operations/table_engines/kafka.md @@ -10,7 +10,7 @@ Kafka позволяет: ## Создание таблицы {#table_engine-kafka-creating-a-table} -``` +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -77,7 +77,7 @@ SETTINGS !!! attention Не используйте этот метод в новых проектах. По возможности переключите старые проекты на метод, описанный выше. 
-``` +```sql Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format [, kafka_row_delimiter, kafka_schema, kafka_num_consumers, kafka_skip_broken_messages]) ``` @@ -125,7 +125,7 @@ Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format Чтобы остановить получение данных топика или изменить логику преобразования, отсоедините материализованное представление: -``` +```sql DETACH TABLE consumer; ATTACH MATERIALIZED VIEW consumer; ``` diff --git a/docs/ru/operations/table_engines/merge.md b/docs/ru/operations/table_engines/merge.md index b87bfdafd75..acd7618f9a0 100644 --- a/docs/ru/operations/table_engines/merge.md +++ b/docs/ru/operations/table_engines/merge.md @@ -7,7 +7,7 @@ Пример: -``` +```sql Merge(hits, '^WatchLog') ``` @@ -27,7 +27,7 @@ Merge(hits, '^WatchLog') Пусть есть старая таблица `WatchLog_old`. Необходимо изменить партиционирование без перемещения данных в новую таблицу `WatchLog_new`. При этом в выборке должны участвовать данные обеих таблиц. -``` +```sql CREATE TABLE WatchLog_old(date Date, UserId Int64, EventType String, Cnt UInt64) ENGINE=MergeTree(date, (UserId, EventType), 8192); INSERT INTO WatchLog_old VALUES ('2018-01-01', 1, 'hit', 3); @@ -40,14 +40,14 @@ CREATE TABLE WatchLog as WatchLog_old ENGINE=Merge(currentDatabase(), '^WatchLog SELECT * FROM WatchLog - +``` +```text ┌───────date─┬─UserId─┬─EventType─┬─Cnt─┐ │ 2018-01-01 │ 1 │ hit │ 3 │ └────────────┴────────┴───────────┴─────┘ ┌───────date─┬─UserId─┬─EventType─┬─Cnt─┐ │ 2018-01-02 │ 2 │ hit │ 3 │ └────────────┴────────┴───────────┴─────┘ - ``` ## Виртуальные столбцы diff --git a/docs/ru/operations/table_engines/mergetree.md b/docs/ru/operations/table_engines/mergetree.md index c1fb3dc9edd..3215b1dbd08 100644 --- a/docs/ru/operations/table_engines/mergetree.md +++ b/docs/ru/operations/table_engines/mergetree.md @@ -115,7 +115,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **Пример** -``` +```sql MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID)), 8192) ``` @@ -139,7 +139,7 @@ MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID) Рассмотрим первичный ключ — `(CounterID, Date)`. В этом случае сортировку и индекс можно проиллюстрировать следующим образом: -``` +```text Whole data: [-------------------------------------------------------------------------] CounterID: [aaaaaaaaaaaaaaaaaabbbbcdeeeeeeeeeeeeefgggggggghhhhhhhhhiiiiiiiiikllllllll] Date: [1111111222222233331233211111222222333211111112122222223111112223311122333] @@ -210,7 +210,7 @@ ClickHouse не требует уникального первичного кл Рассмотрим движок сконфигурированный следующим образом: -``` +```sql ENGINE MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate) SETTINGS index_granularity=8192 ``` @@ -309,14 +309,14 @@ INDEX b (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100) GRANULARIT Таблица должна иметь столбец типа [Date](../../data_types/date.md) или [DateTime](../../data_types/datetime.md). Для установки времени жизни данных, следует использовать операцию со столбцом с временем, например: -``` +```sql TTL time_column TTL time_column + interval ``` Чтобы задать `interval`, используйте операторы [интервала времени](../../query_language/operators.md#operators-datetime). 
-``` +```sql TTL date_time + INTERVAL 1 MONTH TTL date_time + INTERVAL 15 HOUR ``` diff --git a/docs/ru/operations/table_engines/mysql.md b/docs/ru/operations/table_engines/mysql.md index 4f904959ef2..f35e2113bd1 100644 --- a/docs/ru/operations/table_engines/mysql.md +++ b/docs/ru/operations/table_engines/mysql.md @@ -44,7 +44,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Таблица в MySQL: -``` +```text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, diff --git a/docs/ru/operations/table_engines/odbc.md b/docs/ru/operations/table_engines/odbc.md index c3abd18aaa3..34d6bec239d 100644 --- a/docs/ru/operations/table_engines/odbc.md +++ b/docs/ru/operations/table_engines/odbc.md @@ -8,7 +8,7 @@ ## Создание таблицы -``` +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( name1 [type1], @@ -41,15 +41,17 @@ ENGINE = ODBC(connection_settings, external_database, external_table) По умолчанию (если установлен из пакетов) ClickHouse запускается от имени пользователя `clickhouse`. Таким образом, вам нужно создать и настроить этого пользователя на сервере MySQL. +```bash +$ sudo mysql ``` -sudo mysql +```sql mysql> CREATE USER 'clickhouse'@'localhost' IDENTIFIED BY 'clickhouse'; mysql> GRANT ALL PRIVILEGES ON *.* TO 'clickhouse'@'clickhouse' WITH GRANT OPTION; ``` Теперь настроим соединение в `/etc/odbc.ini`. -``` +```bash $ cat /etc/odbc.ini [mysqlconn] DRIVER = /usr/local/lib/libmyodbc5w.so @@ -62,8 +64,8 @@ PASSWORD = clickhouse Вы можете проверить соединение с помощью утилиты `isql` из установки unixODBC. -``` -isql -v mysqlconn +```bash +$ isql -v mysqlconn +---------------------------------------+ | Connected! | | | @@ -72,7 +74,7 @@ isql -v mysqlconn Таблица в MySQL: -``` +```text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, diff --git a/docs/ru/operations/table_engines/replication.md b/docs/ru/operations/table_engines/replication.md index b94cf97762e..3076225feb3 100644 --- a/docs/ru/operations/table_engines/replication.md +++ b/docs/ru/operations/table_engines/replication.md @@ -154,7 +154,7 @@ CREATE TABLE table_name Для запуска восстановления, создайте в ZooKeeper узел `/path_to_table/replica_name/flags/force_restore_data` с любым содержимым или выполните команду для восстановления всех реплицируемых таблиц: ```bash -sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data +$ sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data ``` Затем запустите сервер. При старте, сервер удалит эти флаги и запустит восстановление. 
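For the first, per-table option above (creating the ZooKeeper node by hand), a hedged sketch with the stock ZooKeeper CLI could look like this; the client path, server address and table path are assumptions to adapt to your setup:

```bash
# zkCli.sh location, the ZooKeeper address and the znode path are placeholders
$ /usr/share/zookeeper/bin/zkCli.sh -server localhost:2181 create /path_to_table/replica_name/flags/force_restore_data ""
```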
diff --git a/docs/ru/operations/table_engines/stripelog.md b/docs/ru/operations/table_engines/stripelog.md index c4ab87fc855..7e80f79255f 100644 --- a/docs/ru/operations/table_engines/stripelog.md +++ b/docs/ru/operations/table_engines/stripelog.md @@ -6,7 +6,7 @@ ## Создание таблицы {#table_engines-stripelog-creating-a-table} -``` +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( column1_name [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], @@ -61,7 +61,7 @@ ClickHouse использует несколько потоков при выб SELECT * FROM stripe_log_table ``` -``` +```text ┌───────────timestamp─┬─message_type─┬─message────────────────────┐ │ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ │ 2019-01-18 14:34:53 │ WARNING │ The first warning message │ @@ -77,7 +77,7 @@ SELECT * FROM stripe_log_table SELECT * FROM stripe_log_table ORDER BY timestamp ``` -``` +```text ┌───────────timestamp─┬─message_type─┬─message────────────────────┐ │ 2019-01-18 14:23:43 │ REGULAR │ The first regular message │ │ 2019-01-18 14:27:32 │ REGULAR │ The second regular message │ diff --git a/docs/ru/operations/table_engines/summingmergetree.md b/docs/ru/operations/table_engines/summingmergetree.md index 9ca1052fe5e..05d20ba60a6 100644 --- a/docs/ru/operations/table_engines/summingmergetree.md +++ b/docs/ru/operations/table_engines/summingmergetree.md @@ -68,8 +68,8 @@ ORDER BY key Добавим в неё данные: -``` -:) INSERT INTO summtt Values(1,1),(1,2),(2,1) +```sql +INSERT INTO summtt Values(1,1),(1,2),(2,1) ``` ClickHouse может не полностью просуммировать все строки ([смотрите ниже по тексту](#obrabotka-dannykh)), поэтому при запросе мы используем агрегатную функцию `sum` и секцию `GROUP BY`. @@ -77,7 +77,7 @@ ClickHouse может не полностью просуммировать вс ```sql SELECT key, sum(value) FROM summtt GROUP BY key ``` -``` +```text ┌─key─┬─sum(value)─┐ │ 2 │ 1 │ │ 1 │ 3 │ @@ -118,7 +118,7 @@ ClickHouse может слить куски данных таким образо Примеры: -``` +```text [(1, 100)] + [(2, 150)] -> [(1, 100), (2, 150)] [(1, 100)] + [(1, 150)] -> [(1, 250)] [(1, 100)] + [(1, 150), (2, 150)] -> [(1, 250), (2, 150)] diff --git a/docs/ru/operations/table_engines/url.md b/docs/ru/operations/table_engines/url.md index f67242ee870..8e8313e814e 100644 --- a/docs/ru/operations/table_engines/url.md +++ b/docs/ru/operations/table_engines/url.md @@ -21,7 +21,7 @@ **1.** Создадим на сервере таблицу `url_engine_table`: -``` sql +```sql CREATE TABLE url_engine_table (word String, value UInt64) ENGINE=URL('http://127.0.0.1:12345/', CSV) ``` @@ -46,16 +46,16 @@ if __name__ == "__main__": ``` ```bash -python3 server.py +$ python3 server.py ``` **3.** Запросим данные: -``` sql +```sql SELECT * FROM url_engine_table ``` -``` +```text ┌─word──┬─value─┐ │ Hello │ 1 │ │ World │ 2 │ diff --git a/docs/ru/operations/table_engines/versionedcollapsingmergetree.md b/docs/ru/operations/table_engines/versionedcollapsingmergetree.md index 43dfd083630..0de677e726b 100644 --- a/docs/ru/operations/table_engines/versionedcollapsingmergetree.md +++ b/docs/ru/operations/table_engines/versionedcollapsingmergetree.md @@ -28,7 +28,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] **Параметры движка** -``` +```sql VersionedCollapsingMergeTree(sign, version) ``` @@ -80,7 +80,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Например, мы хотим рассчитать, сколько страниц пользователи посетили на каком-либо сайте и как долго они там находились. 
В какой-то момент времени мы записываем следующую строку состояния пользовательской активности: -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | └─────────────────────┴───────────┴──────────┴──────┴─────────┘ @@ -88,7 +88,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Через некоторое время мы регистрируем изменение активности пользователя и записываем его следующими двумя строками. -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | │ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 | @@ -101,7 +101,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Поскольку нам нужно только последнее состояние активности пользователя, строки -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | │ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | @@ -138,7 +138,7 @@ ClickHouse не гарантирует, что все строки с одина Данные для примера: -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 | │ 4324182021466249494 │ 5 │ 146 │ -1 │ 1 | @@ -175,11 +175,11 @@ INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1, 1),(432418202146624949 Получение данных: -``` +```sql SELECT * FROM UAct ``` -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 5 │ 146 │ 1 │ 1 │ └─────────────────────┴───────────┴──────────┴──────┴─────────┘ @@ -206,7 +206,7 @@ GROUP BY UserID, Version HAVING sum(Sign) > 0 ``` -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Version─┐ │ 4324182021466249494 │ 6 │ 185 │ 2 │ └─────────────────────┴───────────┴──────────┴─────────┘ @@ -218,7 +218,7 @@ HAVING sum(Sign) > 0 SELECT * FROM UAct FINAL ``` -``` +```text ┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┬─Version─┐ │ 4324182021466249494 │ 6 │ 185 │ 1 │ 2 │ └─────────────────────┴───────────┴──────────┴──────┴─────────┘ diff --git a/docs/ru/operations/tips.md b/docs/ru/operations/tips.md index 1022ed5a9ba..7803f5f9141 100644 --- a/docs/ru/operations/tips.md +++ b/docs/ru/operations/tips.md @@ -5,7 +5,7 @@ Всегда используйте `performance` scaling governor. `ondemand` scaling governor работает намного хуже при постоянно высоком спросе. ```bash -echo 'performance' | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor +$ echo 'performance' | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor ``` ## Ограничение CPU @@ -21,8 +21,8 @@ echo 'performance' | sudo tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_gover Не выключайте overcommit. Значение `cat /proc/sys/vm/overcommit_memory` должно быть 0 or 1. Выполните: -``` -echo 0 | sudo tee /proc/sys/vm/overcommit_memory +```bash +$ echo 0 | sudo tee /proc/sys/vm/overcommit_memory ``` ## Huge pages @@ -30,7 +30,7 @@ echo 0 | sudo tee /proc/sys/vm/overcommit_memory Механизм прозрачных huge pages нужно отключить. Он мешает работе аллокаторов памяти, что приводит к значительной деградации производительности. ```bash -echo 'never' | sudo tee /sys/kernel/mm/transparent_hugepage/enabled +$ echo 'never' | sudo tee /sys/kernel/mm/transparent_hugepage/enabled ``` С помощью `perf top` можно наблюдать за временем, проведенном в ядре операционной системы для управления памятью. 
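A hedged way to check that the change above took effect (the output shown is typical, not from the patch; the bracketed value is the active mode):

```bash
$ cat /sys/kernel/mm/transparent_hugepage/enabled   # the value in brackets is the active mode
always madvise [never]
```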
@@ -55,7 +55,7 @@ echo 'never' | sudo tee /sys/kernel/mm/transparent_hugepage/enabled При использовании RAID-5, RAID-6 или RAID-50, нужно всегда увеличивать stripe_cache_size, так как значение по умолчанию выбрано не самым удачным образом. ```bash -echo 4096 | sudo tee /sys/block/md2/md/stripe_cache_size +$ echo 4096 | sudo tee /sys/block/md2/md/stripe_cache_size ``` Точное число стоит вычислять из числа устройств и размер блока по формуле: `2 * num_devices * chunk_size_in_bytes / 4096`. @@ -164,7 +164,7 @@ dynamicConfigFile=/etc/zookeeper-{{ cluster['name'] }}/conf/zoo.cfg.dynamic Версия Java: -``` +```text Java(TM) SE Runtime Environment (build 1.8.0_25-b17) Java HotSpot(TM) 64-Bit Server VM (build 25.25-b02, mixed mode) ``` @@ -212,7 +212,7 @@ JAVA_OPTS="-Xms{{ cluster.get('xms','128M') }} \ Salt init: -``` +```text description "zookeeper-{{ cluster['name'] }} centralized coordination service" start on runlevel [2345] diff --git a/docs/ru/operations/troubleshooting.md b/docs/ru/operations/troubleshooting.md index a48eea394ca..844bd24fc37 100644 --- a/docs/ru/operations/troubleshooting.md +++ b/docs/ru/operations/troubleshooting.md @@ -25,14 +25,14 @@ Команда: -``` -sudo service clickhouse-server status +```bash +$ sudo service clickhouse-server status ``` Если сервер не запущен, запустите его с помощью команды: -``` -sudo service clickhouse-server start +```bash +$ sudo service clickhouse-server start ``` **Проверьте журналы** @@ -46,19 +46,19 @@ sudo service clickhouse-server start Если `clickhouse-server` не запустился из-за ошибки конфигурации вы увидите `` строку с описанием ошибки. Например: -``` +```text 2019.01.11 15:23:25.549505 [ 45 ] {} ExternalDictionaries: Failed reloading 'event2id' external dictionary: Poco::Exception. Code: 1000, e.code() = 111, e.displayText() = Connection refused, e.what() = Connection refused ``` Если вы не видите ошибки в конце файла, просмотрите весь файл начиная со строки: -``` +```text Application: starting up. ``` При попытке запустить второй экземпляр `clickhouse-server` журнал выглядит следующим образом: -``` +```text 2019.01.11 15:25:11.151730 [ 1 ] {} : Starting ClickHouse 19.1.0 with revision 54413 2019.01.11 15:25:11.154578 [ 1 ] {} Application: starting up 2019.01.11 15:25:11.156361 [ 1 ] {} StatusFile: Status file ./status already exists - unclean restart. Contents: @@ -76,14 +76,14 @@ Revision: 54413 Если из логов `clickhouse-server` вы не получили необходимой информации или логов нет, то вы можете посмотреть логи `system.d` командой: -``` -sudo journalctl -u clickhouse-server +```bash +$ sudo journalctl -u clickhouse-server ``` **Запустите clickhouse-server в интерактивном режиме** -``` -sudo -u clickhouse /usr/bin/clickhouse-server --config-file /etc/clickhouse-server/config.xml +```bash +$ sudo -u clickhouse /usr/bin/clickhouse-server --config-file /etc/clickhouse-server/config.xml ``` Эта команда запускает сервер как интерактивное приложение со стандартными параметрами скрипта автозапуска. В этом режиме `clickhouse-server` выводит сообщения в консоль. 
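Once the server is up in either mode, a quick hedged smoke test is to query the HTTP interface, which answers `Ok.` on the root path (default port assumed):

```bash
$ curl 'http://localhost:8123/'
Ok.
```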
diff --git a/docs/ru/operations/update.md b/docs/ru/operations/update.md index 41346775637..eb870c2de9c 100644 --- a/docs/ru/operations/update.md +++ b/docs/ru/operations/update.md @@ -2,10 +2,10 @@ Если ClickHouse установлен с помощью deb-пакетов, выполните следующие команды на сервере: -``` -sudo apt-get update -sudo apt-get install clickhouse-client clickhouse-server -sudo service clickhouse-server restart +```bash +$ sudo apt-get update +$ sudo apt-get install clickhouse-client clickhouse-server +$ sudo service clickhouse-server restart ``` Если ClickHouse установлен не из рекомендуемых deb-пакетов, используйте соответствующий метод обновления. diff --git a/docs/ru/operations/utils/clickhouse-copier.md b/docs/ru/operations/utils/clickhouse-copier.md index b38e25f6c16..9eb5a151a4a 100644 --- a/docs/ru/operations/utils/clickhouse-copier.md +++ b/docs/ru/operations/utils/clickhouse-copier.md @@ -23,7 +23,7 @@ Утилиту следует запускать вручную следующим образом: ```bash -clickhouse-copier copier --daemon --config zookeeper.xml --task-path /task/path --base-dir /path/to/dir +$ clickhouse-copier copier --daemon --config zookeeper.xml --task-path /task/path --base-dir /path/to/dir ``` Параметры запуска: diff --git a/docs/ru/operations/utils/clickhouse-local.md b/docs/ru/operations/utils/clickhouse-local.md index dd3e21d9ce1..a9c32e49606 100644 --- a/docs/ru/operations/utils/clickhouse-local.md +++ b/docs/ru/operations/utils/clickhouse-local.md @@ -14,8 +14,8 @@ Основной формат вызова: -``` bash -clickhouse-local --structure "table_structure" --input-format "format_of_incoming_data" -q "query" +```bash +$ clickhouse-local --structure "table_structure" --input-format "format_of_incoming_data" -q "query" ``` Ключи команды: @@ -35,8 +35,8 @@ clickhouse-local --structure "table_structure" --input-format "format_of_incomin ## Примеры вызова -``` bash -echo -e "1,2\n3,4" | clickhouse-local -S "a Int64, b Int64" -if "CSV" -q "SELECT * FROM table" +```bash +$ echo -e "1,2\n3,4" | clickhouse-local -S "a Int64, b Int64" -if "CSV" -q "SELECT * FROM table" Read 2 rows, 32.00 B in 0.000 sec., 5182 rows/sec., 80.97 KiB/sec. 1 2 3 4 @@ -44,7 +44,7 @@ Read 2 rows, 32.00 B in 0.000 sec., 5182 rows/sec., 80.97 KiB/sec. Вызов выше эквивалентен следующему: -``` bash +```bash $ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64) ENGINE = File(CSV, stdin); SELECT a, b FROM table; DROP TABLE table" Read 2 rows, 32.00 B in 0.000 sec., 4987 rows/sec., 77.93 KiB/sec. 1 2 @@ -53,7 +53,7 @@ Read 2 rows, 32.00 B in 0.000 sec., 4987 rows/sec., 77.93 KiB/sec. А теперь давайте выведем на экран объем оперативной памяти, занимаемой пользователями (Unix): -``` bash +```bash $ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' | clickhouse-local -S "user String, mem Float64" -q "SELECT user, round(sum(mem), 2) as memTotal FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty" Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec. 
┏━━━━━━━━━━┳━━━━━━━━━━┓ diff --git a/docs/ru/query_language/agg_functions/index.md b/docs/ru/query_language/agg_functions/index.md index 138bae48fd3..fad195991db 100644 --- a/docs/ru/query_language/agg_functions/index.md +++ b/docs/ru/query_language/agg_functions/index.md @@ -15,7 +15,7 @@ ClickHouse поддерживает также: Рассмотрим таблицу: -``` +```text ┌─x─┬────y─┐ │ 1 │ 2 │ │ 2 │ ᴺᵁᴸᴸ │ @@ -27,17 +27,14 @@ ClickHouse поддерживает также: Выполним суммирование значений в столбце `y`: +```sql +SELECT sum(y) FROM t_null_big ``` -:) SELECT sum(y) FROM t_null_big - -SELECT sum(y) -FROM t_null_big - +```text ┌─sum(y)─┐ │ 7 │ └────────┘ -1 rows in set. Elapsed: 0.002 sec. ``` Функция `sum` работает с `NULL` как с `0`. В частности, это означает, что если на вход в функцию подать выборку, где все значения `NULL`, то результат будет `0`, а не `NULL`. @@ -45,17 +42,13 @@ FROM t_null_big Теперь с помощью фукции `groupArray` сформируем массив из стобца `y`: +```sql +SELECT groupArray(y) FROM t_null_big ``` -:) SELECT groupArray(y) FROM t_null_big - -SELECT groupArray(y) -FROM t_null_big - +```text ┌─groupArray(y)─┐ │ [2,2,3] │ └───────────────┘ - -1 rows in set. Elapsed: 0.002 sec. ``` `groupArray` не включает `NULL` в результирующий массив. diff --git a/docs/ru/query_language/agg_functions/parametric_functions.md b/docs/ru/query_language/agg_functions/parametric_functions.md index 93e96be1dfa..5bdf838d115 100644 --- a/docs/ru/query_language/agg_functions/parametric_functions.md +++ b/docs/ru/query_language/agg_functions/parametric_functions.md @@ -21,7 +21,7 @@ Это вырожденный пример. Его можно записать с помощью других агрегатных функций: -``` +```sql minIf(EventTime, URL LIKE '%company%') < maxIf(EventTime, URL LIKE '%cart%'). ``` @@ -53,7 +53,7 @@ minIf(EventTime, URL LIKE '%company%') < maxIf(EventTime, URL LIKE '%cart%'). Отыскивает цепочки событий в скользящем окне по времени и вычисляет максимальное количество произошедших событий из цепочки. -``` +```sql windowFunnel(window)(timestamp, cond1, cond2, cond3, ...) ``` @@ -85,7 +85,7 @@ windowFunnel(window)(timestamp, cond1, cond2, cond3, ...) Чтобы узнать, как далеко пользователь `user_id` смог пройти по цепочке за час в январе 2017-го года, составим запрос: -``` +```sql SELECT level, count() AS c @@ -121,7 +121,7 @@ ORDER BY level Пример применения: -``` +```text Задача: показывать в отчёте только поисковые фразы, по которым было хотя бы 5 уникальных посетителей. Решение: пишем в запросе GROUP BY SearchPhrase HAVING uniqUpTo(4)(UserID) >= 5 ``` diff --git a/docs/ru/query_language/agg_functions/reference.md b/docs/ru/query_language/agg_functions/reference.md index b948b17abee..89922a30c6b 100644 --- a/docs/ru/query_language/agg_functions/reference.md +++ b/docs/ru/query_language/agg_functions/reference.md @@ -81,7 +81,7 @@ SELECT count(DISTINCT num) FROM t Выбирает часто встречающееся значение с помощью алгоритма "[heavy hitters](http://www.cs.umd.edu/~samir/498/karp.pdf)". Если существует значение, которое встречается чаще, чем в половине случаев, в каждом потоке выполнения запроса, то возвращается данное значение. В общем случае, результат недетерминирован. -``` +```sql anyHeavy(column) ``` @@ -98,7 +98,7 @@ SELECT anyHeavy(AirlineID) AS res FROM ontime ``` -``` +```text ┌───res─┐ │ 19690 │ └───────┘ @@ -113,7 +113,7 @@ FROM ontime Применяет побитовое `И` для последовательности чисел. 
-``` +```sql groupBitAnd(expr) ``` @@ -129,7 +129,7 @@ groupBitAnd(expr) Тестовые данные: -``` +```text binary decimal 00101100 = 44 00011100 = 28 @@ -139,7 +139,7 @@ binary decimal Запрос: -``` +```sql SELECT groupBitAnd(num) FROM t ``` @@ -147,7 +147,7 @@ SELECT groupBitAnd(num) FROM t Результат: -``` +```text binary decimal 00000100 = 4 ``` @@ -156,7 +156,7 @@ binary decimal Применяет побитовое `ИЛИ` для последовательности чисел. -``` +```sql groupBitOr(expr) ``` @@ -172,7 +172,7 @@ groupBitOr(expr) Тестовые данные: -``` +```text binary decimal 00101100 = 44 00011100 = 28 @@ -182,7 +182,7 @@ binary decimal Запрос: -``` +```sql SELECT groupBitOr(num) FROM t ``` @@ -190,7 +190,7 @@ SELECT groupBitOr(num) FROM t Результат: -``` +```text binary decimal 01111101 = 125 ``` @@ -199,7 +199,7 @@ binary decimal Применяет побитовое `ИСКЛЮЧАЮЩЕЕ ИЛИ` для последовательности чисел. -``` +```sql groupBitXor(expr) ``` @@ -215,7 +215,7 @@ groupBitXor(expr) Тестовые данные: -``` +```text binary decimal 00101100 = 44 00011100 = 28 @@ -225,7 +225,7 @@ binary decimal Запрос: -``` +```sql SELECT groupBitXor(num) FROM t ``` @@ -233,7 +233,7 @@ SELECT groupBitXor(num) FROM t Результат: -``` +```text binary decimal 01101000 = 104 ``` @@ -242,7 +242,7 @@ binary decimal Bitmap или агрегатные вычисления для столбца с типом данных `UInt*`, возвращают кардинальность в виде значения типа UInt64, если добавить суффикс -State, то возвращают [объект bitmap](../functions/bitmap_functions.md). -``` +```sql groupBitmap(expr) ``` @@ -258,7 +258,7 @@ groupBitmap(expr) Тестовые данные: -``` +```text UserID 1 1 @@ -268,13 +268,13 @@ UserID Запрос: -``` +```sql SELECT groupBitmap(UserID) as num FROM t ``` Результат: -``` +```text num 3 ``` @@ -293,15 +293,17 @@ num **Пример:** -``` +```text ┌─user─────┬─salary─┐ │ director │ 5000 │ │ manager │ 3000 │ │ worker │ 1000 │ └──────────┴────────┘ - +``` +```sql SELECT argMin(user, salary) FROM salary - +``` +```text ┌─argMin(user, salary)─┐ │ worker │ └──────────────────────┘ @@ -353,7 +355,7 @@ FROM sum_map GROUP BY timeslot ``` -``` +```text ┌────────────timeslot─┬─sumMap(statusMap.status, statusMap.requests)─┐ │ 2000-01-01 00:00:00 │ ([1,2,3,4,5],[10,10,20,10,10]) │ │ 2000-01-01 00:01:00 │ ([4,5,6,7,8],[10,10,20,10,10]) │ @@ -364,7 +366,7 @@ GROUP BY timeslot Вычисляет [коэффициент асимметрии](https://ru.wikipedia.org/wiki/Коэффициент_асимметрии) для последовательности. -``` +```sql skewPop(expr) ``` @@ -388,7 +390,7 @@ SELECT skewPop(value) FROM series_with_value_column Он представляет собой несмещенную оценку асимметрии случайной величины, если переданные значения образуют ее выборку. -``` +```sql skewSamp(expr) ``` @@ -410,7 +412,7 @@ SELECT skewSamp(value) FROM series_with_value_column Вычисляет [коэффициент эксцесса](https://ru.wikipedia.org/wiki/Коэффициент_эксцесса) последовательности. -``` +```sql kurtPop(expr) ``` @@ -434,7 +436,7 @@ SELECT kurtPop(value) FROM series_with_value_column Он представляет собой несмещенную оценку эксцесса случайной величины, если переданные значения образуют ее выборку. 
-``` +```sql kurtSamp(expr) ``` @@ -467,7 +469,7 @@ SELECT kurtSamp(value) FROM series_with_value_column Пример: -``` +```text ┌─uid─┬─timestamp─┬─value─┐ │ 1 │ 2 │ 0.2 │ │ 1 │ 7 │ 0.7 │ @@ -482,7 +484,7 @@ SELECT kurtSamp(value) FROM series_with_value_column └─────┴───────────┴───────┘ ``` -``` +```sql CREATE TABLE time_series( uid UInt64, timestamp Int64, @@ -500,7 +502,7 @@ FROM ( И результат будет: -``` +```text [(2,0.2),(3,0.9),(7,2.1),(8,2.4),(12,3.6),(17,5.1),(18,5.4),(24,7.2),(25,2.5)] ``` @@ -511,7 +513,7 @@ FROM ( Для пример из описания timeSeriesGroupRateSum результат будет следующим: -``` +```text [(2,0),(3,0.1),(7,0.3),(8,0.3),(12,0.3),(17,0.3),(18,0.3),(24,0.3),(25,0.1)] ``` @@ -525,7 +527,7 @@ FROM ( Приближённо вычисляет количество различных значений аргумента. -``` +```sql uniq(x[, ...]) ``` @@ -561,7 +563,7 @@ uniq(x[, ...]) Приближённо вычисляет количество различных значений аргумента. -``` +```sql uniqCombined(HLL_precision)(x[, ...]) ``` @@ -605,7 +607,7 @@ uniqCombined(HLL_precision)(x[, ...]) Вычисляет приблизительное число различных значений аргументов, используя алгоритм [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog). -``` +```sql uniqHLL12(x[, ...]) ``` @@ -641,7 +643,7 @@ uniqHLL12(x[, ...]) Вычисляет точное количество различных значений аргументов. -``` +```sql uniqExact(x[, ...]) ``` @@ -685,7 +687,7 @@ uniqExact(x[, ...]) Вычисляет скользящую сумму входных значений. -``` +```sql groupArrayMovingSum(numbers_for_summing) groupArrayMovingSum(window_size)(numbers_for_summing) ``` @@ -943,7 +945,7 @@ FROM t Реализует [Filtered Space-Saving](http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf) алгоритм для анализа TopK, на основе reduce-and-combine алгоритма из методики [Parallel Space Saving](https://arxiv.org/pdf/1401.0702.pdf). -``` +```sql topK(N)(column) ``` @@ -965,7 +967,7 @@ SELECT topK(3)(AirlineID) AS res FROM ontime ``` -``` +```text ┌─res─────────────────┐ │ [19393,19790,19805] │ └─────────────────────┘ @@ -989,7 +991,7 @@ FROM ontime Выполняет простую (одномерную) линейную регрессию. -``` +```sql simpleLinearRegression(x, y) ``` diff --git a/docs/ru/query_language/alter.md b/docs/ru/query_language/alter.md index 21f19323d2d..01d8bbc3179 100644 --- a/docs/ru/query_language/alter.md +++ b/docs/ru/query_language/alter.md @@ -6,7 +6,7 @@ Изменение структуры таблицы. -``` sql +```sql ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|CLEAR|COMMENT|MODIFY COLUMN ... ``` @@ -25,7 +25,7 @@ ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|CLEAR|COMMENT|MODIFY COLUMN #### ADD COLUMN {#alter_add-column} -``` sql +```sql ADD COLUMN [IF NOT EXISTS] name [type] [default_expr] [AFTER name_after] ``` @@ -39,13 +39,13 @@ ADD COLUMN [IF NOT EXISTS] name [type] [default_expr] [AFTER name_after] Пример: -``` sql +```sql ALTER TABLE visits ADD COLUMN browser String AFTER user_id ``` #### DROP COLUMN {#alter_drop-column} -``` sql +```sql DROP COLUMN [IF EXISTS] name ``` @@ -55,13 +55,13 @@ DROP COLUMN [IF EXISTS] name Пример: -``` sql +```sql ALTER TABLE visits DROP COLUMN browser ``` #### CLEAR COLUMN {#alter_clear-column} -``` sql +```sql CLEAR COLUMN [IF EXISTS] name IN PARTITION partition_name ``` Сбрасывает все значения в столбце для заданной партиции. Если указано `IF EXISTS`, запрос не будет возвращать ошибку, если столбца не существует. 
@@ -70,13 +70,13 @@ CLEAR COLUMN [IF EXISTS] name IN PARTITION partition_name Пример: -``` sql +```sql ALTER TABLE visits CLEAR COLUMN browser IN PARTITION tuple() ``` #### COMMENT COLUMN {#alter_comment-column} -``` sql +```sql COMMENT COLUMN [IF EXISTS] name 'Text comment' ``` @@ -88,13 +88,13 @@ COMMENT COLUMN [IF EXISTS] name 'Text comment' Пример: -``` sql +```sql ALTER TABLE visits COMMENT COLUMN browser 'Столбец показывает, из каких браузеров пользователи заходили на сайт.' ``` #### MODIFY COLUMN {#alter_modify-column} -``` sql +```sql MODIFY COLUMN [IF EXISTS] name [type] [default_expr] ``` @@ -104,7 +104,7 @@ MODIFY COLUMN [IF EXISTS] name [type] [default_expr] Пример запроса: -``` sql +```sql ALTER TABLE visits MODIFY COLUMN browser Array(String) ``` @@ -138,7 +138,7 @@ ALTER TABLE visits MODIFY COLUMN browser Array(String) Поддерживается операция: -``` sql +```sql MODIFY ORDER BY new_expression ``` @@ -153,7 +153,7 @@ MODIFY ORDER BY new_expression ### Манипуляции с индексами Добавить или удалить индекс можно с помощью операций -``` +```sql ALTER TABLE [db].name ADD INDEX name expression TYPE type GRANULARITY value [AFTER name] ALTER TABLE [db].name DROP INDEX name ``` @@ -170,7 +170,7 @@ ALTER TABLE [db].name DROP INDEX name Про ограничения подробнее написано [тут](create.md#constraints). Добавить или удалить ограничение можно с помощью запросов -``` +```sql ALTER TABLE [db].name ADD CONSTRAINT constraint_name CHECK expression; ALTER TABLE [db].name DROP CONSTRAINT constraint_name; ``` @@ -286,7 +286,7 @@ ALTER TABLE visits CLEAR COLUMN hour in PARTITION 201902 #### CLEAR INDEX IN PARTITION {#alter_clear-index-partition} -``` sql +```sql ALTER TABLE table_name CLEAR INDEX index_name IN PARTITION partition_expr ``` @@ -418,19 +418,19 @@ OPTIMIZE TABLE table_not_partitioned PARTITION tuple() FINAL; На данный момент доступны команды: -``` sql +```sql ALTER TABLE [db.]table DELETE WHERE filter_expr ``` Выражение `filter_expr` должно иметь тип UInt8. Запрос удаляет строки таблицы, для которых это выражение принимает ненулевое значение. -``` sql +```sql ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] WHERE filter_expr ``` Команда доступна начиная с версии 18.12.14. Выражение `filter_expr` должно иметь тип UInt8. Запрос изменяет значение указанных столбцов на вычисленное значение соответствующих выражений в каждой строке, для которой `filter_expr` принимает ненулевое значение. Вычисленные значения преобразуются к типу столбца с помощью оператора `CAST`. Изменение столбцов, которые используются при вычислении первичного ключа или ключа партиционирования, не поддерживается. -``` sql +```sql ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name ``` diff --git a/docs/ru/query_language/create.md b/docs/ru/query_language/create.md index d32afb7b9d9..95d794b691e 100644 --- a/docs/ru/query_language/create.md +++ b/docs/ru/query_language/create.md @@ -51,7 +51,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name AS [db2.]name2 [ENGINE = engine] Создаёт таблицу с такой же структурой, как другая таблица. Можно указать другой движок для таблицы. Если движок не указан, то будет выбран такой же движок, как у таблицы `db2.name2`. 
-``` sql +```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name AS table_fucntion() ``` diff --git a/docs/ru/query_language/dicts/external_dicts_dict_layout.md b/docs/ru/query_language/dicts/external_dicts_dict_layout.md index 826d9b78ae9..22fd6ec0d91 100644 --- a/docs/ru/query_language/dicts/external_dicts_dict_layout.md +++ b/docs/ru/query_language/dicts/external_dicts_dict_layout.md @@ -99,7 +99,7 @@ Пример: таблица содержит скидки для каждого рекламодателя в виде: -``` +```text +---------------+---------------------+-------------------+--------+ | advertiser id | discount start date | discount end date | amount | +===============+=====================+===================+========+ @@ -226,7 +226,7 @@ Пример: таблица содержит префиксы сети и соответствующие им номера AS и коды стран: -``` +```text +-----------------+-------+--------+ | prefix | asn | cca2 | +=================+=======+========+ @@ -269,13 +269,13 @@ Для запросов необходимо использовать те же функции (`dictGetT` с кортежем), что и для словарей с составными ключами: -``` +```sql dictGetT('dict_name', 'attr_name', tuple(ip)) ``` Функция принимает либо `UInt32` для IPv4, либо `FixedString(16)` для IPv6: -``` +```sql dictGetString('prefix', 'asn', tuple(IPv6StringToNum('2001:db8::1'))) ``` diff --git a/docs/ru/query_language/dicts/external_dicts_dict_sources.md b/docs/ru/query_language/dicts/external_dicts_dict_sources.md index 1c0d214053c..9a25ffdb4aa 100644 --- a/docs/ru/query_language/dicts/external_dicts_dict_sources.md +++ b/docs/ru/query_language/dicts/external_dicts_dict_sources.md @@ -132,7 +132,7 @@ ClickHouse получает от ODBC-драйвера информацию о Сконфигурируем unixODBC для работы с PostgreSQL. Содержимое `/etc/odbc.ini`: -``` +```text [gregtest] Driver = /usr/lib/psqlodbca.so Servername = localhost @@ -145,7 +145,7 @@ PASSWORD = test Если выполнить запрос вида: -``` +```sql SELECT * FROM odbc('DSN=gregtest;Servername=some-server.com', 'test_db'); ``` @@ -156,12 +156,12 @@ SELECT * FROM odbc('DSN=gregtest;Servername=some-server.com', 'test_db'); ОС Ubuntu. Установка unixODBC и ODBC-драйвера для PostgreSQL: : - - sudo apt-get install -y unixodbc odbcinst odbc-postgresql - +```bash +$ sudo apt-get install -y unixodbc odbcinst odbc-postgresql +``` Настройка `/etc/odbc.ini` (или `~/.odbc.ini`): -``` +```text [DEFAULT] Driver = myconnection @@ -223,13 +223,13 @@ SELECT * FROM odbc('DSN=gregtest;Servername=some-server.com', 'test_db'); Установка драйвера: : -``` - sudo apt-get install tdsodbc freetds-bin sqsh +```bash +$ sudo apt-get install tdsodbc freetds-bin sqsh ``` Настройка драйвера: : -``` +```bash $ cat /etc/freetds/freetds.conf ... 
diff --git a/docs/ru/query_language/functions/arithmetic_functions.md b/docs/ru/query_language/functions/arithmetic_functions.md index e17d193c203..3c541b50a99 100644 --- a/docs/ru/query_language/functions/arithmetic_functions.md +++ b/docs/ru/query_language/functions/arithmetic_functions.md @@ -4,11 +4,11 @@ Пример: -``` sql +```sql SELECT toTypeName(0), toTypeName(0 + 0), toTypeName(0 + 0 + 0), toTypeName(0 + 0 + 0 + 0) ``` -``` +```text ┌─toTypeName(0)─┬─toTypeName(plus(0, 0))─┬─toTypeName(plus(plus(0, 0), 0))─┬─toTypeName(plus(plus(plus(0, 0), 0), 0))─┐ │ UInt8 │ UInt16 │ UInt32 │ UInt64 │ └───────────────┴────────────────────────┴─────────────────────────────────┴──────────────────────────────────────────┘ diff --git a/docs/ru/query_language/functions/array_functions.md b/docs/ru/query_language/functions/array_functions.md index 11d5c819b02..a3d3fff9ff2 100644 --- a/docs/ru/query_language/functions/array_functions.md +++ b/docs/ru/query_language/functions/array_functions.md @@ -49,7 +49,7 @@ Объединяет массивы, переданные в качестве аргументов. -``` +```sql arrayConcat(arrays) ``` @@ -62,7 +62,7 @@ arrayConcat(arrays) SELECT arrayConcat([1, 2], [3, 4], [5, 6]) AS res ``` -``` +```text ┌─res───────────┐ │ [1,2,3,4,5,6] │ └───────────────┘ @@ -83,9 +83,10 @@ SELECT arrayConcat([1, 2], [3, 4], [5, 6]) AS res `NULL` обрабатывается как значение. -``` +```sql SELECT has([1, 2, NULL], NULL) - +``` +```text ┌─has([1, 2, NULL], NULL)─┐ │ 1 │ └─────────────────────────┘ @@ -95,7 +96,7 @@ SELECT has([1, 2, NULL], NULL) Проверяет, является ли один массив подмножеством другого. -``` +```sql hasAll(set, subset) ``` @@ -133,7 +134,7 @@ hasAll(set, subset) Проверяет, имеют ли два массива хотя бы один общий элемент. -``` +```sql hasAny(array1, array2) ``` @@ -170,11 +171,10 @@ hasAny(array1, array2) Пример: -``` -:) SELECT indexOf([1,3,NULL,NULL],NULL) - +```sql SELECT indexOf([1, 3, NULL, NULL], NULL) - +``` +```text ┌─indexOf([1, 3, NULL, NULL], NULL)─┐ │ 3 │ └───────────────────────────────────┘ @@ -190,9 +190,10 @@ SELECT indexOf([1, 3, NULL, NULL], NULL) Пример: -``` +```sql SELECT countEqual([1, 2, NULL, NULL], NULL) - +``` +```text ┌─countEqual([1, 2, NULL, NULL], NULL)─┐ │ 2 │ └──────────────────────────────────────┘ @@ -216,7 +217,7 @@ WHERE CounterID = 160656 LIMIT 10 ``` -``` +```text ┌─Reaches─┬──Hits─┐ │ 95606 │ 31406 │ └─────────┴───────┘ @@ -232,7 +233,7 @@ FROM test.hits WHERE (CounterID = 160656) AND notEmpty(GoalsReached) ``` -``` +```text ┌─Reaches─┬──Hits─┐ │ 95606 │ 31406 │ └─────────┴───────┘ @@ -263,7 +264,7 @@ ORDER BY Reaches DESC LIMIT 10 ``` -``` +```text ┌──GoalID─┬─Reaches─┬─Visits─┐ │ 53225 │ 3214 │ 1097 │ │ 2825062 │ 3188 │ 1097 │ @@ -286,7 +287,7 @@ LIMIT 10 SELECT arrayEnumerateUniq([1, 1, 1, 2, 2, 2], [1, 1, 2, 1, 1, 2]) AS res ``` -``` +```text ┌─res───────────┐ │ [1,2,1,1,2,1] │ └───────────────┘ @@ -298,7 +299,7 @@ SELECT arrayEnumerateUniq([1, 1, 1, 2, 2, 2], [1, 1, 2, 1, 1, 2]) AS res Удаляет последний элемент из массива. -``` +```sql arrayPopBack(array) ``` @@ -311,7 +312,7 @@ arrayPopBack(array) ```sql SELECT arrayPopBack([1, 2, 3]) AS res ``` - +text ``` ┌─res───┐ │ [1,2] │ @@ -322,7 +323,7 @@ SELECT arrayPopBack([1, 2, 3]) AS res Удаляет первый элемент из массива. -``` +```sql arrayPopFront(array) ``` @@ -336,7 +337,7 @@ arrayPopFront(array) SELECT arrayPopFront([1, 2, 3]) AS res ``` -``` +```text ┌─res───┐ │ [2,3] │ └───────┘ @@ -346,7 +347,7 @@ SELECT arrayPopFront([1, 2, 3]) AS res Добавляет один элемент в конец массива. 
-``` +```sql arrayPushBack(array, single_value) ``` @@ -361,7 +362,7 @@ arrayPushBack(array, single_value) SELECT arrayPushBack(['a'], 'b') AS res ``` -``` +```text ┌─res───────┐ │ ['a','b'] │ └───────────┘ @@ -371,7 +372,7 @@ SELECT arrayPushBack(['a'], 'b') AS res Добавляет один элемент в начало массива. -``` +```sql arrayPushFront(array, single_value) ``` @@ -386,7 +387,7 @@ arrayPushFront(array, single_value) SELECT arrayPushBack(['b'], 'a') AS res ``` -``` +```text ┌─res───────┐ │ ['a','b'] │ └───────────┘ @@ -396,7 +397,7 @@ SELECT arrayPushBack(['b'], 'a') AS res Изменяет длину массива. -``` +```sql arrayResize(array, size[, extender]) ``` @@ -414,17 +415,19 @@ arrayResize(array, size[, extender]) **Примеры вызовов** -``` +```sql SELECT arrayResize([1], 3) - +``` +```text ┌─arrayResize([1], 3)─┐ │ [1,0,0] │ └─────────────────────┘ ``` -``` +```sql SELECT arrayResize([1], 3, NULL) - +``` +```text ┌─arrayResize([1], 3, NULL)─┐ │ [1,NULL,NULL] │ └───────────────────────────┘ @@ -434,7 +437,7 @@ SELECT arrayResize([1], 3, NULL) Возвращает срез массива. -``` +```sql arraySlice(array, offset[, length]) ``` @@ -450,7 +453,7 @@ arraySlice(array, offset[, length]) SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res ``` -``` +```text ┌─res────────┐ │ [2,NULL,4] │ └────────────┘ @@ -464,10 +467,10 @@ SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res Пример сортировки целочисленных значений: -``` sql +```sql SELECT arraySort([1, 3, 3, 0]) ``` -``` +```text ┌─arraySort([1, 3, 3, 0])─┐ │ [0,1,3,3] │ └─────────────────────────┘ @@ -475,10 +478,10 @@ SELECT arraySort([1, 3, 3, 0]) Пример сортировки строковых значений: -``` sql +```sql SELECT arraySort(['hello', 'world', '!']) ``` -``` +```text ┌─arraySort(['hello', 'world', '!'])─┐ │ ['!','hello','world'] │ └────────────────────────────────────┘ @@ -486,10 +489,10 @@ SELECT arraySort(['hello', 'world', '!']) Значения `NULL`, `NaN` и `Inf` сортируются по следующему принципу: -``` sql +```sql SELECT arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]); ``` -``` +```text ┌─arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf])─┐ │ [-inf,-4,1,2,3,inf,nan,nan,NULL,NULL] │ └───────────────────────────────────────────────────────────┘ @@ -504,10 +507,10 @@ SELECT arraySort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]); Рассмотрим пример: -``` sql +```sql SELECT arraySort((x) -> -x, [1, 2, 3]) as res; ``` -``` +```text ┌─res─────┐ │ [3,2,1] │ └─────────┘ @@ -517,11 +520,11 @@ SELECT arraySort((x) -> -x, [1, 2, 3]) as res; Лямбда-функция может принимать несколько аргументов. В этом случае, в функцию `arraySort` нужно передавать несколько массивов, которые будут соответствовать аргументам лямбда-функции (массивы должны быть одинаковой длины). Следует иметь в виду, что результат будет содержать элементы только из первого массива; элементы из всех последующих массивов будут задавать ключи сортировки. Например: -``` sql +```sql SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; ``` -``` +```text ┌─res────────────────┐ │ ['world', 'hello'] │ └────────────────────┘ @@ -531,18 +534,18 @@ SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; Ниже приведены другие примеры. 
-``` sql +```sql SELECT arraySort((x, y) -> y, [0, 1, 2], ['c', 'b', 'a']) as res; ``` -``` sql +```text ┌─res─────┐ │ [2,1,0] │ └─────────┘ ``` -``` sql +```sql SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res; ``` -``` sql +```text ┌─res─────┐ │ [2,1,0] │ └─────────┘ @@ -557,10 +560,10 @@ SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res; Пример сортировки целочисленных значений: -``` sql +```sql SELECT arrayReverseSort([1, 3, 3, 0]); ``` -``` +```text ┌─arrayReverseSort([1, 3, 3, 0])─┐ │ [3,3,1,0] │ └────────────────────────────────┘ @@ -568,21 +571,21 @@ SELECT arrayReverseSort([1, 3, 3, 0]); Пример сортировки строковых значений: -``` sql +```sql SELECT arrayReverseSort(['hello', 'world', '!']); ``` -``` +```text ┌─arrayReverseSort(['hello', 'world', '!'])─┐ │ ['world','hello','!'] │ └───────────────────────────────────────────┘ -``` +``` Значения `NULL`, `NaN` и `Inf` сортируются в следующем порядке: -``` sql +```sql SELECT arrayReverseSort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]) as res; ``` -``` sql +```text ┌─res───────────────────────────────────┐ │ [inf,3,2,1,-4,-inf,nan,nan,NULL,NULL] │ └───────────────────────────────────────┘ @@ -595,10 +598,10 @@ SELECT arrayReverseSort([1, nan, 2, NULL, 3, nan, -4, NULL, inf, -inf]) as res; Функция `arrayReverseSort` является [функцией высшего порядка](higher_order_functions.md). Вы можете передать ей в качестве первого аргумента лямбда-функцию. Например: -``` sql +```sql SELECT arrayReverseSort((x) -> -x, [1, 2, 3]) as res; ``` -``` +```text ┌─res─────┐ │ [1,2,3] │ └─────────┘ @@ -611,10 +614,10 @@ SELECT arrayReverseSort((x) -> -x, [1, 2, 3]) as res; Лямбда-функция может принимать на вход несколько аргументов. В этом случае, в функцию `arrayReverseSort` нужно передавать несколько массивов, которые будут соответствовать аргументам лямбда-функции (массивы должны быть одинаковой длины). Следует иметь в виду, что результат будет содержать элементы только из первого массива; элементы из всех последующих массивов будут определять ключи сортировки. Например: -``` sql +```sql SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; ``` -``` sql +```text ┌─res───────────────┐ │ ['hello','world'] │ └───────────────────┘ @@ -627,18 +630,18 @@ SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; Ниже приведены ещё примеры. -``` sql +```sql SELECT arrayReverseSort((x, y) -> y, [0, 1, 2], ['c', 'b', 'a']) as res; ``` -``` sql +```text ┌─res─────┐ │ [0,1,2] │ └─────────┘ ``` -``` sql +```sql SELECT arrayReverseSort((x, y) -> -y, [4, 3, 5], [1, 2, 3]) AS res; ``` -``` sql +```text ┌─res─────┐ │ [4,3,5] │ └─────────┘ diff --git a/docs/ru/query_language/functions/array_join.md b/docs/ru/query_language/functions/array_join.md index 7a89ab73fb4..1317d4329f2 100644 --- a/docs/ru/query_language/functions/array_join.md +++ b/docs/ru/query_language/functions/array_join.md @@ -15,11 +15,11 @@ Пример: -``` sql +```sql SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src ``` -``` +```text ┌─dst─┬─\'Hello\'─┬─src─────┐ │ 1 │ Hello │ [1,2,3] │ │ 2 │ Hello │ [1,2,3] │ diff --git a/docs/ru/query_language/functions/bitmap_functions.md b/docs/ru/query_language/functions/bitmap_functions.md index 00608b72770..f54d88ab35f 100644 --- a/docs/ru/query_language/functions/bitmap_functions.md +++ b/docs/ru/query_language/functions/bitmap_functions.md @@ -4,7 +4,7 @@ Создаёт битовый массив из массива целочисленных значений. 
-``` +```sql bitmapBuild(array) ``` @@ -28,7 +28,7 @@ SELECT bitmapBuild([1, 2, 3, 4, 5]) AS res, toTypeName(res) Преобразует битовый массив в массив целочисленных значений. -``` +```sql bitmapToArray(bitmap) ``` @@ -42,7 +42,7 @@ bitmapToArray(bitmap) SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res ``` -``` +```text ┌─res─────────┐ │ [1,2,3,4,5] │ └─────────────┘ @@ -52,7 +52,7 @@ SELECT bitmapToArray(bitmapBuild([1, 2, 3, 4, 5])) AS res Проверяет вхождение элемента в битовый массив. -``` +```sql bitmapContains(haystack, needle) ``` @@ -70,7 +70,7 @@ bitmapContains(haystack, needle) **Пример** -``` sql +```sql SELECT bitmapContains(bitmapBuild([1,5,7,9]), toUInt32(9)) AS res ``` ```text @@ -83,7 +83,7 @@ SELECT bitmapContains(bitmapBuild([1,5,7,9]), toUInt32(9)) AS res Проверяет, имеют ли два битовых массива хотя бы один общий элемент. -``` +```sql bitmapHasAny(bitmap1, bitmap2) ``` @@ -104,7 +104,7 @@ bitmapHasAny(bitmap1, bitmap2) SELECT bitmapHasAny(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res ``` -``` +```text ┌─res─┐ │ 1 │ └─────┘ @@ -115,7 +115,7 @@ SELECT bitmapHasAny(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res Аналогично функции `hasAll(array, array)` возвращает 1 если первый битовый массив содержит все элементы второго, 0 в противном случае. Если второй аргумент является пустым битовым массивом, то возвращает 1. -``` +```sql bitmapHasAll(bitmap,bitmap) ``` @@ -129,7 +129,7 @@ bitmapHasAll(bitmap,bitmap) SELECT bitmapHasAll(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res ``` -``` +```text ┌─res─┐ │ 0 │ └─────┘ @@ -139,7 +139,7 @@ SELECT bitmapHasAll(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res Логическое И для двух битовых массивов. Результат — новый битовый массив. -``` +```sql bitmapAnd(bitmap,bitmap) ``` @@ -153,7 +153,7 @@ bitmapAnd(bitmap,bitmap) SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res ``` -``` +```text ┌─res─┐ │ [3] │ └─────┘ @@ -163,7 +163,7 @@ SELECT bitmapToArray(bitmapAnd(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS re Логическое ИЛИ для двух битовых массивов. Результат — новый битовый массив. -``` +```sql bitmapOr(bitmap,bitmap) ``` @@ -177,7 +177,7 @@ bitmapOr(bitmap,bitmap) SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res ``` -``` +```text ┌─res─────────┐ │ [1,2,3,4,5] │ └─────────────┘ @@ -187,7 +187,7 @@ SELECT bitmapToArray(bitmapOr(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res Логическое исключающее ИЛИ для двух битовых массивов. Результат — новый битовый массив. -``` +```sql bitmapXor(bitmap,bitmap) ``` @@ -201,7 +201,7 @@ bitmapXor(bitmap,bitmap) SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res ``` -``` +```text ┌─res───────┐ │ [1,2,4,5] │ └───────────┘ @@ -211,7 +211,7 @@ SELECT bitmapToArray(bitmapXor(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS re Логическое отрицание И для двух битовых массивов. Результат — новый битовый массив. -``` +```sql bitmapAndnot(bitmap,bitmap) ``` @@ -225,7 +225,7 @@ bitmapAndnot(bitmap,bitmap) SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS res ``` -``` +```text ┌─res───┐ │ [1,2] │ └───────┘ @@ -235,7 +235,7 @@ SELECT bitmapToArray(bitmapAndnot(bitmapBuild([1,2,3]),bitmapBuild([3,4,5]))) AS Возвращает кардинальность битового массива в виде значения типа `UInt64`. 
-``` +```sql bitmapCardinality(bitmap) ``` @@ -249,7 +249,7 @@ bitmapCardinality(bitmap) SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res ``` -``` +```text ┌─res─┐ │ 5 │ └─────┘ @@ -259,7 +259,7 @@ SELECT bitmapCardinality(bitmapBuild([1, 2, 3, 4, 5])) AS res Выполняет логическое И и возвращает кардинальность (`UInt64`) результирующего битового массива. -``` +```sql bitmapAndCardinality(bitmap,bitmap) ``` @@ -273,7 +273,7 @@ bitmapAndCardinality(bitmap,bitmap) SELECT bitmapAndCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` -``` +```text ┌─res─┐ │ 1 │ └─────┘ @@ -283,7 +283,7 @@ SELECT bitmapAndCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; Выполняет логическое ИЛИ и возвращает кардинальность (`UInt64`) результирующего битового массива. -``` +```sql bitmapOrCardinality(bitmap,bitmap) ``` @@ -297,7 +297,7 @@ bitmapOrCardinality(bitmap,bitmap) SELECT bitmapOrCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` -``` +```text ┌─res─┐ │ 5 │ └─────┘ @@ -307,7 +307,7 @@ SELECT bitmapOrCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; Выполняет логическое исключающее ИЛИ и возвращает кардинальность (`UInt64`) результирующего битового массива. -``` +```sql bitmapXorCardinality(bitmap,bitmap) ``` @@ -321,7 +321,7 @@ bitmapXorCardinality(bitmap,bitmap) SELECT bitmapXorCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` -``` +```text ┌─res─┐ │ 4 │ └─────┘ @@ -331,7 +331,7 @@ SELECT bitmapXorCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; Выполняет логическое отрицание И и возвращает кардинальность (`UInt64`) результирующего битового массива. -``` +```sql bitmapAndnotCardinality(bitmap,bitmap) ``` @@ -345,7 +345,7 @@ bitmapAndnotCardinality(bitmap,bitmap) SELECT bitmapAndnotCardinality(bitmapBuild([1,2,3]),bitmapBuild([3,4,5])) AS res; ``` -``` +```text ┌─res─┐ │ 2 │ └─────┘ diff --git a/docs/ru/query_language/functions/conditional_functions.md b/docs/ru/query_language/functions/conditional_functions.md index 0140a008123..a648314deb4 100644 --- a/docs/ru/query_language/functions/conditional_functions.md +++ b/docs/ru/query_language/functions/conditional_functions.md @@ -31,7 +31,7 @@ multiIf(cond_1, then_1, cond_2, then_2...else) Рассмотрим таблицу -``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -40,7 +40,7 @@ multiIf(cond_1, then_1, cond_2, then_2...else) Выполним запрос `SELECT multiIf(isNull(y), x, y < 3, y, NULL) FROM t_null`. Результат: -``` +```text ┌─multiIf(isNull(y), x, less(y, 3), y, NULL)─┐ │ 1 │ │ ᴺᵁᴸᴸ │ diff --git a/docs/ru/query_language/functions/date_time_functions.md b/docs/ru/query_language/functions/date_time_functions.md index 2503beb5810..432bf56652b 100644 --- a/docs/ru/query_language/functions/date_time_functions.md +++ b/docs/ru/query_language/functions/date_time_functions.md @@ -4,7 +4,7 @@ Все функции по работе с датой и временем, для которых это имеет смысл, могут принимать второй, необязательный аргумент - имя часового пояса. Пример: Asia/Yekaterinburg. В этом случае, они используют не локальный часовой пояс (по умолчанию), а указанный. 
-``` sql +```sql SELECT toDateTime('2016-06-15 23:00:00') AS time, toDate(time) AS date_local, @@ -12,7 +12,7 @@ SELECT toString(time, 'US/Samoa') AS time_samoa ``` -``` +```text ┌────────────────time─┬─date_local─┬─date_yekat─┬─time_samoa──────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-16 │ 2016-06-15 09:00:00 │ └─────────────────────┴────────────┴────────────┴─────────────────────┘ diff --git a/docs/ru/query_language/functions/ext_dict_functions.md b/docs/ru/query_language/functions/ext_dict_functions.md index e7f23a64664..c07f8adf84c 100644 --- a/docs/ru/query_language/functions/ext_dict_functions.md +++ b/docs/ru/query_language/functions/ext_dict_functions.md @@ -6,7 +6,7 @@ Извлекает значение из внешнего словаря. -``` +```sql dictGet('dict_name', 'attr_name', id_expr) dictGetOrDefault('dict_name', 'attr_name', id_expr, default_value_expr) ``` @@ -95,7 +95,7 @@ LIMIT 3 Проверяет, присутствует ли запись с указанным ключом в словаре. -``` +```sql dictHas('dict_name', id) ``` @@ -115,7 +115,7 @@ dictHas('dict_name', id) Для иерархического словаря возвращает массив ключей словаря, начиная с переданного `id_expr` и продолжая цепочкой родительских элементов. -``` +```sql dictGetHierarchy('dict_name', id) ``` @@ -167,7 +167,7 @@ ClickHouse поддерживает специализированные фун Синтаксис: -``` +```sql dictGet[Type]('dict_name', 'attr_name', id_expr) dictGet[Type]OrDefault('dict_name', 'attr_name', id_expr, default_value_expr) ``` diff --git a/docs/ru/query_language/functions/functions_for_nulls.md b/docs/ru/query_language/functions/functions_for_nulls.md index 9899fa8c786..65457fa81b7 100644 --- a/docs/ru/query_language/functions/functions_for_nulls.md +++ b/docs/ru/query_language/functions/functions_for_nulls.md @@ -4,7 +4,7 @@ Проверяет является ли аргумент [NULL](../syntax.md#null). -``` +```sql isNull(x) ``` @@ -21,7 +21,7 @@ isNull(x) Входная таблица -``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -30,25 +30,20 @@ isNull(x) Запрос +```sql +SELECT x FROM t_null WHERE isNull(y) ``` -:) SELECT x FROM t_null WHERE isNull(y) - -SELECT x -FROM t_null -WHERE isNull(y) - +```text ┌─x─┐ │ 1 │ └───┘ - -1 rows in set. Elapsed: 0.010 sec. ``` ## isNotNull Проверяет не является ли аргумент [NULL](../syntax.md#null). -``` +```sql isNotNull(x) ``` @@ -65,7 +60,7 @@ isNotNull(x) Входная таблица -``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -74,25 +69,20 @@ isNotNull(x) Запрос +```sql +SELECT x FROM t_null WHERE isNotNull(y) ``` -:) SELECT x FROM t_null WHERE isNotNull(y) - -SELECT x -FROM t_null -WHERE isNotNull(y) - +```text ┌─x─┐ │ 2 │ └───┘ - -1 rows in set. Elapsed: 0.010 sec. ``` ## coalesce Последовательно слева-направо проверяет являются ли переданные аргументы `NULL` и возвращает первый не `NULL`. -``` +```sql coalesce(x,...) ``` **Параметры** @@ -108,7 +98,7 @@ coalesce(x,...) Рассмотрим адресную книгу, в которой может быть указано несколько способов связи с клиентом. -``` +```text ┌─name─────┬─mail─┬─phone─────┬──icq─┐ │ client 1 │ ᴺᵁᴸᴸ │ 123-45-67 │ 123 │ │ client 2 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ @@ -119,25 +109,21 @@ coalesce(x,...) 
Получим из адресной книги первый доступный способ связаться с клиентом: +```sql +SELECT coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook ``` -:) SELECT coalesce(mail, phone, CAST(icq,'Nullable(String)')) FROM aBook - -SELECT coalesce(mail, phone, CAST(icq, 'Nullable(String)')) -FROM aBook - +```text ┌─name─────┬─coalesce(mail, phone, CAST(icq, 'Nullable(String)'))─┐ │ client 1 │ 123-45-67 │ │ client 2 │ ᴺᵁᴸᴸ │ └──────────┴──────────────────────────────────────────────────────┘ - -2 rows in set. Elapsed: 0.006 sec. ``` ## ifNull Возвращает альтернативное значение, если основной аргумент — `NULL`. -``` +```sql ifNull(x,alt) ``` @@ -153,16 +139,18 @@ ifNull(x,alt) **Пример** -``` +```sql SELECT ifNull('a', 'b') - +``` +```text ┌─ifNull('a', 'b')─┐ │ a │ └──────────────────┘ ``` -``` +```sql SELECT ifNull(NULL, 'b') - +``` +```text ┌─ifNull(NULL, 'b')─┐ │ b │ └───────────────────┘ @@ -172,7 +160,7 @@ SELECT ifNull(NULL, 'b') Возвращает `NULL`, если аргументы равны. -``` +```sql nullIf(x, y) ``` @@ -187,16 +175,18 @@ nullIf(x, y) **Пример** -``` +```sql SELECT nullIf(1, 1) - +``` +```text ┌─nullIf(1, 1)─┐ │ ᴺᵁᴸᴸ │ └──────────────┘ ``` -``` +```sql SELECT nullIf(1, 2) - +``` +```text ┌─nullIf(1, 2)─┐ │ 1 │ └──────────────┘ @@ -206,7 +196,7 @@ SELECT nullIf(1, 2) Приводит значение типа [Nullable](../../data_types/nullable.md) к не `Nullable`, если значение не `NULL`. -``` +```sql assumeNotNull(x) ``` @@ -223,14 +213,15 @@ assumeNotNull(x) Рассмотрим таблицу `t_null`. -``` +```sql SHOW CREATE TABLE t_null - +``` +```text ┌─statement─────────────────────────────────────────────────────────────────┐ │ CREATE TABLE default.t_null ( x Int8, y Nullable(Int8)) ENGINE = TinyLog │ └───────────────────────────────────────────────────────────────────────────┘ ``` -``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -239,17 +230,19 @@ SHOW CREATE TABLE t_null Применим функцию `assumeNotNull` к столбцу `y`. -``` +```sql SELECT assumeNotNull(y) FROM t_null - +``` +```text ┌─assumeNotNull(y)─┐ │ 0 │ │ 3 │ └──────────────────┘ ``` -``` +```sql SELECT toTypeName(assumeNotNull(y)) FROM t_null - +``` +```text ┌─toTypeName(assumeNotNull(y))─┐ │ Int8 │ │ Int8 │ @@ -260,7 +253,7 @@ SELECT toTypeName(assumeNotNull(y)) FROM t_null Преобразует тип аргумента к `Nullable`. -``` +```sql toNullable(x) ``` @@ -274,15 +267,18 @@ toNullable(x) **Пример** -``` +```sql SELECT toTypeName(10) - +``` +```text ┌─toTypeName(10)─┐ │ UInt8 │ └────────────────┘ - +``` +```sql SELECT toTypeName(toNullable(10)) - +``` +```text ┌─toTypeName(toNullable(10))─┐ │ Nullable(UInt8) │ └────────────────────────────┘ diff --git a/docs/ru/query_language/functions/geo.md b/docs/ru/query_language/functions/geo.md index 63ceae9208e..55789c93ab4 100644 --- a/docs/ru/query_language/functions/geo.md +++ b/docs/ru/query_language/functions/geo.md @@ -4,7 +4,7 @@ Вычисляет расстояние между двумя точками на поверхности Земли по [формуле большого круга](https://en.wikipedia.org/wiki/Great-circle_distance). 
-``` +```sql greatCircleDistance(lon1Deg, lat1Deg, lon2Deg, lat2Deg) ``` @@ -29,7 +29,7 @@ greatCircleDistance(lon1Deg, lat1Deg, lon2Deg, lat2Deg) SELECT greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673) ``` -``` +```text ┌─greatCircleDistance(55.755831, 37.617673, -55.755831, -37.617673)─┐ │ 14132374.194975413 │ └───────────────────────────────────────────────────────────────────┘ @@ -62,7 +62,7 @@ pointInEllipses(x, y, x₀, y₀, a₀, b₀,...,xₙ, yₙ, aₙ, bₙ) SELECT pointInEllipses(10., 10., 10., 9.1, 1., 0.9999) ``` -``` +```text ┌─pointInEllipses(10., 10., 10., 9.1, 1., 0.9999)─┐ │ 1 │ └─────────────────────────────────────────────────┘ @@ -72,7 +72,7 @@ SELECT pointInEllipses(10., 10., 10., 9.1, 1., 0.9999) Проверяет, принадлежит ли точка многоугольнику на плоскости. -``` +```sql pointInPolygon((x, y), [(a, b), (c, d) ...], ...) ``` @@ -93,7 +93,7 @@ pointInPolygon((x, y), [(a, b), (c, d) ...], ...) SELECT pointInPolygon((3., 3.), [(6, 0), (8, 4), (5, 8), (0, 2)]) AS res ``` -``` +```text ┌─res─┐ │ 1 │ └─────┘ @@ -103,7 +103,7 @@ SELECT pointInPolygon((3., 3.), [(6, 0), (8, 4), (5, 8), (0, 2)]) AS res Кодирует широту и долготу в строку geohash, смотрите [http://geohash.org/](http://geohash.org/), [https://en.wikipedia.org/wiki/Geohash](https://en.wikipedia.org/wiki/Geohash). -``` +```sql geohashEncode(longitude, latitude, [precision]) ``` @@ -123,7 +123,7 @@ geohashEncode(longitude, latitude, [precision]) SELECT geohashEncode(-5.60302734375, 42.593994140625, 0) AS res ``` -``` +```text ┌─res──────────┐ │ ezs42d000000 │ └──────────────┘ @@ -133,7 +133,7 @@ SELECT geohashEncode(-5.60302734375, 42.593994140625, 0) AS res Декодирует любую строку, закодированную в geohash, на долготу и широту. -``` +```sql geohashDecode(geohash_string) ``` @@ -151,7 +151,7 @@ geohashDecode(geohash_string) SELECT geohashDecode('ezs42') AS res ``` -``` +```text ┌─res─────────────────────────────┐ │ (-5.60302734375,42.60498046875) │ └─────────────────────────────────┘ @@ -161,7 +161,7 @@ SELECT geohashDecode('ezs42') AS res Получает H3 индекс точки `(lon, lat)` с заданным разрешением -``` +```sql geoToH3(lon, lat, resolution) ``` @@ -180,10 +180,10 @@ geoToH3(lon, lat, resolution) **Пример** -``` sql +```sql SELECT geoToH3(37.79506683, 55.71290588, 15) as h3Index ``` -``` +```text ┌────────────h3Index─┐ │ 644325524701193974 │ └────────────────────┘ diff --git a/docs/ru/query_language/functions/hash_functions.md b/docs/ru/query_language/functions/hash_functions.md index e171b2bfa38..96c3e5cae53 100644 --- a/docs/ru/query_language/functions/hash_functions.md +++ b/docs/ru/query_language/functions/hash_functions.md @@ -6,7 +6,7 @@ [Интерпретирует](../../query_language/functions/type_conversion_functions.md#type_conversion_functions-reinterpretAsString) все входные параметры как строки и вычисляет хэш [MD5](https://ru.wikipedia.org/wiki/MD5) для каждой из них. Затем объединяет хэши, берет первые 8 байт хэша результирующей строки и интерпретирует их как значение типа `UInt64` с big-endian порядком байтов. -``` +```sql halfMD5(par1, ...) ``` @@ -43,7 +43,7 @@ SELECT halfMD5(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00') Генерирует 64-х битное значение [SipHash](https://131002.net/siphash/). -``` +```sql sipHash64(par1,...) ``` @@ -70,7 +70,7 @@ sipHash64(par1,...) 
SELECT sipHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')) AS SipHash, toTypeName(SipHash) AS type ``` -``` +```text ┌──────────────SipHash─┬─type───┐ │ 13726873534472839665 │ UInt64 │ └──────────────────────┴────────┘ @@ -86,7 +86,7 @@ SELECT sipHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00 Генерирует 64-х битное значение [CityHash](https://github.com/google/cityhash). -``` +```sql cityHash64(par1,...) ``` @@ -151,7 +151,7 @@ SELECT groupBitXor(cityHash64(*)) FROM table Генерирует 64-х битное значение [FarmHash](https://github.com/google/farmhash). -``` +```sql farmHash64(par1, ...) ``` @@ -192,7 +192,7 @@ HiveHash — это результат [JavaHash](#hash_functions-javahash) с Генерирует 64-х битное значение [MetroHash](http://www.jandrewrogers.com/2015/05/27/metrohash/). -``` +```sql metroHash64(par1, ...) ``` @@ -226,7 +226,7 @@ SELECT metroHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00: Генерирует значение [MurmurHash2](https://github.com/aappleby/smhasher). -``` +```sql murmurHash2_32(par1, ...) murmurHash2_64(par1, ...) ``` @@ -256,7 +256,7 @@ SELECT murmurHash2_64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23: Генерирует значение [MurmurHash3](https://github.com/aappleby/smhasher). -``` +```sql murmurHash3_32(par1, ...) murmurHash3_64(par1, ...) ``` @@ -286,7 +286,7 @@ SELECT murmurHash3_32(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23: Генерирует значение [MurmurHash3](https://github.com/aappleby/smhasher). -``` +```sql murmurHash3_128( expr ) ``` diff --git a/docs/ru/query_language/functions/higher_order_functions.md b/docs/ru/query_language/functions/higher_order_functions.md index aea793d27b4..b64a561f58a 100644 --- a/docs/ru/query_language/functions/higher_order_functions.md +++ b/docs/ru/query_language/functions/higher_order_functions.md @@ -25,9 +25,10 @@ Примеры: -``` sql +```sql SELECT arrayMap(x -> (x + 2), [1, 2, 3]) as res; - +``` +```text ┌─res─────┐ │ [3,4,5] │ └─────────┘ @@ -35,9 +36,10 @@ SELECT arrayMap(x -> (x + 2), [1, 2, 3]) as res; Следующий пример показывает, как создать кортежи из элементов разных массивов: -``` sql +```sql SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res - +``` +```text ┌─res─────────────────┐ │ [(1,4),(2,5),(3,6)] │ └─────────────────────┘ @@ -51,17 +53,17 @@ SELECT arrayMap((x, y) -> (x, y), [1, 2, 3], [4, 5, 6]) AS res Примеры: -``` sql +```sql SELECT arrayFilter(x -> x LIKE '%World%', ['Hello', 'abc World']) AS res ``` -``` +```text ┌─res───────────┐ │ ['abc World'] │ └───────────────┘ ``` -``` sql +```sql SELECT arrayFilter( (i, x) -> x LIKE '%World%', @@ -70,7 +72,7 @@ SELECT AS res ``` -``` +```text ┌─res─┐ │ [2] │ └─────┘ @@ -107,11 +109,11 @@ SELECT Пример: -``` sql +```sql SELECT arrayCumSum([1, 1, 1, 1]) AS res ``` -``` +```text ┌─res──────────┐ │ [1, 2, 3, 4] │ └──────────────┘ @@ -126,11 +128,11 @@ SELECT arrayCumSum([1, 1, 1, 1]) AS res Пример: -``` sql +```sql SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]); ``` -``` +```text ┌─res────────────────┐ │ ['world', 'hello'] │ └────────────────────┘ @@ -144,10 +146,10 @@ SELECT arraySort((x, y) -> y, ['hello', 'world'], [2, 1]); Пример: -``` sql +```sql SELECT arrayReverseSort((x, y) -> y, ['hello', 'world'], [2, 1]) as res; ``` -``` sql +```text ┌─res───────────────┐ │ ['hello','world'] │ └───────────────────┘ diff --git a/docs/ru/query_language/functions/ip_address_functions.md b/docs/ru/query_language/functions/ip_address_functions.md index 57c11b46d81..5b1c5e32172 
100644 --- a/docs/ru/query_language/functions/ip_address_functions.md +++ b/docs/ru/query_language/functions/ip_address_functions.md @@ -24,7 +24,7 @@ ORDER BY c DESC LIMIT 10 ``` -``` +```text ┌─k──────────────┬─────c─┐ │ 83.149.9.xxx │ 26238 │ │ 217.118.81.xxx │ 26074 │ @@ -50,7 +50,7 @@ IPv6-mapped IPv4 адреса выводится в формате ::ffff:111.22 SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr ``` -``` +```text ┌─addr─────────┐ │ 2a02:6b8::11 │ └──────────────┘ @@ -67,7 +67,7 @@ ORDER BY c DESC LIMIT 10 ``` -``` +```text ┌─IPv6NumToString(ClientIP6)──────────────┬─────c─┐ │ 2a02:2168:aaa:bbbb::2 │ 24695 │ │ 2a02:2698:abcd:abcd:abcd:abcd:8888:5555 │ 22408 │ @@ -93,7 +93,7 @@ ORDER BY c DESC LIMIT 10 ``` -``` +```text ┌─IPv6NumToString(ClientIP6)─┬──────c─┐ │ ::ffff:94.26.111.111 │ 747440 │ │ ::ffff:37.143.222.4 │ 529483 │ @@ -121,7 +121,7 @@ HEX может быть в любом регистре. SELECT IPv6NumToString(IPv4ToIPv6(IPv4StringToNum('192.168.0.1'))) AS addr ``` -``` +```text ┌─addr───────────────┐ │ ::ffff:192.168.0.1 │ └────────────────────┘ @@ -140,7 +140,7 @@ SELECT cutIPv6(ipv4, 0, 2) ``` -``` +```text ┌─cutIPv6(ipv6, 2, 0)─────────────────┬─cutIPv6(ipv4, 0, 2)─┐ │ 2001:db8:ac10:fe01:feed:babe:cafe:0 │ ::ffff:192.168.0.0 │ └─────────────────────────────────────┴─────────────────────┘ @@ -184,7 +184,7 @@ SELECT toTypeName(toIPv4(IPv4_string)) ``` -``` +```text ┌─toTypeName(IPv4StringToNum(IPv4_string))─┬─toTypeName(toIPv4(IPv4_string))─┐ │ UInt32 │ IPv4 │ └──────────────────────────────────────────┴─────────────────────────────────┘ @@ -198,7 +198,7 @@ SELECT hex(toIPv4(IPv4_string)) ``` -``` +```text ┌─hex(IPv4StringToNum(IPv4_string))─┬─hex(toIPv4(IPv4_string))─┐ │ ABE1822D │ ABE1822D │ └───────────────────────────────────┴──────────────────────────┘ @@ -216,7 +216,7 @@ SELECT toTypeName(toIPv6(IPv6_string)) ``` -``` +```text ┌─toTypeName(IPv6StringToNum(IPv6_string))─┬─toTypeName(toIPv6(IPv6_string))─┐ │ FixedString(16) │ IPv6 │ └──────────────────────────────────────────┴─────────────────────────────────┘ @@ -230,7 +230,7 @@ SELECT hex(toIPv6(IPv6_string)) ``` -``` +```text ┌─hex(IPv6StringToNum(IPv6_string))─┬─hex(toIPv6(IPv6_string))─────────┐ │ 20010438FFFF000000000000407D1BC1 │ 20010438FFFF000000000000407D1BC1 │ └───────────────────────────────────┴──────────────────────────────────┘ diff --git a/docs/ru/query_language/functions/json_functions.md b/docs/ru/query_language/functions/json_functions.md index d1794112b41..d1de97ef10e 100644 --- a/docs/ru/query_language/functions/json_functions.md +++ b/docs/ru/query_language/functions/json_functions.md @@ -35,7 +35,7 @@ Примеры: -``` +```sql visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"' visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}' ``` @@ -46,7 +46,7 @@ visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}' Примеры: -``` +```sql visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺' visitParamExtractString('{"abc":"\\u263"}', 'abc') = '' @@ -65,9 +65,9 @@ visitParamExtractString('{"abc":"hello}', 'abc') = '' Примеры: -``` -select JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 1 -select JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4) = 0 +```sql +SELECT JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 1 +SELECT JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4) = 0 ``` `indices_or_keys` — это список из нуля или более 
аргументов каждый из них может быть либо строкой либо целым числом. @@ -82,12 +82,12 @@ select JSONHas('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 4) = 0 Примеры: -``` -select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'a' -select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 2) = 'b' -select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -1) = 'b' -select JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a' -select JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' +```sql +SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'a' +SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', 2) = 'b' +SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -1) = 'b' +SELECT JSONExtractKey('{"a": "hello", "b": [-100, 200.0, 300]}', -2) = 'a' +SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' ``` ## JSONLength(json[, indices_or_keys]...) @@ -98,9 +98,9 @@ select JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 1) = 'hello' Примеры: -``` -select JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3 -select JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 +```sql +SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 3 +SELECT JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 ``` ## JSONType(json[, indices_or_keys]...) @@ -111,10 +111,10 @@ select JSONLength('{"a": "hello", "b": [-100, 200.0, 300]}') = 2 Примеры: -``` -select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}') = 'Object' -select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String' -select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' +```sql +SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}') = 'Object' +SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'String' +SELECT JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' ``` ## JSONExtractUInt(json[, indices_or_keys]...) @@ -131,10 +131,10 @@ select JSONType('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = 'Array' Примеры: -``` -select JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100 -select JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200.0 -select JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 +```sql +SELECT JSONExtractInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 1) = -100 +SELECT JSONExtractFloat('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 2) = 200.0 +SELECT JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 ``` ## JSONExtractString(json[, indices_or_keys]...) 
@@ -147,12 +147,12 @@ select JSONExtractUInt('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', -1) = 300 Примеры: -``` -select JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'hello' -select JSONExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' -select JSONExtractString('{"abc":"\\u263a"}', 'abc') = '☺' -select JSONExtractString('{"abc":"\\u263"}', 'abc') = '' -select JSONExtractString('{"abc":"hello}', 'abc') = '' +```sql +SELECT JSONExtractString('{"a": "hello", "b": [-100, 200.0, 300]}', 'a') = 'hello' +SELECT JSONExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' +SELECT JSONExtractString('{"abc":"\\u263a"}', 'abc') = '☺' +SELECT JSONExtractString('{"abc":"\\u263"}', 'abc') = '' +SELECT JSONExtractString('{"abc":"hello}', 'abc') = '' ``` ## JSONExtract(json[, indices_or_keys...], return_type) @@ -166,7 +166,7 @@ select JSONExtractString('{"abc":"hello}', 'abc') = '' Примеры: -``` +```sql SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(String, Array(Float64))') = ('hello',[-100,200,300]) SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'Tuple(b Array(Float64), a String)') = ([-100,200,300],'hello') SELECT JSONExtract('{"a": "hello", "b": [-100, 200.0, 300]}', 'b', 'Array(Nullable(Int8))') = [-100, NULL, NULL] @@ -182,7 +182,7 @@ SELECT JSONExtract('{"day": 5}', 'day', 'Enum8(\'Sunday\' = 0, \'Monday\' = 1, \ Пример: -``` +```sql SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') = [('a',5),('b',7),('c',11)]; ``` @@ -194,8 +194,8 @@ SELECT JSONExtractKeysAndValues('{"x": {"a": 5, "b": 7, "c": 11}}', 'x', 'Int8') Пример: -``` -select JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]' +```sql +SELECT JSONExtractRaw('{"a": "hello", "b": [-100, 200.0, 300]}', 'b') = '[-100, 200.0, 300]' ``` [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/json_functions/) diff --git a/docs/ru/query_language/functions/math_functions.md b/docs/ru/query_language/functions/math_functions.md index 0470a469780..92feceb3a95 100644 --- a/docs/ru/query_language/functions/math_functions.md +++ b/docs/ru/query_language/functions/math_functions.md @@ -39,11 +39,11 @@ Пример (правило трёх сигм): -``` sql +```sql SELECT erf(3 / sqrt(2)) ``` -``` +```text ┌─erf(divide(3, sqrt(2)))─┐ │ 0.9973002039367398 │ └─────────────────────────┘ diff --git a/docs/ru/query_language/functions/other_functions.md b/docs/ru/query_language/functions/other_functions.md index bfb19bb64b1..987840cac99 100644 --- a/docs/ru/query_language/functions/other_functions.md +++ b/docs/ru/query_language/functions/other_functions.md @@ -8,7 +8,7 @@ Извлекает конечную часть строки после последнего слэша или бэкслэша. Функция часто используется для извлечения имени файла из пути. -``` +```sql basename( expr ) ``` @@ -65,9 +65,11 @@ SELECT 'some-file-name' AS a, basename(a) `NULL` представляется как строка, соответствующая отображению `NULL` в форматах `Pretty`. 
-``` +```sql SELECT visibleWidth(NULL) +``` +```text ┌─visibleWidth(NULL)─┐ │ 4 │ └────────────────────┘ @@ -131,7 +133,7 @@ SELECT visibleWidth(NULL) Пример: -``` sql +```sql SELECT toHour(EventTime) AS h, count() AS c, @@ -141,7 +143,7 @@ GROUP BY h ORDER BY h ASC ``` -``` +```text ┌──h─┬──────c─┬─bar────────────────┐ │ 0 │ 292907 │ █████████▋ │ │ 1 │ 180563 │ ██████ │ @@ -199,7 +201,7 @@ ORDER BY h ASC Пример: -``` sql +```sql SELECT transform(SearchEngineID, [2, 3], ['Yandex', 'Google'], 'Other') AS title, count() AS c @@ -209,7 +211,7 @@ GROUP BY title ORDER BY c DESC ``` -``` +```text ┌─title─────┬──────c─┐ │ Yandex │ 498635 │ │ Google │ 229872 │ @@ -228,7 +230,7 @@ ORDER BY c DESC Пример: -``` sql +```sql SELECT transform(domain(Referer), ['yandex.ru', 'google.ru', 'vk.com'], ['www.yandex', 'example.com']) AS s, count() AS c @@ -238,7 +240,7 @@ ORDER BY count() DESC LIMIT 10 ``` -``` +```text ┌─s──────────────┬───────c─┐ │ │ 2906259 │ │ www.yandex │ 867767 │ @@ -257,13 +259,13 @@ LIMIT 10 Пример: -``` sql +```sql SELECT arrayJoin([1, 1024, 1024*1024, 192851925]) AS filesize_bytes, formatReadableSize(filesize_bytes) AS filesize ``` -``` +```text ┌─filesize_bytes─┬─filesize───┐ │ 1 │ 1.00 B │ │ 1024 │ 1.00 KiB │ @@ -302,7 +304,7 @@ SELECT Если значение `offset` выходит за пределы блока данных, то берётся значение по-умолчанию для колонки `column`. Если передан параметр `default_value`, то значение берётся из него. Например, эта функция может использоваться чтобы оценить year-over-year значение показателя: -``` sql +```sql WITH toDate('2018-01-01') AS start_date SELECT toStartOfMonth(start_date + (number * 32)) AS month, @@ -312,7 +314,7 @@ SELECT FROM numbers(16) ``` -``` +```text ┌──────month─┬─money─┬─prev_year─┬─year_over_year─┐ │ 2018-01-01 │ 32 │ 0 │ 0 │ │ 2018-02-01 │ 63 │ 0 │ 0 │ @@ -342,7 +344,7 @@ FROM numbers(16) Пример: -``` sql +```sql SELECT EventID, EventTime, @@ -359,7 +361,7 @@ FROM ) ``` -``` +```text ┌─EventID─┬───────────EventTime─┬─delta─┐ │ 1106 │ 2016-11-24 00:00:04 │ 0 │ │ 1107 │ 2016-11-24 00:00:05 │ 1 │ @@ -371,21 +373,22 @@ FROM Обратите внимание — размер блока влияет на результат. С каждым новым блоком состояние `runningDifference` сбрасывается. -``` sql +```sql SELECT number, runningDifference(number + 1) AS diff FROM numbers(100000) WHERE diff != 1 - +``` +```text ┌─number─┬─diff─┐ │ 0 │ 0 │ └────────┴──────┘ ┌─number─┬─diff─┐ │ 65536 │ 0 │ └────────┴──────┘ - - +``` +``` set max_block_size=100000 -- по умолчанию 65536! SELECT @@ -393,7 +396,8 @@ SELECT runningDifference(number + 1) AS diff FROM numbers(100000) WHERE diff != 1 - +``` +```text ┌─number─┬─diff─┐ │ 0 │ 0 │ └────────┴──────┘ @@ -415,7 +419,7 @@ WHERE diff != 1 Возвращает количество полей в [Enum](../../data_types/enum.md). -``` +```sql getSizeOfEnumType(value) ``` @@ -431,9 +435,11 @@ getSizeOfEnumType(value) **Пример** -``` +```sql SELECT getSizeOfEnumType( CAST('a' AS Enum8('a' = 1, 'b' = 2) ) ) AS x +``` +```text ┌─x─┐ │ 2 │ └───┘ @@ -443,7 +449,7 @@ SELECT getSizeOfEnumType( CAST('a' AS Enum8('a' = 1, 'b' = 2) ) ) AS x Возвращает имя класса, которым представлен тип данных столбца в оперативной памяти. 
-``` +```sql toColumnTypeName(value) ``` @@ -457,21 +463,20 @@ toColumnTypeName(value) **Пример разницы между `toTypeName` и `toColumnTypeName`** +```sql +SELECT toTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) ``` -:) select toTypeName(cast('2018-01-01 01:02:03' AS DateTime)) - -SELECT toTypeName(CAST('2018-01-01 01:02:03', 'DateTime')) +```text ┌─toTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ │ DateTime │ └─────────────────────────────────────────────────────┘ +``` +```sql +SELECT toColumnTypeName(CAST('2018-01-01 01:02:03' AS DateTime)) +``` -1 rows in set. Elapsed: 0.008 sec. - -:) select toColumnTypeName(cast('2018-01-01 01:02:03' AS DateTime)) - -SELECT toColumnTypeName(CAST('2018-01-01 01:02:03', 'DateTime')) - +```text ┌─toColumnTypeName(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ │ Const(UInt32) │ └───────────────────────────────────────────────────────────┘ @@ -483,7 +488,7 @@ SELECT toColumnTypeName(CAST('2018-01-01 01:02:03', 'DateTime')) Выводит развернутое описание структур данных в оперативной памяти -``` +```sql dumpColumnStructure(value) ``` @@ -497,9 +502,11 @@ dumpColumnStructure(value) **Пример** -``` +```sql SELECT dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime')) +``` +```text ┌─dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime'))─┐ │ DateTime, Const(size = 1, UInt32(size = 1)) │ └──────────────────────────────────────────────────────────────┘ @@ -511,7 +518,7 @@ SELECT dumpColumnStructure(CAST('2018-01-01 01:02:03', 'DateTime')) Не учитывает значения по умолчанию для столбцов, заданные пользователем. -``` +```sql defaultValueOfArgumentType(expression) ``` @@ -527,26 +534,23 @@ defaultValueOfArgumentType(expression) **Пример** +```sql +SELECT defaultValueOfArgumentType( CAST(1 AS Int8) ) ``` -:) SELECT defaultValueOfArgumentType( CAST(1 AS Int8) ) - -SELECT defaultValueOfArgumentType(CAST(1, 'Int8')) +```text ┌─defaultValueOfArgumentType(CAST(1, 'Int8'))─┐ │ 0 │ └─────────────────────────────────────────────┘ +``` +```sql +SELECT defaultValueOfArgumentType( CAST(1 AS Nullable(Int8) ) ) +``` -1 rows in set. Elapsed: 0.002 sec. - -:) SELECT defaultValueOfArgumentType( CAST(1 AS Nullable(Int8) ) ) - -SELECT defaultValueOfArgumentType(CAST(1, 'Nullable(Int8)')) - +```text ┌─defaultValueOfArgumentType(CAST(1, 'Nullable(Int8)'))─┐ │ ᴺᵁᴸᴸ │ └───────────────────────────────────────────────────────┘ - -1 rows in set. Elapsed: 0.002 sec. ``` ## indexHint @@ -565,9 +569,11 @@ SELECT defaultValueOfArgumentType(CAST(1, 'Nullable(Int8)')) Рассмотрим таблицу с тестовыми данными [ontime](../../getting_started/example_datasets/ontime.md). -``` +```sql SELECT count() FROM ontime +``` +```text ┌─count()─┐ │ 4276457 │ └─────────┘ @@ -577,16 +583,11 @@ SELECT count() FROM ontime Выполним выборку по дате следующим образом: +```sql +SELECT FlightDate AS k, count() FROM ontime GROUP BY k ORDER BY k ``` -:) SELECT FlightDate AS k, count() FROM ontime GROUP BY k ORDER BY k - -SELECT - FlightDate AS k, - count() -FROM ontime -GROUP BY k -ORDER BY k ASC +```text ┌──────────k─┬─count()─┐ │ 2017-01-01 │ 13970 │ │ 2017-01-02 │ 15882 │ @@ -595,28 +596,18 @@ ORDER BY k ASC │ 2017-09-29 │ 16384 │ │ 2017-09-30 │ 12520 │ └────────────┴─────────┘ - -273 rows in set. Elapsed: 0.072 sec. Processed 4.28 million rows, 8.55 MB (59.00 million rows/s., 118.01 MB/s.) ``` В этой выборке индекс не используется и ClickHouse обработал всю таблицу (`Processed 4.28 million rows`). 
Для подключения индекса выберем конкретную дату и выполним следующий запрос: +```sql +SELECT FlightDate AS k, count() FROM ontime WHERE k = '2017-09-15' GROUP BY k ORDER BY k ``` -:) SELECT FlightDate AS k, count() FROM ontime WHERE k = '2017-09-15' GROUP BY k ORDER BY k - -SELECT - FlightDate AS k, - count() -FROM ontime -WHERE k = '2017-09-15' -GROUP BY k -ORDER BY k ASC +```text ┌──────────k─┬─count()─┐ │ 2017-09-15 │ 16428 │ └────────────┴─────────┘ - -1 rows in set. Elapsed: 0.014 sec. Processed 32.74 thousand rows, 65.49 KB (2.31 million rows/s., 4.63 MB/s.) ``` В последней строке выдачи видно, что благодаря использованию индекса, ClickHouse обработал значительно меньшее количество строк (`Processed 32.74 thousand rows`). @@ -624,9 +615,7 @@ ORDER BY k ASC Теперь передадим выражение `k = '2017-09-15'` в функцию `indexHint`: -``` -:) SELECT FlightDate AS k, count() FROM ontime WHERE indexHint(k = '2017-09-15') GROUP BY k ORDER BY k - +```sql SELECT FlightDate AS k, count() @@ -634,15 +623,15 @@ FROM ontime WHERE indexHint(k = '2017-09-15') GROUP BY k ORDER BY k ASC +``` +```text ┌──────────k─┬─count()─┐ │ 2017-09-14 │ 7071 │ │ 2017-09-15 │ 16428 │ │ 2017-09-16 │ 1077 │ │ 2017-09-30 │ 8167 │ └────────────┴─────────┘ - -4 rows in set. Elapsed: 0.004 sec. Processed 32.74 thousand rows, 65.49 KB (8.97 million rows/s., 17.94 MB/s.) ``` В ответе на запрос видно, что ClickHouse применил индекс таким же образом, что и в предыдущий раз (`Processed 32.74 thousand rows`). Однако по результирующему набору строк видно, что выражение `k = '2017-09-15'` не использовалось при формировании результата. @@ -655,7 +644,7 @@ ORDER BY k ASC Используется для внутренней реализации [arrayJoin](array_join.md#functions_arrayjoin). -``` +```sql replicate(x, arr) ``` @@ -670,9 +659,11 @@ replicate(x, arr) **Пример** -``` +```sql SELECT replicate(1, ['a', 'b', 'c']) +``` +```text ┌─replicate(1, ['a', 'b', 'c'])─┐ │ [1,1,1] │ └───────────────────────────────┘ @@ -682,7 +673,7 @@ SELECT replicate(1, ['a', 'b', 'c']) Возвращает объем оставшегося места в файловой системе, в которой расположены файлы баз данных. Смотрите описание конфигурационного параметра сервера [path](../../operations/server_settings/settings.md#server_settings-path). -``` +```sql filesystemAvailable() ``` @@ -733,7 +724,8 @@ custom_message - необязательный параметр, констант ```sql SELECT throwIf(number = 3, 'Too many') FROM numbers(10); - +``` +```text ↙ Progress: 0.00 rows, 0.00 B (0.00 rows/s., 0.00 B/s.) Received exception from server (version 19.14.1): Code: 395. DB::Exception: Received from localhost:9000. DB::Exception: Too many. ``` @@ -744,7 +736,9 @@ Code: 395. DB::Exception: Received from localhost:9000. DB::Exception: Too many. ```sql SELECT identity(42) +``` +```text ┌─identity(42)─┐ │ 42 │ └──────────────┘ diff --git a/docs/ru/query_language/functions/rounding_functions.md b/docs/ru/query_language/functions/rounding_functions.md index 498498266e4..61bf6d94419 100644 --- a/docs/ru/query_language/functions/rounding_functions.md +++ b/docs/ru/query_language/functions/rounding_functions.md @@ -22,7 +22,7 @@ N может быть отрицательным. Функция возвращает ближайшее значение указанного порядка. В случае, когда заданное число равноудалено от чисел необходимого порядка, функция возвращает то из них, которое имеет ближайшую чётную цифру (банковское округление). 
-``` +```sql round(expression [, decimal_places]) ``` @@ -43,10 +43,10 @@ round(expression [, decimal_places]) **Пример использования** -``` sql +```sql SELECT number / 2 AS x, round(x) FROM system.numbers LIMIT 3 ``` -``` +```text ┌───x─┬─round(divide(number, 2))─┐ │ 0 │ 0 │ │ 0.5 │ 0 │ @@ -58,7 +58,7 @@ SELECT number / 2 AS x, round(x) FROM system.numbers LIMIT 3 Округление до ближайшего числа. -``` +```text round(3.2, 0) = 3 round(4.1267, 2) = 4.13 round(22,-1) = 20 @@ -68,7 +68,7 @@ round(-467,-2) = -500 Банковское округление. -``` +```text round(3.5) = 4 round(4.5) = 4 round(3.55, 1) = 3.6 diff --git a/docs/ru/query_language/functions/splitting_merging_functions.md b/docs/ru/query_language/functions/splitting_merging_functions.md index 8561048b864..8d9c6aef14f 100644 --- a/docs/ru/query_language/functions/splitting_merging_functions.md +++ b/docs/ru/query_language/functions/splitting_merging_functions.md @@ -19,9 +19,11 @@ separator - необязательный параметр, константна **Пример:** -``` +```sql SELECT alphaTokens('abca1abc') +``` +```text ┌─alphaTokens('abca1abc')─┐ │ ['abca','abc'] │ └─────────────────────────┘ diff --git a/docs/ru/query_language/functions/string_functions.md b/docs/ru/query_language/functions/string_functions.md index cc6563dacd5..f514ac1cbd3 100644 --- a/docs/ru/query_language/functions/string_functions.md +++ b/docs/ru/query_language/functions/string_functions.md @@ -45,7 +45,7 @@ Заменяет некорректные символы UTF-8 на символ `�` (U+FFFD). Все идущие подряд некорректные символы схлопываются в один заменяющий символ. -``` +```sql toValidUTF8( input_string ) ``` @@ -80,13 +80,16 @@ SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') ```sql SELECT format('{1} {0} {1}', 'World', 'Hello') - +``` +```text ┌─format('{1} {0} {1}', 'World', 'Hello')─┐ │ Hello World Hello │ └─────────────────────────────────────────┘ - +``` +```sql SELECT format('{} {}', 'Hello', 'World') - +``` +```text ┌─format('{} {}', 'Hello', 'World')─┐ │ Hello World │ └───────────────────────────────────┘ diff --git a/docs/ru/query_language/functions/string_replace_functions.md b/docs/ru/query_language/functions/string_replace_functions.md index 498b321fb1b..0c4cb9923f2 100644 --- a/docs/ru/query_language/functions/string_replace_functions.md +++ b/docs/ru/query_language/functions/string_replace_functions.md @@ -17,7 +17,7 @@ Пример 1. Переведём дату в американский формат: -``` sql +```sql SELECT DISTINCT EventDate, replaceRegexpOne(toString(EventDate), '(\\d{4})-(\\d{2})-(\\d{2})', '\\2/\\3/\\1') AS res @@ -26,7 +26,7 @@ LIMIT 7 FORMAT TabSeparated ``` -``` +```text 2014-03-17 03/17/2014 2014-03-18 03/18/2014 2014-03-19 03/19/2014 @@ -38,11 +38,11 @@ FORMAT TabSeparated Пример 2. Размножить строку десять раз: -``` sql +```sql SELECT replaceRegexpOne('Hello, World!', '.*', '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0') AS res ``` -``` +```text ┌─res────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ │ Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World! │ └────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ @@ -51,11 +51,11 @@ SELECT replaceRegexpOne('Hello, World!', '.*', '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0') ## replaceRegexpAll(haystack, pattern, replacement) То же самое, но делается замена всех вхождений. 
Пример: -``` sql +```sql SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res ``` -``` +```text ┌─res────────────────────────┐ │ HHeelllloo,, WWoorrlldd!! │ └────────────────────────────┘ @@ -64,11 +64,11 @@ SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res В качестве исключения, если регулярное выражение сработало на пустой подстроке, то замена делается не более одного раза. Пример: -``` sql +```sql SELECT replaceRegexpAll('Hello, World!', '^', 'here: ') AS res ``` -``` +```text ┌─res─────────────────┐ │ here: Hello, World! │ └─────────────────────┘ diff --git a/docs/ru/query_language/functions/type_conversion_functions.md b/docs/ru/query_language/functions/type_conversion_functions.md index 8635ea089e0..72354e77f8c 100644 --- a/docs/ru/query_language/functions/type_conversion_functions.md +++ b/docs/ru/query_language/functions/type_conversion_functions.md @@ -116,7 +116,7 @@ SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val) Форматы даты и даты-с-временем для функций toDate/toDateTime определены следующим образом: -``` +```text YYYY-MM-DD YYYY-MM-DD hh:mm:ss ``` @@ -135,7 +135,7 @@ SELECT toString(now(), 'Asia/Yekaterinburg') AS now_yekat ``` -``` +```text ┌───────────now_local─┬─now_yekat───────────┐ │ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ └─────────────────────┴─────────────────────┘ @@ -158,7 +158,7 @@ SELECT SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut ``` -``` +```text ┌─s─────────────┬─s_cut─┐ │ foo\0\0\0\0\0 │ foo │ └───────────────┴───────┘ @@ -168,7 +168,7 @@ SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut ``` -``` +```text ┌─s──────────┬─s_cut─┐ │ foo\0bar\0 │ foo │ └────────────┴───────┘ @@ -202,7 +202,7 @@ SELECT CAST(timestamp, 'FixedString(22)') AS fixed_string ``` -``` +```text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ └─────────────────────┴─────────────────────┴────────────┴─────────────────────┴───────────────────────────┘ @@ -212,16 +212,19 @@ SELECT Поддержано преобразование к типу [Nullable](../../data_types/nullable.md) и обратно. Пример: -``` +```sql SELECT toTypeName(x) FROM t_null - +``` +```text ┌─toTypeName(x)─┐ │ Int8 │ │ Int8 │ └───────────────┘ - +``` +```sql SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null - +``` +```text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ │ Nullable(UInt16) │ diff --git a/docs/ru/query_language/functions/url_functions.md b/docs/ru/query_language/functions/url_functions.md index 7002273d5cb..a6e97b8a013 100644 --- a/docs/ru/query_language/functions/url_functions.md +++ b/docs/ru/query_language/functions/url_functions.md @@ -13,7 +13,7 @@ Извлекает имя хоста из URL. -``` +```sql domain(url) ``` @@ -23,7 +23,7 @@ domain(url) URL может быть указан со схемой или без неё. Примеры: -``` +```text svn+ssh://some.svn-hosting.com:80/repo/trunk some.svn-hosting.com:80/repo/trunk https://yandex.com/time/ @@ -31,7 +31,7 @@ https://yandex.com/time/ Для указанных примеров функция `domain` возвращает следующие результаты: -``` +```text some.svn-hosting.com some.svn-hosting.com yandex.com @@ -64,7 +64,7 @@ SELECT domain('svn+ssh://some.svn-hosting.com:80/repo/trunk') Извлекает домен верхнего уровня из URL. 
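Например, для одного из URL, перечисленных в примерах ниже, будет возвращено `'com'` — набросок, ожидаемый результат указан в комментарии:

```sql
SELECT topLevelDomain('https://yandex.com/time/') -- ожидается 'com'
```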
-``` +```sql topLevelDomain(url) ``` @@ -74,7 +74,7 @@ topLevelDomain(url) URL может быть указан со схемой или без неё. Примеры: -``` +```text svn+ssh://some.svn-hosting.com:80/repo/trunk some.svn-hosting.com:80/repo/trunk https://yandex.com/time/ @@ -138,7 +138,7 @@ SELECT topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk') То же самое, но без протокола и хоста в результате. Элемент / (корень) не включается. Пример: Функция используется для реализации древовидных отчётов по URL в Яндекс.Метрике. -``` +```text URLPathHierarchy('https://example.com/browse/CONV-6788') = [ '/browse/', @@ -150,11 +150,11 @@ URLPathHierarchy('https://example.com/browse/CONV-6788') = Возвращает декодированный URL. Пример: -``` sql +```sql SELECT decodeURLComponent('http://127.0.0.1:8123/?query=SELECT%201%3B') AS DecodedURL; ``` -``` +```text ┌─DecodedURL─────────────────────────────┐ │ http://127.0.0.1:8123/?query=SELECT 1; │ └────────────────────────────────────────┘ diff --git a/docs/ru/query_language/functions/uuid_functions.md b/docs/ru/query_language/functions/uuid_functions.md index 93020824091..d933130d30c 100644 --- a/docs/ru/query_language/functions/uuid_functions.md +++ b/docs/ru/query_language/functions/uuid_functions.md @@ -16,15 +16,15 @@ generateUUIDv4() Этот пример демонстрирует, как создать таблицу с UUID-колонкой и добавить в нее сгенерированный UUID. -``` sql -:) CREATE TABLE t_uuid (x UUID) ENGINE=TinyLog +```sql +CREATE TABLE t_uuid (x UUID) ENGINE=TinyLog -:) INSERT INTO t_uuid SELECT generateUUIDv4() +INSERT INTO t_uuid SELECT generateUUIDv4() -:) SELECT * FROM t_uuid +SELECT * FROM t_uuid ``` -``` +```text ┌────────────────────────────────────x─┐ │ f4bf890f-f9dc-4332-ad5c-0c18e73f28e9 │ └──────────────────────────────────────┘ @@ -44,11 +44,11 @@ toUUID(String) **Пример использования** -``` sql -:) SELECT toUUID('61f0c404-5cb3-11e7-907b-a6006ad3dba0') AS uuid +```sql +SELECT toUUID('61f0c404-5cb3-11e7-907b-a6006ad3dba0') AS uuid ``` -``` +```text ┌─────────────────────────────────uuid─┐ │ 61f0c404-5cb3-11e7-907b-a6006ad3dba0 │ └──────────────────────────────────────┘ @@ -58,7 +58,7 @@ toUUID(String) Принимает строку, содержащую 36 символов в формате `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`, и возвращает в виде набора байт в [FixedString(16)](../../data_types/fixedstring.md). -``` sql +```sql UUIDStringToNum(String) ``` @@ -68,13 +68,13 @@ FixedString(16) **Пример использования** -``` sql -:) SELECT +```sql +SELECT '612f3c40-5d3b-217e-707b-6a546a3d7b29' AS uuid, UUIDStringToNum(uuid) AS bytes ``` -``` +```text ┌─uuid─────────────────────────────────┬─bytes────────────┐ │ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ a/<@];!~p{jTj={) │ └──────────────────────────────────────┴──────────────────┘ @@ -84,7 +84,7 @@ FixedString(16) Принимает значение типа [FixedString(16)](../../data_types/fixedstring.md). Возвращает строку из 36 символов в текстовом виде. 
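Функция обратна `UUIDStringToNum`: применение их друг за другом возвращает исходную строку. Набросок (UUID взят из примера выше):

```sql
SELECT UUIDNumToString(UUIDStringToNum('612f3c40-5d3b-217e-707b-6a546a3d7b29')) AS uuid
```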
-``` sql +```sql UUIDNumToString(FixedString(16)) ``` @@ -94,13 +94,13 @@ UUIDNumToString(FixedString(16)) **Пример использования** -``` sql +```sql SELECT 'a/<@];!~p{jTj={)' AS bytes, UUIDNumToString(toFixedString(bytes, 16)) AS uuid ``` -``` +```text ┌─bytes────────────┬─uuid─────────────────────────────────┐ │ a/<@];!~p{jTj={) │ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ └──────────────────┴──────────────────────────────────────┘ diff --git a/docs/ru/query_language/functions/ym_dict_functions.md b/docs/ru/query_language/functions/ym_dict_functions.md index c086d7404c8..2bf511438b1 100644 --- a/docs/ru/query_language/functions/ym_dict_functions.md +++ b/docs/ru/query_language/functions/ym_dict_functions.md @@ -21,7 +21,7 @@ ClickHouse поддерживает работу одновременно с н Во все функции по работе с регионами, в конце добавлен один необязательный аргумент - ключ словаря. Далее он обозначен как geobase. Пример: -``` +```text regionToCountry(RegionID) - использует словарь по умолчанию: /opt/geo/regions_hierarchy.txt; regionToCountry(RegionID, '') - использует словарь по умолчанию: /opt/geo/regions_hierarchy.txt; regionToCountry(RegionID, 'ua') - использует словарь для ключа ua: /opt/geo/regions_hierarchy_ua.txt; @@ -33,13 +33,13 @@ regionToCountry(RegionID, 'ua') - использует словарь для к ### regionToArea(id\[, geobase\]) Переводит регион в область (тип в геобазе - 5). В остальном, аналогично функции regionToCity. -``` sql +```sql SELECT DISTINCT regionToName(regionToArea(toUInt32(number), 'ua')) FROM system.numbers LIMIT 15 ``` -``` +```text ┌─regionToName(regionToArea(toUInt32(number), \'ua\'))─┐ │ │ │ Москва и Московская область │ @@ -62,13 +62,13 @@ LIMIT 15 ### regionToDistrict(id\[, geobase\]) Переводит регион в федеральный округ (тип в геобазе - 4). В остальном, аналогично функции regionToCity. -``` sql +```sql SELECT DISTINCT regionToName(regionToDistrict(toUInt32(number), 'ua')) FROM system.numbers LIMIT 15 ``` -``` +```text ┌─regionToName(regionToDistrict(toUInt32(number), \'ua\'))─┐ │ │ │ Центральный федеральный округ │ diff --git a/docs/ru/query_language/insert_into.md b/docs/ru/query_language/insert_into.md index 1f92e6525b8..88c548d394c 100644 --- a/docs/ru/query_language/insert_into.md +++ b/docs/ru/query_language/insert_into.md @@ -5,7 +5,7 @@ Базовый формат запроса: -``` sql +```sql INSERT INTO [db.]table [(c1, c2, c3)] VALUES (v11, v12, v13), (v21, v22, v23), ... ``` @@ -18,13 +18,13 @@ INSERT INTO [db.]table [(c1, c2, c3)] VALUES (v11, v12, v13), (v21, v22, v23), . В INSERT можно передавать данные любого [формата](../interfaces/formats.md#formats), который поддерживает ClickHouse. Для этого формат необходимо указать в запросе в явном виде: -``` sql +```sql INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format_name data_set ``` Например, следующий формат запроса идентичен базовому варианту INSERT ... VALUES: -``` sql +```sql INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ... ``` @@ -32,7 +32,7 @@ ClickHouse отсекает все пробелы и один перенос с Пример: -``` sql +```sql INSERT INTO t FORMAT TabSeparated 11 Hello, world! 22 Qwerty @@ -46,7 +46,7 @@ INSERT INTO t FORMAT TabSeparated ### Вставка результатов `SELECT` {#insert_query_insert-select} -``` sql +```sql INSERT INTO [db.]table [(c1, c2, c3)] SELECT ... 
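-- Набросок (имена таблиц и столбцов гипотетические): столбцы результата SELECT
-- сопоставляются со столбцами таблицы по порядку следования, например:
-- INSERT INTO target_table (c1, c2) SELECT a, b FROM source_table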
``` diff --git a/docs/ru/query_language/misc.md b/docs/ru/query_language/misc.md index ab19e559649..d169e5715e9 100644 --- a/docs/ru/query_language/misc.md +++ b/docs/ru/query_language/misc.md @@ -257,7 +257,7 @@ SHOW PROCESSLIST [INTO OUTFILE filename] [FORMAT format] Полезный совет (выполните в консоли): ```bash -watch -n1 "clickhouse-client --query='SHOW PROCESSLIST'" +$ watch -n1 "clickhouse-client --query='SHOW PROCESSLIST'" ``` ## SHOW TABLES diff --git a/docs/ru/query_language/operators.md b/docs/ru/query_language/operators.md index de38bb5b193..c39409e356b 100644 --- a/docs/ru/query_language/operators.md +++ b/docs/ru/query_language/operators.md @@ -67,7 +67,7 @@ ## Оператор для работы с датами и временем {#operators-datetime} -``` sql +```sql EXTRACT(part FROM date); ``` @@ -88,7 +88,7 @@ EXTRACT(part FROM date); Примеры: -``` sql +```sql SELECT EXTRACT(DAY FROM toDate('2017-06-15')); SELECT EXTRACT(MONTH FROM toDate('2017-06-15')); SELECT EXTRACT(YEAR FROM toDate('2017-06-15')); @@ -96,7 +96,7 @@ SELECT EXTRACT(YEAR FROM toDate('2017-06-15')); В следующем примере создадим таблицу и добавим в неё значение с типом `DateTime`. -``` sql +```sql CREATE TABLE test.Orders ( OrderId UInt64, @@ -106,10 +106,10 @@ CREATE TABLE test.Orders ENGINE = Log; ``` -``` sql +```sql INSERT INTO test.Orders VALUES (1, 'Jarlsberg Cheese', toDateTime('2008-10-11 13:23:44')); ``` -``` sql +```sql SELECT toYear(OrderDate) AS OrderYear, toMonth(OrderDate) AS OrderMonth, @@ -118,7 +118,9 @@ SELECT toMinute(OrderDate) AS OrderMinute, toSecond(OrderDate) AS OrderSecond FROM test.Orders; +``` +```text ┌─OrderYear─┬─OrderMonth─┬─OrderDay─┬─OrderHour─┬─OrderMinute─┬─OrderSecond─┐ │ 2008 │ 10 │ 11 │ 13 │ 23 │ 44 │ └───────────┴────────────┴──────────┴───────────┴─────────────┴─────────────┘ @@ -148,7 +150,7 @@ FROM test.Orders; ## Условное выражение {#operator_case} -``` sql +```sql CASE [x] WHEN a THEN b [WHEN ... THEN ...] @@ -198,18 +200,13 @@ ClickHouse поддерживает операторы `IS NULL` и `IS NOT NULL - `0` в обратном случае. - Для прочих значений оператор `IS NULL` всегда возвращает `0`. -```bash -:) SELECT x+100 FROM t_null WHERE y IS NULL - -SELECT x + 100 -FROM t_null -WHERE isNull(y) - +```sql +SELECT x+100 FROM t_null WHERE y IS NULL +``` +```text ┌─plus(x, 100)─┐ │ 101 │ └──────────────┘ - -1 rows in set. Elapsed: 0.002 sec. ``` @@ -220,18 +217,13 @@ WHERE isNull(y) - `1`, в обратном случае. - Для прочих значений оператор `IS NOT NULL` всегда возвращает `1`. -```bash -:) SELECT * FROM t_null WHERE y IS NOT NULL - -SELECT * -FROM t_null -WHERE isNotNull(y) - +```sql +SELECT * FROM t_null WHERE y IS NOT NULL +``` +```text ┌─x─┬─y─┐ │ 2 │ 3 │ └───┴───┘ - -1 rows in set. Elapsed: 0.002 sec. ``` [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/operators/) diff --git a/docs/ru/query_language/select.md b/docs/ru/query_language/select.md index 955873025f6..ca7df787350 100644 --- a/docs/ru/query_language/select.md +++ b/docs/ru/query_language/select.md @@ -35,7 +35,7 @@ SELECT [DISTINCT] expr_list В дальнейшем, результаты выражений можно использовать в секции SELECT. 
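Минимальный набросок для иллюстрации (не из исходного текста):

```sql
WITH 2 + 2 AS four
SELECT four, four * four
```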
Пример 1: Использование константного выражения как "переменной" -``` +```sql WITH '2019-08-01 15:23:00' as ts_upper_bound SELECT * FROM hits @@ -45,7 +45,7 @@ WHERE ``` Пример 2: Выкидывание выражения sum(bytes) из списка колонок в SELECT -``` +```sql WITH sum(bytes) as s SELECT formatReadableSize(s), @@ -56,7 +56,7 @@ ORDER BY s ``` Пример 3: Использование результатов скалярного подзапроса -``` +```sql /* запрос покажет TOP 10 самых больших таблиц */ WITH ( @@ -75,7 +75,7 @@ LIMIT 10 Пример 4: Переиспользование выражения В настоящий момент, переиспользование выражения из секции WITH внутри подзапроса возможно только через дублирование. -``` +```sql WITH ['hello'] AS hello SELECT hello, @@ -85,7 +85,8 @@ FROM WITH ['hello'] AS hello SELECT hello ) - +``` +```text ┌─hello─────┬─hello─────┐ │ ['hello'] │ ['hello'] │ └───────────┴───────────┘ @@ -227,7 +228,7 @@ SAMPLE 1/10 OFFSET 1/2 Позволяет выполнить `JOIN` с массивом или вложенной структурой данных. Смысл похож на функцию [arrayJoin](functions/array_join.md#functions_arrayjoin), но функциональность более широкая. -``` sql +```sql SELECT FROM [LEFT] ARRAY JOIN @@ -246,7 +247,7 @@ FROM Рассмотрим примеры использования `ARRAY JOIN` и `LEFT ARRAY JOIN`. Для начала создадим таблицу, содержащую столбец с типом [Array](../data_types/array.md), и добавим в него значение: -``` sql +```sql CREATE TABLE arrays_test ( s String, @@ -256,7 +257,7 @@ CREATE TABLE arrays_test INSERT INTO arrays_test VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []); ``` -``` +```text ┌─s───────────┬─arr─────┐ │ Hello │ [1,2] │ │ World │ [3,4,5] │ @@ -271,7 +272,7 @@ SELECT s, arr FROM arrays_test ARRAY JOIN arr; ``` -``` +```text ┌─s─────┬─arr─┐ │ Hello │ 1 │ │ Hello │ 2 │ @@ -288,7 +289,7 @@ SELECT s, arr FROM arrays_test LEFT ARRAY JOIN arr; ``` -``` +```text ┌─s───────────┬─arr─┐ │ Hello │ 1 │ │ Hello │ 2 │ @@ -303,13 +304,13 @@ LEFT ARRAY JOIN arr; Для массива в секции `ARRAY JOIN` может быть указан алиас. В этом случае, элемент массива будет доступен под этим алиасом, а сам массив — под исходным именем. Пример: -``` sql +```sql SELECT s, arr, a FROM arrays_test ARRAY JOIN arr AS a; ``` -``` +```text ┌─s─────┬─arr─────┬─a─┐ │ Hello │ [1,2] │ 1 │ │ Hello │ [1,2] │ 2 │ @@ -321,13 +322,13 @@ ARRAY JOIN arr AS a; Используя алиасы, можно выполнять `JOIN` с внешними массивами: -``` sql +```sql SELECT s, arr_external FROM arrays_test ARRAY JOIN [1, 2, 3] AS arr_external; ``` -``` +```text ┌─s───────────┬─arr_external─┐ │ Hello │ 1 │ │ Hello │ 2 │ @@ -343,13 +344,13 @@ ARRAY JOIN [1, 2, 3] AS arr_external; В секции `ARRAY JOIN` можно указать через запятую сразу несколько массивов. В этом случае, `JOIN` делается с ними одновременно (прямая сумма, а не прямое произведение). Обратите внимание, массивы должны быть одинаковых размеров. 
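Если размеры не совпадают, запрос завершится ошибкой. При необходимости массивы можно заранее привести к одной длине, например, функцией `arrayResize` — набросок, не из исходного примера (используется таблица `arrays_test`, созданная выше):

```sql
SELECT s, a, b
FROM arrays_test
ARRAY JOIN arr AS a, arrayResize([10, 20], length(arr)) AS b
```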
Примеры: -``` sql +```sql SELECT s, arr, a, num, mapped FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS mapped; ``` -``` +```text ┌─s─────┬─arr─────┬─a─┬─num─┬─mapped─┐ │ Hello │ [1,2] │ 1 │ 1 │ 2 │ │ Hello │ [1,2] │ 2 │ 2 │ 3 │ @@ -361,13 +362,13 @@ ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS ma В примере ниже используется функция [arrayEnumerate](functions/array_functions.md#array_functions-arrayenumerate): -``` sql +```sql SELECT s, arr, a, num, arrayEnumerate(arr) FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num; ``` -``` +```text ┌─s─────┬─arr─────┬─a─┬─num─┬─arrayEnumerate(arr)─┐ │ Hello │ [1,2] │ 1 │ 1 │ [1,2] │ │ Hello │ [1,2] │ 2 │ 2 │ [1,2] │ @@ -381,7 +382,7 @@ ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num; `ARRAY JOIN` также работает с [вложенными структурами данных](../data_types/nested_data_structures/nested.md). Пример: -``` sql +```sql CREATE TABLE nested_test ( s String, @@ -394,7 +395,7 @@ INSERT INTO nested_test VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [], []); ``` -``` +```text ┌─s───────┬─nest.x──┬─nest.y─────┐ │ Hello │ [1,2] │ [10,20] │ │ World │ [3,4,5] │ [30,40,50] │ @@ -402,13 +403,13 @@ VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [] └─────────┴─────────┴────────────┘ ``` -``` sql +```sql SELECT s, `nest.x`, `nest.y` FROM nested_test ARRAY JOIN nest; ``` -``` +```text ┌─s─────┬─nest.x─┬─nest.y─┐ │ Hello │ 1 │ 10 │ │ Hello │ 2 │ 20 │ @@ -420,13 +421,13 @@ ARRAY JOIN nest; При указании имени вложенной структуры данных в `ARRAY JOIN`, смысл такой же, как `ARRAY JOIN` со всеми элементами-массивами, из которых она состоит. Пример: -``` sql +```sql SELECT s, `nest.x`, `nest.y` FROM nested_test ARRAY JOIN `nest.x`, `nest.y`; ``` -``` +```text ┌─s─────┬─nest.x─┬─nest.y─┐ │ Hello │ 1 │ 10 │ │ Hello │ 2 │ 20 │ @@ -438,13 +439,13 @@ ARRAY JOIN `nest.x`, `nest.y`; Такой вариант тоже имеет смысл: -``` sql +```sql SELECT s, `nest.x`, `nest.y` FROM nested_test ARRAY JOIN `nest.x`; ``` -``` +```text ┌─s─────┬─nest.x─┬─nest.y─────┐ │ Hello │ 1 │ [10,20] │ │ Hello │ 2 │ [10,20] │ @@ -456,13 +457,13 @@ ARRAY JOIN `nest.x`; Алиас для вложенной структуры данных можно использовать, чтобы выбрать как результат `JOIN`-а, так и исходный массив. Пример: -``` sql +```sql SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y` FROM nested_test ARRAY JOIN nest AS n; ``` -``` +```text ┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┐ │ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ │ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ @@ -474,13 +475,13 @@ ARRAY JOIN nest AS n; Пример использования функции [arrayEnumerate](functions/array_functions.md#array_functions-arrayenumerate): -``` sql +```sql SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`, num FROM nested_test ARRAY JOIN nest AS n, arrayEnumerate(`nest.x`) AS num; ``` -``` +```text ┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┬─num─┐ │ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ 1 │ │ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ 2 │ @@ -524,13 +525,13 @@ FROM Для создания запросов мы рекомендуем использоват синтаксис `JOIN ON` или `JOIN USING`. Например: -``` +```sql SELECT * FROM t1 JOIN t2 ON t1.a = t2.a JOIN t3 ON t1.a = t3.a ``` В секции `FROM` вы можете использовать разделенные запятыми списки таблиц для объединения. Этот синтаксис работает только при включённой настройке [allow_experimental_cross_to_join_conversion = 1](../operations/settings/settings.md#settings-allow_experimental_cross_to_join_conversion). 
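Включить её можно, например, на уровне сессии (набросок):

```sql
SET allow_experimental_cross_to_join_conversion = 1
```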
Например: -``` +```sql SELECT * FROM t1, t2, t3 WHERE t1.a = t2.a AND t1.a = t3.a ``` @@ -552,7 +553,7 @@ ClickHouse не поддерживает синтаксис с запятыми Синтаксис `ASOF JOIN`: -``` +```sql SELECT expression_list FROM table_1 ASOF JOIN table_2 USING(equi_column1, ... equi_columnN, asof_column) ``` @@ -560,7 +561,7 @@ SELECT expression_list FROM table_1 ASOF JOIN table_2 USING(equi_column1, ... eq Например, рассмотрим следующие таблицы: -``` +```text table_1 table_2 event | ev_time | user_id event | ev_time | user_id ----------|---------|---------- ----------|---------|---------- @@ -620,7 +621,7 @@ ORDER BY hits DESC LIMIT 10 ``` -``` +```text ┌─CounterID─┬───hits─┬─visits─┐ │ 1143050 │ 523264 │ 13665 │ │ 731962 │ 475698 │ 102716 │ @@ -687,18 +688,14 @@ ClickHouse использует в выражении индексы, если Пример проверки на `NULL`: -```bash -:) SELECT * FROM t_null WHERE y IS NULL - -SELECT * -FROM t_null -WHERE isNull(y) +```sql +SELECT * FROM t_null WHERE y IS NULL +``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ └───┴──────┘ - -1 rows in set. Elapsed: 0.002 sec. ``` @@ -728,7 +725,7 @@ WHERE isNull(y) Пример: -``` sql +```sql SELECT count(), median(FetchTiming > 60 ? 60 : FetchTiming), @@ -742,7 +739,7 @@ FROM hits Пример: -``` sql +```sql SELECT domainWithoutWWW(URL) AS domain, count(), @@ -765,7 +762,7 @@ GROUP BY вычисляет для каждого встретившегося Пусть есть таблица: -``` +```text ┌─x─┬────y─┐ │ 1 │ 2 │ │ 2 │ ᴺᵁᴸᴸ │ @@ -777,7 +774,7 @@ GROUP BY вычисляет для каждого встретившегося В результате запроса `SELECT sum(x), y FROM t_null_big GROUP BY y` мы получим: -``` +```text ┌─sum(x)─┬────y─┐ │ 4 │ 2 │ │ 3 │ 3 │ @@ -929,7 +926,7 @@ WHERE и HAVING отличаются тем, что WHERE выполняется Для таблицы -``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 2 │ @@ -946,7 +943,7 @@ WHERE и HAVING отличаются тем, что WHERE выполняется Выполним запрос `SELECT * FROM t_null_nan ORDER BY y NULLS FIRST`, получим: -``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 7 │ ᴺᵁᴸᴸ │ @@ -1040,7 +1037,7 @@ ClickHouse поддерживает использование в одном з Произвольное количество запросов может быть объединено с помощью `UNION ALL`. Пример: -``` sql +```sql SELECT CounterID, 1 AS table, toInt64(count()) AS c FROM test.hits GROUP BY CounterID @@ -1087,7 +1084,7 @@ SELECT CounterID, 2 AS table, sum(Sign) AS c Примеры: -``` sql +```sql SELECT UserID IN (123, 456) FROM ... SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... ``` @@ -1106,7 +1103,7 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... В подзапросе может быть указано более одного столбца для фильтрации кортежей. Пример: -``` sql +```sql SELECT (CounterID, UserID) IN (SELECT CounterID, UserID FROM ...) FROM ... ``` @@ -1115,7 +1112,7 @@ SELECT (CounterID, UserID) IN (SELECT CounterID, UserID FROM ...) FROM ... Оператор IN и подзапрос могут встречаться в любой части запроса, в том числе в агрегатных и лямбда функциях. Пример: -``` sql +```sql SELECT EventDate, avg(UserID IN @@ -1129,7 +1126,7 @@ GROUP BY EventDate ORDER BY EventDate ASC ``` -``` +```text ┌──EventDate─┬────ratio─┐ │ 2014-03-17 │ 1 │ │ 2014-03-18 │ 0.807696 │ @@ -1151,7 +1148,7 @@ ORDER BY EventDate ASC Рассмотрим для примера таблицу `t_null`: -``` +```text ┌─x─┬────y─┐ │ 1 │ ᴺᵁᴸᴸ │ │ 2 │ 3 │ @@ -1160,7 +1157,7 @@ ORDER BY EventDate ASC При выполнении запроса `SELECT x FROM t_null WHERE y IN (NULL,3)` получим следующий результат: -``` +```text ┌─x─┐ │ 2 │ └───┘ @@ -1168,10 +1165,11 @@ ORDER BY EventDate ASC Видно, что строка, в которой `y = NULL`, выброшена из результатов запроса. 
Это произошло потому, что ClickHouse не может решить входит ли `NULL` в множество `(NULL,3)`, возвращает результат операции `0`, а `SELECT` выбрасывает эту строку из финальной выдачи. -``` +```sql SELECT y IN (NULL, 3) FROM t_null - +``` +```text ┌─in(y, tuple(NULL, 3))─┐ │ 0 │ │ 1 │ @@ -1200,13 +1198,13 @@ FROM t_null Например, запрос -``` sql +```sql SELECT uniq(UserID) FROM distributed_table ``` будет отправлен на все удалённые серверы в виде -``` sql +```sql SELECT uniq(UserID) FROM local_table ``` @@ -1214,7 +1212,7 @@ SELECT uniq(UserID) FROM local_table Теперь рассмотрим запрос с IN-ом: -``` sql +```sql SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34) ``` @@ -1222,7 +1220,7 @@ SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID I Этот запрос будет отправлен на все удалённые серверы в виде -``` sql +```sql SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34) ``` @@ -1232,19 +1230,19 @@ SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SEL Чтобы исправить работу запроса, когда данные размазаны по серверам кластера произвольным образом, можно было бы указать **distributed_table** внутри подзапроса. Запрос будет выглядеть так: -``` sql +```sql SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) ``` Этот запрос будет отправлен на все удалённые серверы в виде -``` sql +```sql SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) ``` На каждом удалённом сервере начнёт выполняться подзапрос. Так как в подзапросе используется распределённая таблица, то подзапрос будет, на каждом удалённом сервере, снова отправлен на каждый удалённый сервер, в виде -``` sql +```sql SELECT UserID FROM local_table WHERE CounterID = 34 ``` @@ -1252,19 +1250,19 @@ SELECT UserID FROM local_table WHERE CounterID = 34 В таких случаях всегда следует использовать GLOBAL IN вместо IN. Рассмотрим его работу для запроса -``` sql +```sql SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID GLOBAL IN (SELECT UserID FROM distributed_table WHERE CounterID = 34) ``` На сервере-инициаторе запроса будет выполнен подзапрос -``` sql +```sql SELECT UserID FROM distributed_table WHERE CounterID = 34 ``` , и результат будет сложен во временную таблицу в оперативке. Затем запрос будет отправлен на каждый удалённый сервер в виде -``` sql +```sql SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID GLOBAL IN _data1 ``` diff --git a/docs/ru/query_language/syntax.md b/docs/ru/query_language/syntax.md index f48c8d236e4..703394789b3 100644 --- a/docs/ru/query_language/syntax.md +++ b/docs/ru/query_language/syntax.md @@ -108,7 +108,7 @@ INSERT INTO t VALUES (1, 'Hello, world'), (2, 'abc'), (3, 'def') Синоним — это пользовательское имя выражения в запросе. -``` +```sql expr AS alias ``` @@ -136,7 +136,7 @@ expr AS alias Будьте осторожны с синонимами, совпадающими с именами столбцов или таблиц. Рассмотрим следующий пример: -``` +```sql CREATE TABLE t ( a Int, @@ -145,12 +145,13 @@ CREATE TABLE t ENGINE = TinyLog() ``` -``` +```sql SELECT argMax(a, b), sum(b) AS b FROM t - +``` +```text Received exception from server (version 18.14.17): Code: 184. DB::Exception: Received from localhost:9000, 127.0.0.1. 
DB::Exception: Aggregate function sum(b) is found inside another aggregate function in query. ``` diff --git a/docs/ru/query_language/system.md b/docs/ru/query_language/system.md index 2abdc5d34de..998cf7fc682 100644 --- a/docs/ru/query_language/system.md +++ b/docs/ru/query_language/system.md @@ -62,7 +62,7 @@ ClickHouse может оперировать [распределёнными](.. Отключает фоновую отправку при вставке данных в распределённые таблицы. -``` +```sql SYSTEM STOP DISTRIBUTED SENDS [db.] ``` @@ -70,7 +70,7 @@ SYSTEM STOP DISTRIBUTED SENDS [db.] В синхронном режиме отправляет все данные на узлы кластера. Если какие-либо узлы недоступны, ClickHouse генерирует исключение и останавливает выполнение запроса. Такой запрос можно повторять до успешного завершения, что будет означать возвращение связанности с остальными узлами кластера. -``` +```sql SYSTEM FLUSH DISTRIBUTED [db.] ``` @@ -78,7 +78,7 @@ SYSTEM FLUSH DISTRIBUTED [db.] Включает фоновую отправку при вставке данных в распределенные таблицы. -``` +```sql SYSTEM START DISTRIBUTED SENDS [db.] ``` diff --git a/docs/ru/query_language/table_functions/file.md b/docs/ru/query_language/table_functions/file.md index 9fc82b151b8..bec1dff44b7 100644 --- a/docs/ru/query_language/table_functions/file.md +++ b/docs/ru/query_language/table_functions/file.md @@ -3,7 +3,7 @@ Создаёт таблицу из файла. -``` +```sql file(path, format, structure) ``` @@ -33,12 +33,12 @@ $ cat /var/lib/clickhouse/user_files/test.csv Таблица из `test.csv` и выборка первых двух строк из неё: -``` sql +```sql SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 2 ``` -``` +```text ┌─column1─┬─column2─┬─column3─┐ │ 1 │ 2 │ 3 │ │ 3 │ 2 │ 1 │ diff --git a/docs/ru/query_language/table_functions/hdfs.md b/docs/ru/query_language/table_functions/hdfs.md index ae881edea35..d79873c8842 100644 --- a/docs/ru/query_language/table_functions/hdfs.md +++ b/docs/ru/query_language/table_functions/hdfs.md @@ -3,7 +3,7 @@ Создаёт таблицу из файла в HDFS. -``` +```sql hdfs(URI, format, structure) ``` @@ -21,12 +21,12 @@ hdfs(URI, format, structure) Таблица из `hdfs://hdfs1:9000/test` и выборка первых двух строк из неё: -``` sql +```sql SELECT * FROM hdfs('hdfs://hdfs1:9000/test', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 2 ``` -``` +```text ┌─column1─┬─column2─┬─column3─┐ │ 1 │ 2 │ 3 │ │ 3 │ 2 │ 1 │ diff --git a/docs/ru/query_language/table_functions/input.md b/docs/ru/query_language/table_functions/input.md index 35889dd653c..796ca38e0bc 100644 --- a/docs/ru/query_language/table_functions/input.md +++ b/docs/ru/query_language/table_functions/input.md @@ -23,13 +23,13 @@ а в файле `data.csv` данные имеют другую структуру `(col1 String, col2 Date, col3 Int32)`. 
Запрос для вставки данных из файла `data.csv` в таблицу `test` с одновременным преобразованием и использованием функций выглядит так: ```bash -cat data.csv | clickhouse-client --query="INSERT INTO test SELECT lower(col1), col3 * col3 FROM input('col1 String, col2 Date, col3 Int32') FORMAT CSV"; +$ cat data.csv | clickhouse-client --query="INSERT INTO test SELECT lower(col1), col3 * col3 FROM input('col1 String, col2 Date, col3 Int32') FORMAT CSV"; ``` - Если в `data.csv` лежат данные той же структуры `test_structure`, что и у таблицы `test`, то следующие два запроса эквивалентны: ```bash -cat data.csv | clickhouse-client --query="INSERT INTO test FORMAT CSV" -cat data.csv | clickhouse-client --query="INSERT INTO test SELECT * FROM input('test_structure') FORMAT CSV" +$ cat data.csv | clickhouse-client --query="INSERT INTO test FORMAT CSV" +$ cat data.csv | clickhouse-client --query="INSERT INTO test SELECT * FROM input('test_structure') FORMAT CSV" ``` [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/table_functions/input/) diff --git a/docs/ru/query_language/table_functions/jdbc.md b/docs/ru/query_language/table_functions/jdbc.md index 4f6273489da..6b18edd13df 100644 --- a/docs/ru/query_language/table_functions/jdbc.md +++ b/docs/ru/query_language/table_functions/jdbc.md @@ -9,15 +9,15 @@ **Пример** -``` sql +```sql SELECT * FROM jdbc('jdbc:mysql://localhost:3306/?user=root&password=root', 'schema', 'table') ``` -``` sql +```sql SELECT * FROM jdbc('mysql://localhost:3306/?user=root&password=root', 'schema', 'table') ``` -``` sql +```sql SELECT * FROM jdbc('datasource://mysql-local', 'schema', 'table') ``` diff --git a/docs/ru/query_language/table_functions/mysql.md b/docs/ru/query_language/table_functions/mysql.md index eb321ece738..d698f5e69a2 100644 --- a/docs/ru/query_language/table_functions/mysql.md +++ b/docs/ru/query_language/table_functions/mysql.md @@ -2,7 +2,7 @@ Позволяет выполнять запросы `SELECT` над данными, хранящимися на удалённом MySQL сервере. -``` +```sql mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']); ``` @@ -32,7 +32,7 @@ mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_ Таблица в MySQL: -``` +```text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, diff --git a/docs/ru/query_language/table_functions/numbers.md b/docs/ru/query_language/table_functions/numbers.md index a5e4ba58ed9..7406773c8c2 100644 --- a/docs/ru/query_language/table_functions/numbers.md +++ b/docs/ru/query_language/table_functions/numbers.md @@ -7,13 +7,13 @@ Следующие запросы эквивалентны: -``` sql +```sql SELECT * FROM numbers(10); SELECT * FROM numbers(0,10); SELECT * FROM system.numbers LIMIT 10; ``` Примеры: -``` sql +```sql -- генерация последовательности всех дат от 2010-01-01 до 2010-12-31 select toDate('2010-01-01') + number as d FROM numbers(365); ``` diff --git a/docs/ru/query_language/table_functions/odbc.md b/docs/ru/query_language/table_functions/odbc.md index 9be84e4fc98..a05e50b75a5 100644 --- a/docs/ru/query_language/table_functions/odbc.md +++ b/docs/ru/query_language/table_functions/odbc.md @@ -2,7 +2,7 @@ Возвращает таблицу, подключенную через [ODBC](https://en.wikipedia.org/wiki/Open_Database_Connectivity). 
-``` +```sql odbc(connection_settings, external_database, external_table) ``` @@ -26,15 +26,17 @@ odbc(connection_settings, external_database, external_table) По умолчанию (если установлен из пакетов) ClickHouse запускается от имени пользователя `clickhouse`. Таким образом, вам нужно создать и настроить этого пользователя на сервере MySQL. +```bash +$ sudo mysql ``` -sudo mysql +```sql mysql> CREATE USER 'clickhouse'@'localhost' IDENTIFIED BY 'clickhouse'; mysql> GRANT ALL PRIVILEGES ON *.* TO 'clickhouse'@'clickhouse' WITH GRANT OPTION; ``` Теперь настроим соединение в `/etc/odbc.ini`. -``` +```bash $ cat /etc/odbc.ini [mysqlconn] DRIVER = /usr/local/lib/libmyodbc5w.so @@ -47,8 +49,8 @@ PASSWORD = clickhouse Вы можете проверить соединение с помощью утилиты `isql` из установки unixODBC. -``` -isql -v mysqlconn +```bash +$ isql -v mysqlconn +---------------------------------------+ | Connected! | | | @@ -57,7 +59,7 @@ isql -v mysqlconn Таблица в MySQL: -``` +```text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, -> `int_nullable` INT NULL DEFAULT NULL, diff --git a/docs/ru/query_language/table_functions/remote.md b/docs/ru/query_language/table_functions/remote.md index 02aa48c7d63..a19b6ce5cd5 100644 --- a/docs/ru/query_language/table_functions/remote.md +++ b/docs/ru/query_language/table_functions/remote.md @@ -5,7 +5,7 @@ Сигнатуры: -``` sql +```sql remote('addresses_expr', db, table[, 'user'[, 'password']]) remote('addresses_expr', db.table[, 'user'[, 'password']]) ``` @@ -17,7 +17,7 @@ remote('addresses_expr', db.table[, 'user'[, 'password']]) Примеры: -``` +```text example01-01-1 example01-01-1:9000 localhost @@ -30,19 +30,19 @@ localhost Пример: -``` +```text example01-01-1,example01-02-1 ``` Часть выражения может быть указана в фигурных скобках. Предыдущий пример может быть записан следующим образом: -``` +```text example01-0{1,2}-1 ``` В фигурных скобках может быть указан диапазон (неотрицательных целых) чисел через две точки. В этом случае, диапазон раскрывается в множество значений, генерирующих адреса шардов. Если запись первого числа начинается с нуля, то значения формируются с таким же выравниванием нулями. 
Предыдущий пример может быть записан следующим образом: -``` +```text example01-{01..02}-1 ``` @@ -52,7 +52,7 @@ example01-{01..02}-1 Пример: -``` +```text example01-{01..02}-{1|2} ``` diff --git a/docs/ru/query_language/table_functions/url.md b/docs/ru/query_language/table_functions/url.md index 65a66fce6bf..79951209b3f 100644 --- a/docs/ru/query_language/table_functions/url.md +++ b/docs/ru/query_language/table_functions/url.md @@ -13,7 +13,7 @@ structure - структура таблицы в форме `'UserID UInt64, Nam **Пример** -``` sql +```sql -- получение 3-х строк таблицы, состоящей из двух колонк типа String и UInt32 от сервера, отдающего данные в формате CSV SELECT * FROM url('http://127.0.0.1:12345/', CSV, 'column1 String, column2 UInt32') LIMIT 3 ``` From bd29efdbbe22e9672799908c38a0b1b8f4d630e3 Mon Sep 17 00:00:00 2001 From: millb Date: Mon, 23 Sep 2019 18:47:34 +0300 Subject: [PATCH 201/309] New hex function release Bugs and tests fixed --- dbms/src/Functions/FunctionsCoding.h | 25 +++++++------------ .../0_stateless/01013_hex_float.reference | 7 +++++- .../queries/0_stateless/01013_hex_float.sql | 5 +++- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/dbms/src/Functions/FunctionsCoding.h b/dbms/src/Functions/FunctionsCoding.h index adaaa875a71..1ab00d725f6 100644 --- a/dbms/src/Functions/FunctionsCoding.h +++ b/dbms/src/Functions/FunctionsCoding.h @@ -946,9 +946,10 @@ public: { WhichDataType which(arguments[0]); - if (!which.isStringOrFixedString() - && !which.isDateOrDateTime() - && !which.isUInt() && !which.isFloat()) + if (!which.isStringOrFixedString() && + !which.isDateOrDateTime() && + !which.isUInt() && + !which.isFloat()) throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); @@ -1026,7 +1027,7 @@ public: { const ColumnVector * col_vec = checkAndGetColumn>(col); - static constexpr size_t MAX_FLOAT_HEX_LENGTH = sizeof(T) * 2 + 1; /// Including trailing zero byte. + static constexpr size_t FLOAT_HEX_LENGTH = sizeof(T) * 2 + 1; /// Including trailing zero byte. if (col_vec) { @@ -1038,27 +1039,19 @@ public: size_t size = in_vec.size(); out_offsets.resize(size); - out_vec.resize(size * 3 + MAX_FLOAT_HEX_LENGTH); /// 3 is length of one byte in hex plus zero byte. + out_vec.resize(size * FLOAT_HEX_LENGTH); size_t pos = 0; + char * out = reinterpret_cast(&out_vec[0]); for (size_t i = 0; i < size; ++i) { - /// Manual exponential growth, so as not to rely on the linear amortized work time of `resize` (no one guarantees it). 
- if (pos + MAX_FLOAT_HEX_LENGTH > out_vec.size()) - out_vec.resize(out_vec.size() * 2 + MAX_FLOAT_HEX_LENGTH); - - char * begin = reinterpret_cast(&out_vec[pos]); - char * end = begin; - const UInt8 * in_pos = reinterpret_cast(&in_vec[i]); - executeOneString(in_pos, in_pos + sizeof(in_vec[i]), end); + executeOneString(in_pos, in_pos + sizeof(T), out); - pos += end - begin; + pos += FLOAT_HEX_LENGTH; out_offsets[i] = pos; } - out_vec.resize(pos); - col_res = std::move(col_str); return true; } diff --git a/dbms/tests/queries/0_stateless/01013_hex_float.reference b/dbms/tests/queries/0_stateless/01013_hex_float.reference index ac428aa6bea..c3e4ec26847 100644 --- a/dbms/tests/queries/0_stateless/01013_hex_float.reference +++ b/dbms/tests/queries/0_stateless/01013_hex_float.reference @@ -4,4 +4,9 @@ 2342920CA19CC73B 7DC39425AD49B254 2C616D8C9DF0423F -BA490C022BFF5EC0 +3BDF4F8D97FE5EC0 +0A57C742 +00004843 +00004943 +0000000000406940 +0000000000606940 diff --git a/dbms/tests/queries/0_stateless/01013_hex_float.sql b/dbms/tests/queries/0_stateless/01013_hex_float.sql index e6da504657f..30869529d87 100644 --- a/dbms/tests/queries/0_stateless/01013_hex_float.sql +++ b/dbms/tests/queries/0_stateless/01013_hex_float.sql @@ -4,4 +4,7 @@ SELECT hex(1e+18); SELECT hex(1e-20); SELECT hex(1e+100); SELECT hex(0.000578); -SELECt hex(-123.987); +SELECT hex(-123.978); +SELECT hex(toFloat32(99.67)); +SELECT hex(toFloat32(number)) FROM numbers(200, 2); +SELECT hex(toFloat64(number)) FROM numbers(202, 2); From 8c5f8e5fbeef471ee921696af8198f5b9900b97d Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 23 Sep 2019 18:57:18 +0300 Subject: [PATCH 202/309] Unpack to another directory --- docker/test/split_build_smoke_test/run.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docker/test/split_build_smoke_test/run.sh b/docker/test/split_build_smoke_test/run.sh index 436cc7ff1c1..2772ead5dfe 100755 --- a/docker/test/split_build_smoke_test/run.sh +++ b/docker/test/split_build_smoke_test/run.sh @@ -3,13 +3,14 @@ set -x install_and_run_server() { - tar -xzf package_folder/shared_build.tgz -C package_folder --strip 1 - LD_LIBRARY_PATH=/package_folder /package_folder/clickhouse-server --config /package_folder/config/config.xml >/var/log/clickhouse-server/stderr.log 2>&1 & + mkdir /unpacked + tar -xzf /package_folder/shared_build.tgz -C /unpacked --strip 1 + LD_LIBRARY_PATH=/unpacked /unpacked/clickhouse-server --config /unpacked/config/config.xml >/var/log/clickhouse-server/stderr.log 2>&1 & sleep 5 } run_client() { - LD_LIBRARY_PATH=/package_folder /package_folder/clickhouse-client --query \"select 'OK'\" 2>/var/log/clickhouse-server/clientstderr.log || echo 'FAIL' + LD_LIBRARY_PATH=/unpacked /unpacked/clickhouse-client --query \"select 'OK'\" 2>/var/log/clickhouse-server/clientstderr.log || echo 'FAIL' } install_and_run_server From 8579c26efb474e273972b1a49e6557ae80c9ab9b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 23 Sep 2019 19:18:19 +0300 Subject: [PATCH 203/309] Changed repository URL --- .../bug-report-or-unexpected-behaviour.md | 2 +- .github/ISSUE_TEMPLATE/performance-issue.md | 2 +- CHANGELOG.md | 2766 ++++++++--------- README.md | 2 +- SECURITY.md | 2 +- dbms/src/Common/SensitiveDataMasker.h | 2 +- .../PredicateExpressionsOptimizer.h | 2 +- dbms/src/Interpreters/RowRefs.h | 2 +- .../ReplicatedMergeTreePartCheckThread.cpp | 2 +- .../instructions/developer_instruction_ru.md | 6 +- .../instructions/easy_tasks_sorted_ru.md | 4 +- .../test_block_structure_mismatch/test.py | 2 
+- dbms/tests/performance/if_array_string.xml | 2 +- .../0_stateless/00504_insert_miss_columns.sh | 2 +- .../0_stateless/00506_union_distributed.sql | 2 +- docker/client/README.md | 2 +- docker/server/README.md | 2 +- docker/test/README.md | 2 +- .../example_datasets/metrica.md | 2 +- docs/en/interfaces/tcp.md | 2 +- docs/en/operations/backup.md | 2 +- .../operations/settings/query_complexity.md | 4 +- docs/en/operations/table_engines/mergetree.md | 2 +- docs/en/query_language/alter.md | 2 +- docs/en/query_language/create.md | 2 +- docs/en/query_language/operators.md | 2 +- docs/fa/interfaces/tcp.md | 2 +- .../example_datasets/metrica.md | 2 +- docs/ru/interfaces/tcp.md | 2 +- docs/ru/operations/backup.md | 2 +- .../operations/settings/query_complexity.md | 4 +- docs/ru/operations/table_engines/mergetree.md | 2 +- docs/ru/query_language/alter.md | 2 +- docs/ru/query_language/create.md | 2 +- docs/ru/query_language/operators.md | 2 +- docs/tools/github.py | 2 +- docs/zh/interfaces/tcp.md | 2 +- .../operations/settings/query_complexity.md | 4 +- docs/zh/operations/table_engines/mergetree.md | 2 +- docs/zh/query_language/create.md | 2 +- utils/ci/default-config | 2 +- utils/report/clickhouse-report | 4 +- website/README.md | 2 +- website/deprecated/reference_en.html | 4 +- website/deprecated/reference_ru.html | 4 +- website/index.html | 10 +- website/tutorial.html | 2 +- 47 files changed, 1442 insertions(+), 1442 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug-report-or-unexpected-behaviour.md b/.github/ISSUE_TEMPLATE/bug-report-or-unexpected-behaviour.md index 9526b99b22b..542442e2856 100644 --- a/.github/ISSUE_TEMPLATE/bug-report-or-unexpected-behaviour.md +++ b/.github/ISSUE_TEMPLATE/bug-report-or-unexpected-behaviour.md @@ -17,7 +17,7 @@ A clear and concise description of what works not as it is supposed to. * Which interface to use, if matters * Non-default settings, if any * `CREATE TABLE` statements for all tables involved -* Sample data for all these tables, use [clickhouse-obfuscator](https://github.com/yandex/ClickHouse/blob/master/dbms/programs/obfuscator/Obfuscator.cpp#L42-L80) if necessary +* Sample data for all these tables, use [clickhouse-obfuscator](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/programs/obfuscator/Obfuscator.cpp#L42-L80) if necessary * Queries to run that lead to unexpected result **Expected behavior** diff --git a/.github/ISSUE_TEMPLATE/performance-issue.md b/.github/ISSUE_TEMPLATE/performance-issue.md index 402617d00f7..96c8cb77afb 100644 --- a/.github/ISSUE_TEMPLATE/performance-issue.md +++ b/.github/ISSUE_TEMPLATE/performance-issue.md @@ -17,7 +17,7 @@ What exactly works slower than expected? * Which interface to use, if matters * Non-default settings, if any * `CREATE TABLE` statements for all tables involved -* Sample data for all these tables, use [clickhouse-obfuscator](https://github.com/yandex/ClickHouse/blob/master/dbms/programs/obfuscator/Obfuscator.cpp#L42-L80) if necessary +* Sample data for all these tables, use [clickhouse-obfuscator](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/programs/obfuscator/Obfuscator.cpp#L42-L80) if necessary * Queries to run that lead to slow performance **Expected performance** diff --git a/CHANGELOG.md b/CHANGELOG.md index c385831af85..9b03364e2d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,401 +2,401 @@ ### Bug Fix * This release also contains all bug fixes from 19.14.6.12. 
-* Fixed possible inconsistent state of table while executing `DROP` query for replicated table while zookeeper is not accessible. [#6045](https://github.com/yandex/ClickHouse/issues/6045) [#6413](https://github.com/yandex/ClickHouse/pull/6413) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) -* Fix for data race in StorageMerge [#6717](https://github.com/yandex/ClickHouse/pull/6717) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix bug introduced in query profiler which leads to endless recv from socket. [#6386](https://github.com/yandex/ClickHouse/pull/6386) ([alesapin](https://github.com/alesapin)) -* Fix excessive CPU usage while executing `JSONExtractRaw` function over a boolean value. [#6208](https://github.com/yandex/ClickHouse/pull/6208) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fixes the regression while pushing to materialized view. [#6415](https://github.com/yandex/ClickHouse/pull/6415) ([Ivan](https://github.com/abyss7)) -* Table function `url` had the vulnerability allowed the attacker to inject arbitrary HTTP headers in the request. This issue was found by [Nikita Tikhomirov](https://github.com/NSTikhomirov). [#6466](https://github.com/yandex/ClickHouse/pull/6466) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix useless `AST` check in Set index. [#6510](https://github.com/yandex/ClickHouse/issues/6510) [#6651](https://github.com/yandex/ClickHouse/pull/6651) ([Nikita Vasilev](https://github.com/nikvas0)) -* Fixed parsing of `AggregateFunction` values embedded in query. [#6575](https://github.com/yandex/ClickHouse/issues/6575) [#6773](https://github.com/yandex/ClickHouse/pull/6773) ([Zhichang Yu](https://github.com/yuzhichang)) -* Fixed wrong behaviour of `trim` functions family. [#6647](https://github.com/yandex/ClickHouse/pull/6647) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed possible inconsistent state of table while executing `DROP` query for replicated table while zookeeper is not accessible. [#6045](https://github.com/ClickHouse/ClickHouse/issues/6045) [#6413](https://github.com/ClickHouse/ClickHouse/pull/6413) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) +* Fix for data race in StorageMerge [#6717](https://github.com/ClickHouse/ClickHouse/pull/6717) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix bug introduced in query profiler which leads to endless recv from socket. [#6386](https://github.com/ClickHouse/ClickHouse/pull/6386) ([alesapin](https://github.com/alesapin)) +* Fix excessive CPU usage while executing `JSONExtractRaw` function over a boolean value. [#6208](https://github.com/ClickHouse/ClickHouse/pull/6208) ([Vitaly Baranov](https://github.com/vitlibar)) +* Fixes the regression while pushing to materialized view. [#6415](https://github.com/ClickHouse/ClickHouse/pull/6415) ([Ivan](https://github.com/abyss7)) +* Table function `url` had the vulnerability allowed the attacker to inject arbitrary HTTP headers in the request. This issue was found by [Nikita Tikhomirov](https://github.com/NSTikhomirov). [#6466](https://github.com/ClickHouse/ClickHouse/pull/6466) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix useless `AST` check in Set index. [#6510](https://github.com/ClickHouse/ClickHouse/issues/6510) [#6651](https://github.com/ClickHouse/ClickHouse/pull/6651) ([Nikita Vasilev](https://github.com/nikvas0)) +* Fixed parsing of `AggregateFunction` values embedded in query. 
[#6575](https://github.com/ClickHouse/ClickHouse/issues/6575) [#6773](https://github.com/ClickHouse/ClickHouse/pull/6773) ([Zhichang Yu](https://github.com/yuzhichang)) +* Fixed wrong behaviour of `trim` functions family. [#6647](https://github.com/ClickHouse/ClickHouse/pull/6647) ([alexey-milovidov](https://github.com/alexey-milovidov)) ## ClickHouse release 19.14.6.12, 2019-09-19 ### Bug Fix -* Fix for function `АrrayEnumerateUniqRanked` with empty arrays in params. [#6928](https://github.com/yandex/ClickHouse/pull/6928) ([proller](https://github.com/proller)) -* Fixed subquery name in queries with `ARRAY JOIN` and `GLOBAL IN subquery` with alias. Use subquery alias for external table name if it is specified. [#6934](https://github.com/yandex/ClickHouse/pull/6934) ([Ivan](https://github.com/abyss7)) +* Fix for function `АrrayEnumerateUniqRanked` with empty arrays in params. [#6928](https://github.com/ClickHouse/ClickHouse/pull/6928) ([proller](https://github.com/proller)) +* Fixed subquery name in queries with `ARRAY JOIN` and `GLOBAL IN subquery` with alias. Use subquery alias for external table name if it is specified. [#6934](https://github.com/ClickHouse/ClickHouse/pull/6934) ([Ivan](https://github.com/abyss7)) ### Build/Testing/Packaging Improvement -* Fix [flapping](https://clickhouse-test-reports.s3.yandex.net/6944/aab95fd5175a513413c7395a73a82044bdafb906/functional_stateless_tests_(debug).html) test `00715_fetch_merged_or_mutated_part_zookeeper` by rewriting it to a shell scripts because it needs to wait for mutations to apply. [#6977](https://github.com/yandex/ClickHouse/pull/6977) ([Alexander Kazakov](https://github.com/Akazz)) -* Fixed UBSan and MemSan failure in function `groupUniqArray` with emtpy array argument. It was caused by placing of empty `PaddedPODArray` into hash table zero cell because constructor for zero cell value was not called. [#6937](https://github.com/yandex/ClickHouse/pull/6937) ([Amos Bird](https://github.com/amosbird)) +* Fix [flapping](https://clickhouse-test-reports.s3.yandex.net/6944/aab95fd5175a513413c7395a73a82044bdafb906/functional_stateless_tests_(debug).html) test `00715_fetch_merged_or_mutated_part_zookeeper` by rewriting it to a shell scripts because it needs to wait for mutations to apply. [#6977](https://github.com/ClickHouse/ClickHouse/pull/6977) ([Alexander Kazakov](https://github.com/Akazz)) +* Fixed UBSan and MemSan failure in function `groupUniqArray` with emtpy array argument. It was caused by placing of empty `PaddedPODArray` into hash table zero cell because constructor for zero cell value was not called. [#6937](https://github.com/ClickHouse/ClickHouse/pull/6937) ([Amos Bird](https://github.com/amosbird)) ## ClickHouse release 19.14.3.3, 2019-09-10 ### New Feature -* `WITH FILL` modifier for `ORDER BY`. (continuation of [#5069](https://github.com/yandex/ClickHouse/issues/5069)) [#6610](https://github.com/yandex/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ)) -* `WITH TIES` modifier for `LIMIT`. (continuation of [#5069](https://github.com/yandex/ClickHouse/issues/5069)) [#6610](https://github.com/yandex/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ)) -* Parse unquoted `NULL` literal as NULL (if setting `format_csv_unquoted_null_literal_as_null=1`). Initialize null fields with default values if data type of this field is not nullable (if setting `input_format_null_as_default=1`). 
[#5990](https://github.com/yandex/ClickHouse/issues/5990) [#6055](https://github.com/yandex/ClickHouse/pull/6055) ([tavplubix](https://github.com/tavplubix)) -* Support for wildcards in paths of table functions `file` and `hdfs`. If the path contains wildcards, the table will be readonly. Example of usage: `select * from hdfs('hdfs://hdfs1:9000/some_dir/another_dir/*/file{0..9}{0..9}')` and `select * from file('some_dir/{some_file,another_file,yet_another}.tsv', 'TSV', 'value UInt32')`. [#6092](https://github.com/yandex/ClickHouse/pull/6092) ([Olga Khvostikova](https://github.com/stavrolia)) -* New `system.metric_log` table which stores values of `system.events` and `system.metrics` with specified time interval. [#6363](https://github.com/yandex/ClickHouse/issues/6363) [#6467](https://github.com/yandex/ClickHouse/pull/6467) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [#6530](https://github.com/yandex/ClickHouse/pull/6530) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Allow to write ClickHouse text logs to `system.text_log` table. [#6037](https://github.com/yandex/ClickHouse/issues/6037) [#6103](https://github.com/yandex/ClickHouse/pull/6103) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [#6164](https://github.com/yandex/ClickHouse/pull/6164) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Show private symbols in stack traces (this is done via parsing symbol tables of ELF files). Added information about file and line number in stack traces if debug info is present. Speedup symbol name lookup with indexing symbols present in program. Added new SQL functions for introspection: `demangle` and `addressToLine`. Renamed function `symbolizeAddress` to `addressToSymbol` for consistency. Function `addressToSymbol` will return mangled name for performance reasons and you have to apply `demangle`. Added setting `allow_introspection_functions` which is turned off by default. [#6201](https://github.com/yandex/ClickHouse/pull/6201) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Table function `values` (the name is case-insensitive). It allows to read from `VALUES` list proposed in [#5984](https://github.com/yandex/ClickHouse/issues/5984). Example: `SELECT * FROM VALUES('a UInt64, s String', (1, 'one'), (2, 'two'), (3, 'three'))`. [#6217](https://github.com/yandex/ClickHouse/issues/6217). [#6209](https://github.com/yandex/ClickHouse/pull/6209) ([dimarub2000](https://github.com/dimarub2000)) -* Added an ability to alter storage settings. Syntax: `ALTER TABLE
MODIFY SETTING = `. [#6366](https://github.com/yandex/ClickHouse/pull/6366) [#6669](https://github.com/yandex/ClickHouse/pull/6669) [#6685](https://github.com/yandex/ClickHouse/pull/6685) ([alesapin](https://github.com/alesapin)) -* Support for removing of detached parts. Syntax: `ALTER TABLE DROP DETACHED PART ''`. [#6158](https://github.com/yandex/ClickHouse/pull/6158) ([tavplubix](https://github.com/tavplubix)) -* Table constraints. Allows to add constraint to table definition which will be checked at insert. [#5273](https://github.com/yandex/ClickHouse/pull/5273) ([Gleb Novikov](https://github.com/NanoBjorn)) [#6652](https://github.com/yandex/ClickHouse/pull/6652) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Suppport for cascaded materialized views. [#6324](https://github.com/yandex/ClickHouse/pull/6324) ([Amos Bird](https://github.com/amosbird)) -* Turn on query profiler by default to sample every query execution thread once a second. [#6283](https://github.com/yandex/ClickHouse/pull/6283) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Input format `ORC`. [#6454](https://github.com/yandex/ClickHouse/pull/6454) [#6703](https://github.com/yandex/ClickHouse/pull/6703) ([akonyaev90](https://github.com/akonyaev90)) -* Added two new functions: `sigmoid` and `tanh` (that are useful for machine learning applications). [#6254](https://github.com/yandex/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Function `hasToken(haystack, token)`, `hasTokenCaseInsensitive(haystack, token)` to check if given token is in haystack. Token is a maximal length substring between two non alphanumeric ASCII characters (or boundaries of haystack). Token must be a constant string. Supported by tokenbf_v1 index specialization. [#6596](https://github.com/yandex/ClickHouse/pull/6596), [#6662](https://github.com/yandex/ClickHouse/pull/6662) ([Vasily Nemkov](https://github.com/Enmk)) -* New function `neighbor(value, offset[, default_value])`. Allows to reach prev/next value within column in a block of data. [#5925](https://github.com/yandex/ClickHouse/pull/5925) ([Alex Krash](https://github.com/alex-krash)) [6685365ab8c5b74f9650492c88a012596eb1b0c6](https://github.com/yandex/ClickHouse/commit/6685365ab8c5b74f9650492c88a012596eb1b0c6) [341e2e4587a18065c2da1ca888c73389f48ce36c](https://github.com/yandex/ClickHouse/commit/341e2e4587a18065c2da1ca888c73389f48ce36c) [Alexey Milovidov](https://github.com/alexey-milovidov) -* Created a function `currentUser()`, returning login of authorized user. Added alias `user()` for compatibility with MySQL. [#6470](https://github.com/yandex/ClickHouse/pull/6470) ([Alex Krash](https://github.com/alex-krash)) -* New aggregate functions `quantilesExactInclusive` and `quantilesExactExclusive` which were proposed in [#5885](https://github.com/yandex/ClickHouse/issues/5885). [#6477](https://github.com/yandex/ClickHouse/pull/6477) ([dimarub2000](https://github.com/dimarub2000)) -* Function `bitmapRange(bitmap, range_begin, range_end)` which returns new set with specified range (not include the `range_end`). [#6314](https://github.com/yandex/ClickHouse/pull/6314) ([Zhichang Yu](https://github.com/yuzhichang)) -* Function `geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precision)` which creates array of precision-long strings of geohash-boxes covering provided area. 
[#6127](https://github.com/yandex/ClickHouse/pull/6127) ([Vasily Nemkov](https://github.com/Enmk)) -* Implement support for INSERT query with `Kafka` tables. [#6012](https://github.com/yandex/ClickHouse/pull/6012) ([Ivan](https://github.com/abyss7)) -* Added support for `_partition` and `_timestamp` virtual columns to Kafka engine. [#6400](https://github.com/yandex/ClickHouse/pull/6400) ([Ivan](https://github.com/abyss7)) -* Possibility to remove sensitive data from `query_log`, server logs, process list with regexp-based rules. [#5710](https://github.com/yandex/ClickHouse/pull/5710) ([filimonov](https://github.com/filimonov)) +* `WITH FILL` modifier for `ORDER BY`. (continuation of [#5069](https://github.com/ClickHouse/ClickHouse/issues/5069)) [#6610](https://github.com/ClickHouse/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ)) +* `WITH TIES` modifier for `LIMIT`. (continuation of [#5069](https://github.com/ClickHouse/ClickHouse/issues/5069)) [#6610](https://github.com/ClickHouse/ClickHouse/pull/6610) ([Anton Popov](https://github.com/CurtizJ)) +* Parse unquoted `NULL` literal as NULL (if setting `format_csv_unquoted_null_literal_as_null=1`). Initialize null fields with default values if data type of this field is not nullable (if setting `input_format_null_as_default=1`). [#5990](https://github.com/ClickHouse/ClickHouse/issues/5990) [#6055](https://github.com/ClickHouse/ClickHouse/pull/6055) ([tavplubix](https://github.com/tavplubix)) +* Support for wildcards in paths of table functions `file` and `hdfs`. If the path contains wildcards, the table will be readonly. Example of usage: `select * from hdfs('hdfs://hdfs1:9000/some_dir/another_dir/*/file{0..9}{0..9}')` and `select * from file('some_dir/{some_file,another_file,yet_another}.tsv', 'TSV', 'value UInt32')`. [#6092](https://github.com/ClickHouse/ClickHouse/pull/6092) ([Olga Khvostikova](https://github.com/stavrolia)) +* New `system.metric_log` table which stores values of `system.events` and `system.metrics` with specified time interval. [#6363](https://github.com/ClickHouse/ClickHouse/issues/6363) [#6467](https://github.com/ClickHouse/ClickHouse/pull/6467) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [#6530](https://github.com/ClickHouse/ClickHouse/pull/6530) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Allow to write ClickHouse text logs to `system.text_log` table. [#6037](https://github.com/ClickHouse/ClickHouse/issues/6037) [#6103](https://github.com/ClickHouse/ClickHouse/pull/6103) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) [#6164](https://github.com/ClickHouse/ClickHouse/pull/6164) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Show private symbols in stack traces (this is done via parsing symbol tables of ELF files). Added information about file and line number in stack traces if debug info is present. Speedup symbol name lookup with indexing symbols present in program. Added new SQL functions for introspection: `demangle` and `addressToLine`. Renamed function `symbolizeAddress` to `addressToSymbol` for consistency. Function `addressToSymbol` will return mangled name for performance reasons and you have to apply `demangle`. Added setting `allow_introspection_functions` which is turned off by default. [#6201](https://github.com/ClickHouse/ClickHouse/pull/6201) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Table function `values` (the name is case-insensitive). 
It allows to read from `VALUES` list proposed in [#5984](https://github.com/ClickHouse/ClickHouse/issues/5984). Example: `SELECT * FROM VALUES('a UInt64, s String', (1, 'one'), (2, 'two'), (3, 'three'))`. [#6217](https://github.com/ClickHouse/ClickHouse/issues/6217). [#6209](https://github.com/ClickHouse/ClickHouse/pull/6209) ([dimarub2000](https://github.com/dimarub2000)) +* Added an ability to alter storage settings. Syntax: `ALTER TABLE
MODIFY SETTING = `. [#6366](https://github.com/ClickHouse/ClickHouse/pull/6366) [#6669](https://github.com/ClickHouse/ClickHouse/pull/6669) [#6685](https://github.com/ClickHouse/ClickHouse/pull/6685) ([alesapin](https://github.com/alesapin)) +* Support for removing of detached parts. Syntax: `ALTER TABLE DROP DETACHED PART ''`. [#6158](https://github.com/ClickHouse/ClickHouse/pull/6158) ([tavplubix](https://github.com/tavplubix)) +* Table constraints. Allows to add constraint to table definition which will be checked at insert. [#5273](https://github.com/ClickHouse/ClickHouse/pull/5273) ([Gleb Novikov](https://github.com/NanoBjorn)) [#6652](https://github.com/ClickHouse/ClickHouse/pull/6652) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Suppport for cascaded materialized views. [#6324](https://github.com/ClickHouse/ClickHouse/pull/6324) ([Amos Bird](https://github.com/amosbird)) +* Turn on query profiler by default to sample every query execution thread once a second. [#6283](https://github.com/ClickHouse/ClickHouse/pull/6283) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Input format `ORC`. [#6454](https://github.com/ClickHouse/ClickHouse/pull/6454) [#6703](https://github.com/ClickHouse/ClickHouse/pull/6703) ([akonyaev90](https://github.com/akonyaev90)) +* Added two new functions: `sigmoid` and `tanh` (that are useful for machine learning applications). [#6254](https://github.com/ClickHouse/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Function `hasToken(haystack, token)`, `hasTokenCaseInsensitive(haystack, token)` to check if given token is in haystack. Token is a maximal length substring between two non alphanumeric ASCII characters (or boundaries of haystack). Token must be a constant string. Supported by tokenbf_v1 index specialization. [#6596](https://github.com/ClickHouse/ClickHouse/pull/6596), [#6662](https://github.com/ClickHouse/ClickHouse/pull/6662) ([Vasily Nemkov](https://github.com/Enmk)) +* New function `neighbor(value, offset[, default_value])`. Allows to reach prev/next value within column in a block of data. [#5925](https://github.com/ClickHouse/ClickHouse/pull/5925) ([Alex Krash](https://github.com/alex-krash)) [6685365ab8c5b74f9650492c88a012596eb1b0c6](https://github.com/ClickHouse/ClickHouse/commit/6685365ab8c5b74f9650492c88a012596eb1b0c6) [341e2e4587a18065c2da1ca888c73389f48ce36c](https://github.com/ClickHouse/ClickHouse/commit/341e2e4587a18065c2da1ca888c73389f48ce36c) [Alexey Milovidov](https://github.com/alexey-milovidov) +* Created a function `currentUser()`, returning login of authorized user. Added alias `user()` for compatibility with MySQL. [#6470](https://github.com/ClickHouse/ClickHouse/pull/6470) ([Alex Krash](https://github.com/alex-krash)) +* New aggregate functions `quantilesExactInclusive` and `quantilesExactExclusive` which were proposed in [#5885](https://github.com/ClickHouse/ClickHouse/issues/5885). [#6477](https://github.com/ClickHouse/ClickHouse/pull/6477) ([dimarub2000](https://github.com/dimarub2000)) +* Function `bitmapRange(bitmap, range_begin, range_end)` which returns new set with specified range (not include the `range_end`). [#6314](https://github.com/ClickHouse/ClickHouse/pull/6314) ([Zhichang Yu](https://github.com/yuzhichang)) +* Function `geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precision)` which creates array of precision-long strings of geohash-boxes covering provided area. 
[#6127](https://github.com/ClickHouse/ClickHouse/pull/6127) ([Vasily Nemkov](https://github.com/Enmk)) +* Implement support for INSERT query with `Kafka` tables. [#6012](https://github.com/ClickHouse/ClickHouse/pull/6012) ([Ivan](https://github.com/abyss7)) +* Added support for `_partition` and `_timestamp` virtual columns to Kafka engine. [#6400](https://github.com/ClickHouse/ClickHouse/pull/6400) ([Ivan](https://github.com/abyss7)) +* Possibility to remove sensitive data from `query_log`, server logs, process list with regexp-based rules. [#5710](https://github.com/ClickHouse/ClickHouse/pull/5710) ([filimonov](https://github.com/filimonov)) ### Experimental Feature -* Input and output data format `Template`. It allows to specify custom format string for input and output. [#4354](https://github.com/yandex/ClickHouse/issues/4354) [#6727](https://github.com/yandex/ClickHouse/pull/6727) ([tavplubix](https://github.com/tavplubix)) -* Implementation of `LIVE VIEW` tables that were originally proposed in [#2898](https://github.com/yandex/ClickHouse/pull/2898), prepared in [#3925](https://github.com/yandex/ClickHouse/issues/3925), and then updated in [#5541](https://github.com/yandex/ClickHouse/issues/5541). See [#5541](https://github.com/yandex/ClickHouse/issues/5541) for detailed description. [#5541](https://github.com/yandex/ClickHouse/issues/5541) ([vzakaznikov](https://github.com/vzakaznikov)) [#6425](https://github.com/yandex/ClickHouse/pull/6425) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) [#6656](https://github.com/yandex/ClickHouse/pull/6656) ([vzakaznikov](https://github.com/vzakaznikov)) Note that `LIVE VIEW` feature may be removed in next versions. +* Input and output data format `Template`. It allows to specify custom format string for input and output. [#4354](https://github.com/ClickHouse/ClickHouse/issues/4354) [#6727](https://github.com/ClickHouse/ClickHouse/pull/6727) ([tavplubix](https://github.com/tavplubix)) +* Implementation of `LIVE VIEW` tables that were originally proposed in [#2898](https://github.com/ClickHouse/ClickHouse/pull/2898), prepared in [#3925](https://github.com/ClickHouse/ClickHouse/issues/3925), and then updated in [#5541](https://github.com/ClickHouse/ClickHouse/issues/5541). See [#5541](https://github.com/ClickHouse/ClickHouse/issues/5541) for detailed description. [#5541](https://github.com/ClickHouse/ClickHouse/issues/5541) ([vzakaznikov](https://github.com/vzakaznikov)) [#6425](https://github.com/ClickHouse/ClickHouse/pull/6425) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) [#6656](https://github.com/ClickHouse/ClickHouse/pull/6656) ([vzakaznikov](https://github.com/vzakaznikov)) Note that `LIVE VIEW` feature may be removed in next versions. ### Bug Fix * This release also contains all bug fixes from 19.13 and 19.11. -* Fix segmentation fault when the table has skip indices and vertical merge happens. [#6723](https://github.com/yandex/ClickHouse/pull/6723) ([alesapin](https://github.com/alesapin)) -* Fix per-column TTL with non-trivial column defaults. Previously in case of force TTL merge with `OPTIMIZE ... FINAL` query, expired values was replaced by type defaults instead of user-specified column defaults. [#6796](https://github.com/yandex/ClickHouse/pull/6796) ([Anton Popov](https://github.com/CurtizJ)) -* Fix Kafka messages duplication problem on normal server restart. [#6597](https://github.com/yandex/ClickHouse/pull/6597) ([Ivan](https://github.com/abyss7)) -* Fixed infinite loop when reading Kafka messages. 
Do not pause/resume consumer on subscription at all - otherwise it may get paused indefinitely in some scenarios. [#6354](https://github.com/yandex/ClickHouse/pull/6354) ([Ivan](https://github.com/abyss7)) -* Fix `Key expression contains comparison between inconvertible types` exception in `bitmapContains` function. [#6136](https://github.com/yandex/ClickHouse/issues/6136) [#6146](https://github.com/yandex/ClickHouse/issues/6146) [#6156](https://github.com/yandex/ClickHouse/pull/6156) ([dimarub2000](https://github.com/dimarub2000)) -* Fix segfault with enabled `optimize_skip_unused_shards` and missing sharding key. [#6384](https://github.com/yandex/ClickHouse/pull/6384) ([Anton Popov](https://github.com/CurtizJ)) -* Fixed wrong code in mutations that may lead to memory corruption. Fixed segfault with read of address `0x14c0` that may happed due to concurrent `DROP TABLE` and `SELECT` from `system.parts` or `system.parts_columns`. Fixed race condition in preparation of mutation queries. Fixed deadlock caused by `OPTIMIZE` of Replicated tables and concurrent modification operations like ALTERs. [#6514](https://github.com/yandex/ClickHouse/pull/6514) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Removed extra verbose logging in MySQL interface [#6389](https://github.com/yandex/ClickHouse/pull/6389) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Return ability to parse boolean settings from 'true' and 'false' in configuration file. [#6278](https://github.com/yandex/ClickHouse/pull/6278) ([alesapin](https://github.com/alesapin)) -* Fix crash in `quantile` and `median` function over `Nullable(Decimal128)`. [#6378](https://github.com/yandex/ClickHouse/pull/6378) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed possible incomplete result returned by `SELECT` query with `WHERE` condition on primary key contained conversion to Float type. It was caused by incorrect checking of monotonicity in `toFloat` function. [#6248](https://github.com/yandex/ClickHouse/issues/6248) [#6374](https://github.com/yandex/ClickHouse/pull/6374) ([dimarub2000](https://github.com/dimarub2000)) -* Check `max_expanded_ast_elements` setting for mutations. Clear mutations after `TRUNCATE TABLE`. [#6205](https://github.com/yandex/ClickHouse/pull/6205) ([Winter Zhang](https://github.com/zhang2014)) -* Fix JOIN results for key columns when used with `join_use_nulls`. Attach Nulls instead of columns defaults. [#6249](https://github.com/yandex/ClickHouse/pull/6249) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix for skip indices with vertical merge and alter. Fix for `Bad size of marks file` exception. [#6594](https://github.com/yandex/ClickHouse/issues/6594) [#6713](https://github.com/yandex/ClickHouse/pull/6713) ([alesapin](https://github.com/alesapin)) -* Fix rare crash in `ALTER MODIFY COLUMN` and vertical merge when one of merged/altered parts is empty (0 rows) [#6746](https://github.com/yandex/ClickHouse/issues/6746) [#6780](https://github.com/yandex/ClickHouse/pull/6780) ([alesapin](https://github.com/alesapin)) -* Fixed bug in conversion of `LowCardinality` types in `AggregateFunctionFactory`. This fixes [#6257](https://github.com/yandex/ClickHouse/issues/6257). [#6281](https://github.com/yandex/ClickHouse/pull/6281) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix wrong behavior and possible segfaults in `topK` and `topKWeighted` aggregated functions. 
[#6404](https://github.com/yandex/ClickHouse/pull/6404) ([Anton Popov](https://github.com/CurtizJ)) -* Fixed unsafe code around `getIdentifier` function. [#6401](https://github.com/yandex/ClickHouse/issues/6401) [#6409](https://github.com/yandex/ClickHouse/pull/6409) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed bug in MySQL wire protocol (is used while connecting to ClickHouse form MySQL client). Caused by heap buffer overflow in `PacketPayloadWriteBuffer`. [#6212](https://github.com/yandex/ClickHouse/pull/6212) ([Yuriy Baranov](https://github.com/yurriy)) -* Fixed memory leak in `bitmapSubsetInRange` function. [#6819](https://github.com/yandex/ClickHouse/pull/6819) ([Zhichang Yu](https://github.com/yuzhichang)) -* Fix rare bug when mutation executed after granularity change. [#6816](https://github.com/yandex/ClickHouse/pull/6816) ([alesapin](https://github.com/alesapin)) -* Allow protobuf message with all fields by default. [#6132](https://github.com/yandex/ClickHouse/pull/6132) ([Vitaly Baranov](https://github.com/vitlibar)) -* Resolve a bug with `nullIf` function when we send a `NULL` argument on the second argument. [#6446](https://github.com/yandex/ClickHouse/pull/6446) ([Guillaume Tassery](https://github.com/YiuRULE)) -* Fix rare bug with wrong memory allocation/deallocation in complex key cache dictionaries with string fields which leads to infinite memory consumption (looks like memory leak). Bug reproduces when string size was a power of two starting from eight (8, 16, 32, etc). [#6447](https://github.com/yandex/ClickHouse/pull/6447) ([alesapin](https://github.com/alesapin)) -* Fixed Gorilla encoding on small sequences which caused exception `Cannot write after end of buffer`. [#6398](https://github.com/yandex/ClickHouse/issues/6398) [#6444](https://github.com/yandex/ClickHouse/pull/6444) ([Vasily Nemkov](https://github.com/Enmk)) -* Allow to use not nullable types in JOINs with `join_use_nulls` enabled. [#6705](https://github.com/yandex/ClickHouse/pull/6705) ([Artem Zuikov](https://github.com/4ertus2)) -* Disable `Poco::AbstractConfiguration` substitutions in query in `clickhouse-client`. [#6706](https://github.com/yandex/ClickHouse/pull/6706) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Avoid deadlock in `REPLACE PARTITION`. [#6677](https://github.com/yandex/ClickHouse/pull/6677) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Using `arrayReduce` for constant arguments may lead to segfault. [#6242](https://github.com/yandex/ClickHouse/issues/6242) [#6326](https://github.com/yandex/ClickHouse/pull/6326) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix inconsistent parts which can appear if replica was restored after `DROP PARTITION`. [#6522](https://github.com/yandex/ClickHouse/issues/6522) [#6523](https://github.com/yandex/ClickHouse/pull/6523) ([tavplubix](https://github.com/tavplubix)) -* Fixed hang in `JSONExtractRaw` function. [#6195](https://github.com/yandex/ClickHouse/issues/6195) [#6198](https://github.com/yandex/ClickHouse/pull/6198) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix bug with incorrect skip indices serialization and aggregation with adaptive granularity. [#6594](https://github.com/yandex/ClickHouse/issues/6594). [#6748](https://github.com/yandex/ClickHouse/pull/6748) ([alesapin](https://github.com/alesapin)) -* Fix `WITH ROLLUP` and `WITH CUBE` modifiers of `GROUP BY` with two-level aggregation. 
[#6225](https://github.com/yandex/ClickHouse/pull/6225) ([Anton Popov](https://github.com/CurtizJ)) -* Fix bug with writing secondary indices marks with adaptive granularity. [#6126](https://github.com/yandex/ClickHouse/pull/6126) ([alesapin](https://github.com/alesapin)) -* Fix initialization order while server startup. Since `StorageMergeTree::background_task_handle` is initialized in `startup()` the `MergeTreeBlockOutputStream::write()` may try to use it before initialization. Just check if it is initialized. [#6080](https://github.com/yandex/ClickHouse/pull/6080) ([Ivan](https://github.com/abyss7)) -* Clearing the data buffer from the previous read operation that was completed with an error. [#6026](https://github.com/yandex/ClickHouse/pull/6026) ([Nikolay](https://github.com/bopohaa)) -* Fix bug with enabling adaptive granularity when creating new replica for Replicated*MergeTree table. [#6394](https://github.com/yandex/ClickHouse/issues/6394) [#6452](https://github.com/yandex/ClickHouse/pull/6452) ([alesapin](https://github.com/alesapin)) -* Fixed possible crash during server startup in case of exception happened in `libunwind` during exception at access to uninitialised `ThreadStatus` structure. [#6456](https://github.com/yandex/ClickHouse/pull/6456) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) -* Fix crash in `yandexConsistentHash` function. Found by fuzz test. [#6304](https://github.com/yandex/ClickHouse/issues/6304) [#6305](https://github.com/yandex/ClickHouse/pull/6305) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed the possibility of hanging queries when server is overloaded and global thread pool becomes near full. This have higher chance to happen on clusters with large number of shards (hundreds), because distributed queries allocate a thread per connection to each shard. For example, this issue may reproduce if a cluster of 330 shards is processing 30 concurrent distributed queries. This issue affects all versions starting from 19.2. [#6301](https://github.com/yandex/ClickHouse/pull/6301) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed logic of `arrayEnumerateUniqRanked` function. [#6423](https://github.com/yandex/ClickHouse/pull/6423) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix segfault when decoding symbol table. [#6603](https://github.com/yandex/ClickHouse/pull/6603) ([Amos Bird](https://github.com/amosbird)) -* Fixed irrelevant exception in cast of `LowCardinality(Nullable)` to not-Nullable column in case if it doesn't contain Nulls (e.g. in query like `SELECT CAST(CAST('Hello' AS LowCardinality(Nullable(String))) AS String)`. [#6094](https://github.com/yandex/ClickHouse/issues/6094) [#6119](https://github.com/yandex/ClickHouse/pull/6119) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Removed extra quoting of description in `system.settings` table. [#6696](https://github.com/yandex/ClickHouse/issues/6696) [#6699](https://github.com/yandex/ClickHouse/pull/6699) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Avoid possible deadlock in `TRUNCATE` of Replicated table. [#6695](https://github.com/yandex/ClickHouse/pull/6695) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix reading in order of sorting key. [#6189](https://github.com/yandex/ClickHouse/pull/6189) ([Anton Popov](https://github.com/CurtizJ)) -* Fix `ALTER TABLE ... UPDATE` query for tables with `enable_mixed_granularity_parts=1`. 
[#6543](https://github.com/yandex/ClickHouse/pull/6543) ([alesapin](https://github.com/alesapin)) -* Fixed the case when server may close listening sockets but not shutdown and continue serving remaining queries. You may end up with two running clickhouse-server processes. Sometimes, the server may return an error `bad_function_call` for remaining queries. [#6231](https://github.com/yandex/ClickHouse/pull/6231) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix bug opened by [#4405](https://github.com/yandex/ClickHouse/pull/4405) (since 19.4.0). Reproduces in queries to Distributed tables over MergeTree tables when we doesn't query any columns (`SELECT 1`). [#6236](https://github.com/yandex/ClickHouse/pull/6236) ([alesapin](https://github.com/alesapin)) -* Fixed overflow in integer division of signed type to unsigned type. The behaviour was exactly as in C or C++ language (integer promotion rules) that may be surprising. Please note that the overflow is still possible when dividing large signed number to large unsigned number or vice-versa (but that case is less usual). The issue existed in all server versions. [#6214](https://github.com/yandex/ClickHouse/issues/6214) [#6233](https://github.com/yandex/ClickHouse/pull/6233) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Limit maximum sleep time for throttling when `max_execution_speed` or `max_execution_speed_bytes` is set. Fixed false errors like `Estimated query execution time (inf seconds) is too long`. [#5547](https://github.com/yandex/ClickHouse/issues/5547) [#6232](https://github.com/yandex/ClickHouse/pull/6232) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed issues about using `MATERIALIZED` columns and aliases in `MaterializedView`. [#448](https://github.com/yandex/ClickHouse/issues/448) [#3484](https://github.com/yandex/ClickHouse/issues/3484) [#3450](https://github.com/yandex/ClickHouse/issues/3450) [#2878](https://github.com/yandex/ClickHouse/issues/2878) [#2285](https://github.com/yandex/ClickHouse/issues/2285) [#3796](https://github.com/yandex/ClickHouse/pull/3796) ([Amos Bird](https://github.com/amosbird)) [#6316](https://github.com/yandex/ClickHouse/pull/6316) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix `FormatFactory` behaviour for input streams which are not implemented as processor. [#6495](https://github.com/yandex/ClickHouse/pull/6495) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fixed typo. [#6631](https://github.com/yandex/ClickHouse/pull/6631) ([Alex Ryndin](https://github.com/alexryndin)) -* Typo in the error message ( is -> are ). [#6839](https://github.com/yandex/ClickHouse/pull/6839) ([Denis Zhuravlev](https://github.com/den-crane)) -* Fixed error while parsing of columns list from string if type contained a comma (this issue was relevant for `File`, `URL`, `HDFS` storages) [#6217](https://github.com/yandex/ClickHouse/issues/6217). [#6209](https://github.com/yandex/ClickHouse/pull/6209) ([dimarub2000](https://github.com/dimarub2000)) +* Fix segmentation fault when the table has skip indices and vertical merge happens. [#6723](https://github.com/ClickHouse/ClickHouse/pull/6723) ([alesapin](https://github.com/alesapin)) +* Fix per-column TTL with non-trivial column defaults. Previously in case of force TTL merge with `OPTIMIZE ... FINAL` query, expired values was replaced by type defaults instead of user-specified column defaults. 
[#6796](https://github.com/ClickHouse/ClickHouse/pull/6796) ([Anton Popov](https://github.com/CurtizJ))
+* Fix Kafka messages duplication problem on normal server restart. [#6597](https://github.com/ClickHouse/ClickHouse/pull/6597) ([Ivan](https://github.com/abyss7))
+* Fixed infinite loop when reading Kafka messages. Do not pause/resume consumer on subscription at all - otherwise it may get paused indefinitely in some scenarios. [#6354](https://github.com/ClickHouse/ClickHouse/pull/6354) ([Ivan](https://github.com/abyss7))
+* Fix `Key expression contains comparison between inconvertible types` exception in `bitmapContains` function. [#6136](https://github.com/ClickHouse/ClickHouse/issues/6136) [#6146](https://github.com/ClickHouse/ClickHouse/issues/6146) [#6156](https://github.com/ClickHouse/ClickHouse/pull/6156) ([dimarub2000](https://github.com/dimarub2000))
+* Fix segfault with enabled `optimize_skip_unused_shards` and missing sharding key. [#6384](https://github.com/ClickHouse/ClickHouse/pull/6384) ([Anton Popov](https://github.com/CurtizJ))
+* Fixed wrong code in mutations that may lead to memory corruption. Fixed segfault with read of address `0x14c0` that may happen due to concurrent `DROP TABLE` and `SELECT` from `system.parts` or `system.parts_columns`. Fixed race condition in preparation of mutation queries. Fixed deadlock caused by `OPTIMIZE` of Replicated tables and concurrent modification operations like ALTERs. [#6514](https://github.com/ClickHouse/ClickHouse/pull/6514) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Removed extra verbose logging in MySQL interface. [#6389](https://github.com/ClickHouse/ClickHouse/pull/6389) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Return ability to parse boolean settings from 'true' and 'false' in configuration file. [#6278](https://github.com/ClickHouse/ClickHouse/pull/6278) ([alesapin](https://github.com/alesapin))
+* Fix crash in `quantile` and `median` function over `Nullable(Decimal128)`. [#6378](https://github.com/ClickHouse/ClickHouse/pull/6378) ([Artem Zuikov](https://github.com/4ertus2))
+* Fixed possible incomplete result returned by `SELECT` query with `WHERE` condition on primary key that contained a conversion to Float type. It was caused by incorrect checking of monotonicity in `toFloat` function. [#6248](https://github.com/ClickHouse/ClickHouse/issues/6248) [#6374](https://github.com/ClickHouse/ClickHouse/pull/6374) ([dimarub2000](https://github.com/dimarub2000))
+* Check `max_expanded_ast_elements` setting for mutations. Clear mutations after `TRUNCATE TABLE`. [#6205](https://github.com/ClickHouse/ClickHouse/pull/6205) ([Winter Zhang](https://github.com/zhang2014))
+* Fix JOIN results for key columns when used with `join_use_nulls`. Attach Nulls instead of column defaults. [#6249](https://github.com/ClickHouse/ClickHouse/pull/6249) ([Artem Zuikov](https://github.com/4ertus2))
+* Fix for skip indices with vertical merge and alter. Fix for `Bad size of marks file` exception. [#6594](https://github.com/ClickHouse/ClickHouse/issues/6594) [#6713](https://github.com/ClickHouse/ClickHouse/pull/6713) ([alesapin](https://github.com/alesapin))
+* Fix rare crash in `ALTER MODIFY COLUMN` and vertical merge when one of merged/altered parts is empty (0 rows). [#6746](https://github.com/ClickHouse/ClickHouse/issues/6746) [#6780](https://github.com/ClickHouse/ClickHouse/pull/6780) ([alesapin](https://github.com/alesapin))
+* Fixed bug in conversion of `LowCardinality` types in `AggregateFunctionFactory`.
This fixes [#6257](https://github.com/ClickHouse/ClickHouse/issues/6257). [#6281](https://github.com/ClickHouse/ClickHouse/pull/6281) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
+* Fix wrong behavior and possible segfaults in `topK` and `topKWeighted` aggregate functions. [#6404](https://github.com/ClickHouse/ClickHouse/pull/6404) ([Anton Popov](https://github.com/CurtizJ))
+* Fixed unsafe code around `getIdentifier` function. [#6401](https://github.com/ClickHouse/ClickHouse/issues/6401) [#6409](https://github.com/ClickHouse/ClickHouse/pull/6409) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fixed bug in MySQL wire protocol (used while connecting to ClickHouse from the MySQL client). Caused by heap buffer overflow in `PacketPayloadWriteBuffer`. [#6212](https://github.com/ClickHouse/ClickHouse/pull/6212) ([Yuriy Baranov](https://github.com/yurriy))
+* Fixed memory leak in `bitmapSubsetInRange` function. [#6819](https://github.com/ClickHouse/ClickHouse/pull/6819) ([Zhichang Yu](https://github.com/yuzhichang))
+* Fix rare bug when mutation executed after granularity change. [#6816](https://github.com/ClickHouse/ClickHouse/pull/6816) ([alesapin](https://github.com/alesapin))
+* Allow protobuf message with all fields by default. [#6132](https://github.com/ClickHouse/ClickHouse/pull/6132) ([Vitaly Baranov](https://github.com/vitlibar))
+* Resolve a bug with `nullIf` function when we send a `NULL` argument as the second argument. [#6446](https://github.com/ClickHouse/ClickHouse/pull/6446) ([Guillaume Tassery](https://github.com/YiuRULE))
+* Fix rare bug with wrong memory allocation/deallocation in complex key cache dictionaries with string fields which leads to infinite memory consumption (looks like memory leak). Bug reproduces when string size was a power of two starting from eight (8, 16, 32, etc). [#6447](https://github.com/ClickHouse/ClickHouse/pull/6447) ([alesapin](https://github.com/alesapin))
+* Fixed Gorilla encoding on small sequences which caused exception `Cannot write after end of buffer`. [#6398](https://github.com/ClickHouse/ClickHouse/issues/6398) [#6444](https://github.com/ClickHouse/ClickHouse/pull/6444) ([Vasily Nemkov](https://github.com/Enmk))
+* Allow to use not nullable types in JOINs with `join_use_nulls` enabled. [#6705](https://github.com/ClickHouse/ClickHouse/pull/6705) ([Artem Zuikov](https://github.com/4ertus2))
+* Disable `Poco::AbstractConfiguration` substitutions in query in `clickhouse-client`. [#6706](https://github.com/ClickHouse/ClickHouse/pull/6706) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Avoid deadlock in `REPLACE PARTITION`. [#6677](https://github.com/ClickHouse/ClickHouse/pull/6677) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Using `arrayReduce` for constant arguments may lead to segfault. [#6242](https://github.com/ClickHouse/ClickHouse/issues/6242) [#6326](https://github.com/ClickHouse/ClickHouse/pull/6326) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fix inconsistent parts which can appear if replica was restored after `DROP PARTITION`. [#6522](https://github.com/ClickHouse/ClickHouse/issues/6522) [#6523](https://github.com/ClickHouse/ClickHouse/pull/6523) ([tavplubix](https://github.com/tavplubix))
+* Fixed hang in `JSONExtractRaw` function.
[#6195](https://github.com/ClickHouse/ClickHouse/issues/6195) [#6198](https://github.com/ClickHouse/ClickHouse/pull/6198) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix bug with incorrect skip indices serialization and aggregation with adaptive granularity. [#6594](https://github.com/ClickHouse/ClickHouse/issues/6594). [#6748](https://github.com/ClickHouse/ClickHouse/pull/6748) ([alesapin](https://github.com/alesapin)) +* Fix `WITH ROLLUP` and `WITH CUBE` modifiers of `GROUP BY` with two-level aggregation. [#6225](https://github.com/ClickHouse/ClickHouse/pull/6225) ([Anton Popov](https://github.com/CurtizJ)) +* Fix bug with writing secondary indices marks with adaptive granularity. [#6126](https://github.com/ClickHouse/ClickHouse/pull/6126) ([alesapin](https://github.com/alesapin)) +* Fix initialization order while server startup. Since `StorageMergeTree::background_task_handle` is initialized in `startup()` the `MergeTreeBlockOutputStream::write()` may try to use it before initialization. Just check if it is initialized. [#6080](https://github.com/ClickHouse/ClickHouse/pull/6080) ([Ivan](https://github.com/abyss7)) +* Clearing the data buffer from the previous read operation that was completed with an error. [#6026](https://github.com/ClickHouse/ClickHouse/pull/6026) ([Nikolay](https://github.com/bopohaa)) +* Fix bug with enabling adaptive granularity when creating new replica for Replicated*MergeTree table. [#6394](https://github.com/ClickHouse/ClickHouse/issues/6394) [#6452](https://github.com/ClickHouse/ClickHouse/pull/6452) ([alesapin](https://github.com/alesapin)) +* Fixed possible crash during server startup in case of exception happened in `libunwind` during exception at access to uninitialised `ThreadStatus` structure. [#6456](https://github.com/ClickHouse/ClickHouse/pull/6456) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) +* Fix crash in `yandexConsistentHash` function. Found by fuzz test. [#6304](https://github.com/ClickHouse/ClickHouse/issues/6304) [#6305](https://github.com/ClickHouse/ClickHouse/pull/6305) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed the possibility of hanging queries when server is overloaded and global thread pool becomes near full. This have higher chance to happen on clusters with large number of shards (hundreds), because distributed queries allocate a thread per connection to each shard. For example, this issue may reproduce if a cluster of 330 shards is processing 30 concurrent distributed queries. This issue affects all versions starting from 19.2. [#6301](https://github.com/ClickHouse/ClickHouse/pull/6301) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed logic of `arrayEnumerateUniqRanked` function. [#6423](https://github.com/ClickHouse/ClickHouse/pull/6423) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix segfault when decoding symbol table. [#6603](https://github.com/ClickHouse/ClickHouse/pull/6603) ([Amos Bird](https://github.com/amosbird)) +* Fixed irrelevant exception in cast of `LowCardinality(Nullable)` to not-Nullable column in case if it doesn't contain Nulls (e.g. in query like `SELECT CAST(CAST('Hello' AS LowCardinality(Nullable(String))) AS String)`. [#6094](https://github.com/ClickHouse/ClickHouse/issues/6094) [#6119](https://github.com/ClickHouse/ClickHouse/pull/6119) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Removed extra quoting of description in `system.settings` table. 
[#6696](https://github.com/ClickHouse/ClickHouse/issues/6696) [#6699](https://github.com/ClickHouse/ClickHouse/pull/6699) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Avoid possible deadlock in `TRUNCATE` of Replicated table. [#6695](https://github.com/ClickHouse/ClickHouse/pull/6695) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fix reading in order of sorting key. [#6189](https://github.com/ClickHouse/ClickHouse/pull/6189) ([Anton Popov](https://github.com/CurtizJ))
+* Fix `ALTER TABLE ... UPDATE` query for tables with `enable_mixed_granularity_parts=1`. [#6543](https://github.com/ClickHouse/ClickHouse/pull/6543) ([alesapin](https://github.com/alesapin))
+* Fixed the case when server may close listening sockets but not shut down and continue serving remaining queries. You may end up with two running clickhouse-server processes. Sometimes, the server may return an error `bad_function_call` for remaining queries. [#6231](https://github.com/ClickHouse/ClickHouse/pull/6231) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fix bug opened by [#4405](https://github.com/ClickHouse/ClickHouse/pull/4405) (since 19.4.0). Reproduces in queries to Distributed tables over MergeTree tables when we don't query any columns (`SELECT 1`). [#6236](https://github.com/ClickHouse/ClickHouse/pull/6236) ([alesapin](https://github.com/alesapin))
+* Fixed overflow in integer division of signed type to unsigned type. The behaviour was exactly as in C or C++ language (integer promotion rules) that may be surprising. Please note that the overflow is still possible when dividing a large signed number by a large unsigned number or vice-versa (but that case is less usual). The issue existed in all server versions. [#6214](https://github.com/ClickHouse/ClickHouse/issues/6214) [#6233](https://github.com/ClickHouse/ClickHouse/pull/6233) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Limit maximum sleep time for throttling when `max_execution_speed` or `max_execution_speed_bytes` is set. Fixed false errors like `Estimated query execution time (inf seconds) is too long`. [#5547](https://github.com/ClickHouse/ClickHouse/issues/5547) [#6232](https://github.com/ClickHouse/ClickHouse/pull/6232) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fixed issues with using `MATERIALIZED` columns and aliases in `MaterializedView`. [#448](https://github.com/ClickHouse/ClickHouse/issues/448) [#3484](https://github.com/ClickHouse/ClickHouse/issues/3484) [#3450](https://github.com/ClickHouse/ClickHouse/issues/3450) [#2878](https://github.com/ClickHouse/ClickHouse/issues/2878) [#2285](https://github.com/ClickHouse/ClickHouse/issues/2285) [#3796](https://github.com/ClickHouse/ClickHouse/pull/3796) ([Amos Bird](https://github.com/amosbird)) [#6316](https://github.com/ClickHouse/ClickHouse/pull/6316) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fix `FormatFactory` behaviour for input streams which are not implemented as processors. [#6495](https://github.com/ClickHouse/ClickHouse/pull/6495) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
+* Fixed typo. [#6631](https://github.com/ClickHouse/ClickHouse/pull/6631) ([Alex Ryndin](https://github.com/alexryndin))
+* Typo in the error message ( is -> are ).
[#6839](https://github.com/ClickHouse/ClickHouse/pull/6839) ([Denis Zhuravlev](https://github.com/den-crane)) +* Fixed error while parsing of columns list from string if type contained a comma (this issue was relevant for `File`, `URL`, `HDFS` storages) [#6217](https://github.com/ClickHouse/ClickHouse/issues/6217). [#6209](https://github.com/ClickHouse/ClickHouse/pull/6209) ([dimarub2000](https://github.com/dimarub2000)) ### Security Fix * This release also contains all bug security fixes from 19.13 and 19.11. -* Fixed the possibility of a fabricated query to cause server crash due to stack overflow in SQL parser. Fixed the possibility of stack overflow in Merge and Distributed tables, materialized views and conditions for row-level security that involve subqueries. [#6433](https://github.com/yandex/ClickHouse/pull/6433) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed the possibility of a fabricated query to cause server crash due to stack overflow in SQL parser. Fixed the possibility of stack overflow in Merge and Distributed tables, materialized views and conditions for row-level security that involve subqueries. [#6433](https://github.com/ClickHouse/ClickHouse/pull/6433) ([alexey-milovidov](https://github.com/alexey-milovidov)) ### Improvement -* Correct implementation of ternary logic for `AND/OR`. [#6048](https://github.com/yandex/ClickHouse/pull/6048) ([Alexander Kazakov](https://github.com/Akazz)) -* Now values and rows with expired TTL will be removed after `OPTIMIZE ... FINAL` query from old parts without TTL infos or with outdated TTL infos, e.g. after `ALTER ... MODIFY TTL` query. Added queries `SYSTEM STOP/START TTL MERGES` to disallow/allow assign merges with TTL and filter expired values in all merges. [#6274](https://github.com/yandex/ClickHouse/pull/6274) ([Anton Popov](https://github.com/CurtizJ)) -* Possibility to change the location of ClickHouse history file for client using `CLICKHOUSE_HISTORY_FILE` env. [#6840](https://github.com/yandex/ClickHouse/pull/6840) ([filimonov](https://github.com/filimonov)) -* Remove `dry_run` flag from `InterpreterSelectQuery`. ... [#6375](https://github.com/yandex/ClickHouse/pull/6375) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Support `ASOF JOIN` with `ON` section. [#6211](https://github.com/yandex/ClickHouse/pull/6211) ([Artem Zuikov](https://github.com/4ertus2)) -* Better support of skip indexes for mutations and replication. Support for `MATERIALIZE/CLEAR INDEX ... IN PARTITION` query. `UPDATE x = x` recalculates all indices that use column `x`. [#5053](https://github.com/yandex/ClickHouse/pull/5053) ([Nikita Vasilev](https://github.com/nikvas0)) -* Allow to `ATTACH` live views (for example, at the server startup) regardless to `allow_experimental_live_view` setting. [#6754](https://github.com/yandex/ClickHouse/pull/6754) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* For stack traces gathered by query profiler, do not include stack frames generated by the query profiler itself. [#6250](https://github.com/yandex/ClickHouse/pull/6250) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Now table functions `values`, `file`, `url`, `hdfs` have support for ALIAS columns. [#6255](https://github.com/yandex/ClickHouse/pull/6255) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Throw an exception if `config.d` file doesn't have the corresponding root element as the config file. 
[#6123](https://github.com/yandex/ClickHouse/pull/6123) ([dimarub2000](https://github.com/dimarub2000)) -* Print extra info in exception message for `no space left on device`. [#6182](https://github.com/yandex/ClickHouse/issues/6182), [#6252](https://github.com/yandex/ClickHouse/issues/6252) [#6352](https://github.com/yandex/ClickHouse/pull/6352) ([tavplubix](https://github.com/tavplubix)) -* When determining shards of a `Distributed` table to be covered by a read query (for `optimize_skip_unused_shards` = 1) ClickHouse now checks conditions from both `prewhere` and `where` clauses of select statement. [#6521](https://github.com/yandex/ClickHouse/pull/6521) ([Alexander Kazakov](https://github.com/Akazz)) -* Enabled `SIMDJSON` for machines without AVX2 but with SSE 4.2 and PCLMUL instruction set. [#6285](https://github.com/yandex/ClickHouse/issues/6285) [#6320](https://github.com/yandex/ClickHouse/pull/6320) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* ClickHouse can work on filesystems without `O_DIRECT` support (such as ZFS and BtrFS) without additional tuning. [#4449](https://github.com/yandex/ClickHouse/issues/4449) [#6730](https://github.com/yandex/ClickHouse/pull/6730) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Support push down predicate for final subquery. [#6120](https://github.com/yandex/ClickHouse/pull/6120) ([TCeason](https://github.com/TCeason)) [#6162](https://github.com/yandex/ClickHouse/pull/6162) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Better `JOIN ON` keys extraction [#6131](https://github.com/yandex/ClickHouse/pull/6131) ([Artem Zuikov](https://github.com/4ertus2)) -* Upated `SIMDJSON`. [#6285](https://github.com/yandex/ClickHouse/issues/6285). [#6306](https://github.com/yandex/ClickHouse/pull/6306) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Optimize selecting of smallest column for `SELECT count()` query. [#6344](https://github.com/yandex/ClickHouse/pull/6344) ([Amos Bird](https://github.com/amosbird)) -* Added `strict` parameter in `windowFunnel()`. When the `strict` is set, the `windowFunnel()` applies conditions only for the unique values. [#6548](https://github.com/yandex/ClickHouse/pull/6548) ([achimbab](https://github.com/achimbab)) -* Safer interface of `mysqlxx::Pool`. [#6150](https://github.com/yandex/ClickHouse/pull/6150) ([avasiliev](https://github.com/avasiliev)) -* Options line size when executing with `--help` option now corresponds with terminal size. [#6590](https://github.com/yandex/ClickHouse/pull/6590) ([dimarub2000](https://github.com/dimarub2000)) -* Disable "read in order" optimization for aggregation without keys. [#6599](https://github.com/yandex/ClickHouse/pull/6599) ([Anton Popov](https://github.com/CurtizJ)) -* HTTP status code for `INCORRECT_DATA` and `TYPE_MISMATCH` error codes was changed from default `500 Internal Server Error` to `400 Bad Request`. [#6271](https://github.com/yandex/ClickHouse/pull/6271) ([Alexander Rodin](https://github.com/a-rodin)) -* Move Join object from `ExpressionAction` into `AnalyzedJoin`. `ExpressionAnalyzer` and `ExpressionAction` do not know about `Join` class anymore. Its logic is hidden by `AnalyzedJoin` iface. [#6801](https://github.com/yandex/ClickHouse/pull/6801) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed possible deadlock of distributed queries when one of shards is localhost but the query is sent via network connection. 
[#6759](https://github.com/yandex/ClickHouse/pull/6759) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Changed semantic of multiple tables `RENAME` to avoid possible deadlocks. [#6757](https://github.com/yandex/ClickHouse/issues/6757). [#6756](https://github.com/yandex/ClickHouse/pull/6756) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Rewritten MySQL compatibility server to prevent loading full packet payload in memory. Decreased memory consumption for each connection to approximately `2 * DBMS_DEFAULT_BUFFER_SIZE` (read/write buffers). [#5811](https://github.com/yandex/ClickHouse/pull/5811) ([Yuriy Baranov](https://github.com/yurriy)) -* Move AST alias interpreting logic out of parser that doesn't have to know anything about query semantics. [#6108](https://github.com/yandex/ClickHouse/pull/6108) ([Artem Zuikov](https://github.com/4ertus2)) -* Slightly more safe parsing of `NamesAndTypesList`. [#6408](https://github.com/yandex/ClickHouse/issues/6408). [#6410](https://github.com/yandex/ClickHouse/pull/6410) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `clickhouse-copier`: Allow use `where_condition` from config with `partition_key` alias in query for checking partition existence (Earlier it was used only in reading data queries). [#6577](https://github.com/yandex/ClickHouse/pull/6577) ([proller](https://github.com/proller)) -* Added optional message argument in `throwIf`. ([#5772](https://github.com/yandex/ClickHouse/issues/5772)) [#6329](https://github.com/yandex/ClickHouse/pull/6329) ([Vdimir](https://github.com/Vdimir)) -* Server exception got while sending insertion data is now being processed in client as well. [#5891](https://github.com/yandex/ClickHouse/issues/5891) [#6711](https://github.com/yandex/ClickHouse/pull/6711) ([dimarub2000](https://github.com/dimarub2000)) -* Added a metric `DistributedFilesToInsert` that shows the total number of files in filesystem that are selected to send to remote servers by Distributed tables. The number is summed across all shards. [#6600](https://github.com/yandex/ClickHouse/pull/6600) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Move most of JOINs prepare logic from `ExpressionAction/ExpressionAnalyzer` to `AnalyzedJoin`. [#6785](https://github.com/yandex/ClickHouse/pull/6785) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix TSan [warning](https://clickhouse-test-reports.s3.yandex.net/6399/c1c1d1daa98e199e620766f1bd06a5921050a00d/functional_stateful_tests_(thread).html) 'lock-order-inversion'. [#6740](https://github.com/yandex/ClickHouse/pull/6740) ([Vasily Nemkov](https://github.com/Enmk)) -* Better information messages about lack of Linux capabilities. Logging fatal errors with "fatal" level, that will make it easier to find in `system.text_log`. [#6441](https://github.com/yandex/ClickHouse/pull/6441) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* When enable dumping temporary data to the disk to restrict memory usage during `GROUP BY`, `ORDER BY`, it didn't check the free disk space. The fix add a new setting `min_free_disk_space`, when the free disk space it smaller then the threshold, the query will stop and throw `ErrorCodes::NOT_ENOUGH_SPACE`. [#6678](https://github.com/yandex/ClickHouse/pull/6678) ([Weiqing Xu](https://github.com/weiqxu)) [#6691](https://github.com/yandex/ClickHouse/pull/6691) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Removed recursive rwlock by thread. It makes no sense, because threads are reused between queries. 
`SELECT` query may acquire a lock in one thread, hold a lock from another thread and exit from first thread. In the same time, first thread can be reused by `DROP` query. This will lead to false "Attempt to acquire exclusive lock recursively" messages. [#6771](https://github.com/yandex/ClickHouse/pull/6771) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Split `ExpressionAnalyzer.appendJoin()`. Prepare a place in `ExpressionAnalyzer` for `MergeJoin`. [#6524](https://github.com/yandex/ClickHouse/pull/6524) ([Artem Zuikov](https://github.com/4ertus2)) -* Added `mysql_native_password` authentication plugin to MySQL compatibility server. [#6194](https://github.com/yandex/ClickHouse/pull/6194) ([Yuriy Baranov](https://github.com/yurriy)) -* Less number of `clock_gettime` calls; fixed ABI compatibility between debug/release in `Allocator` (insignificant issue). [#6197](https://github.com/yandex/ClickHouse/pull/6197) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Move `collectUsedColumns` from `ExpressionAnalyzer` to `SyntaxAnalyzer`. `SyntaxAnalyzer` makes `required_source_columns` itself now. [#6416](https://github.com/yandex/ClickHouse/pull/6416) ([Artem Zuikov](https://github.com/4ertus2)) -* Add setting `joined_subquery_requires_alias` to require aliases for subselects and table functions in `FROM` that more than one table is present (i.e. queries with JOINs). [#6733](https://github.com/yandex/ClickHouse/pull/6733) ([Artem Zuikov](https://github.com/4ertus2)) -* Extract `GetAggregatesVisitor` class from `ExpressionAnalyzer`. [#6458](https://github.com/yandex/ClickHouse/pull/6458) ([Artem Zuikov](https://github.com/4ertus2)) -* `system.query_log`: change data type of `type` column to `Enum`. [#6265](https://github.com/yandex/ClickHouse/pull/6265) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) -* Static linking of `sha256_password` authentication plugin. [#6512](https://github.com/yandex/ClickHouse/pull/6512) ([Yuriy Baranov](https://github.com/yurriy)) -* Avoid extra dependency for the setting `compile` to work. In previous versions, the user may get error like `cannot open crti.o`, `unable to find library -lc` etc. [#6309](https://github.com/yandex/ClickHouse/pull/6309) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* More validation of the input that may come from malicious replica. [#6303](https://github.com/yandex/ClickHouse/pull/6303) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Now `clickhouse-obfuscator` file is available in `clickhouse-client` package. In previous versions it was available as `clickhouse obfuscator` (with whitespace). [#5816](https://github.com/yandex/ClickHouse/issues/5816) [#6609](https://github.com/yandex/ClickHouse/pull/6609) ([dimarub2000](https://github.com/dimarub2000)) -* Fixed deadlock when we have at least two queries that read at least two tables in different order and another query that performs DDL operation on one of tables. Fixed another very rare deadlock. [#6764](https://github.com/yandex/ClickHouse/pull/6764) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added `os_thread_ids` column to `system.processes` and `system.query_log` for better debugging possibilities. [#6763](https://github.com/yandex/ClickHouse/pull/6763) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* A workaround for PHP mysqlnd extension bugs which occur when `sha256_password` is used as a default authentication plugin (described in [#6031](https://github.com/yandex/ClickHouse/issues/6031)). 
[#6113](https://github.com/yandex/ClickHouse/pull/6113) ([Yuriy Baranov](https://github.com/yurriy)) -* Remove unneeded place with changed nullability columns. [#6693](https://github.com/yandex/ClickHouse/pull/6693) ([Artem Zuikov](https://github.com/4ertus2)) -* Set default value of `queue_max_wait_ms` to zero, because current value (five seconds) makes no sense. There are rare circumstances when this settings has any use. Added settings `replace_running_query_max_wait_ms`, `kafka_max_wait_ms` and `connection_pool_max_wait_ms` for disambiguation. [#6692](https://github.com/yandex/ClickHouse/pull/6692) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Extract `SelectQueryExpressionAnalyzer` from `ExpressionAnalyzer`. Keep the last one for non-select queries. [#6499](https://github.com/yandex/ClickHouse/pull/6499) ([Artem Zuikov](https://github.com/4ertus2)) -* Removed duplicating input and output formats. [#6239](https://github.com/yandex/ClickHouse/pull/6239) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Allow user to override `poll_interval` and `idle_connection_timeout` settings on connection. [#6230](https://github.com/yandex/ClickHouse/pull/6230) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `MergeTree` now has an additional option `ttl_only_drop_parts` (disabled by default) to avoid partial pruning of parts, so that they dropped completely when all the rows in a part are expired. [#6191](https://github.com/yandex/ClickHouse/pull/6191) ([Sergi Vladykin](https://github.com/svladykin)) -* Type checks for set index functions. Throw exception if function got a wrong type. This fixes fuzz test with UBSan. [#6511](https://github.com/yandex/ClickHouse/pull/6511) ([Nikita Vasilev](https://github.com/nikvas0)) +* Correct implementation of ternary logic for `AND/OR`. [#6048](https://github.com/ClickHouse/ClickHouse/pull/6048) ([Alexander Kazakov](https://github.com/Akazz)) +* Now values and rows with expired TTL will be removed after `OPTIMIZE ... FINAL` query from old parts without TTL infos or with outdated TTL infos, e.g. after `ALTER ... MODIFY TTL` query. Added queries `SYSTEM STOP/START TTL MERGES` to disallow/allow assign merges with TTL and filter expired values in all merges. [#6274](https://github.com/ClickHouse/ClickHouse/pull/6274) ([Anton Popov](https://github.com/CurtizJ)) +* Possibility to change the location of ClickHouse history file for client using `CLICKHOUSE_HISTORY_FILE` env. [#6840](https://github.com/ClickHouse/ClickHouse/pull/6840) ([filimonov](https://github.com/filimonov)) +* Remove `dry_run` flag from `InterpreterSelectQuery`. ... [#6375](https://github.com/ClickHouse/ClickHouse/pull/6375) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Support `ASOF JOIN` with `ON` section. [#6211](https://github.com/ClickHouse/ClickHouse/pull/6211) ([Artem Zuikov](https://github.com/4ertus2)) +* Better support of skip indexes for mutations and replication. Support for `MATERIALIZE/CLEAR INDEX ... IN PARTITION` query. `UPDATE x = x` recalculates all indices that use column `x`. [#5053](https://github.com/ClickHouse/ClickHouse/pull/5053) ([Nikita Vasilev](https://github.com/nikvas0)) +* Allow to `ATTACH` live views (for example, at the server startup) regardless to `allow_experimental_live_view` setting. 
[#6754](https://github.com/ClickHouse/ClickHouse/pull/6754) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* For stack traces gathered by query profiler, do not include stack frames generated by the query profiler itself. [#6250](https://github.com/ClickHouse/ClickHouse/pull/6250) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Now table functions `values`, `file`, `url`, `hdfs` have support for ALIAS columns. [#6255](https://github.com/ClickHouse/ClickHouse/pull/6255) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Throw an exception if a `config.d` file doesn't have the same root element as the config file. [#6123](https://github.com/ClickHouse/ClickHouse/pull/6123) ([dimarub2000](https://github.com/dimarub2000))
+* Print extra info in the exception message for `no space left on device`. [#6182](https://github.com/ClickHouse/ClickHouse/issues/6182), [#6252](https://github.com/ClickHouse/ClickHouse/issues/6252) [#6352](https://github.com/ClickHouse/ClickHouse/pull/6352) ([tavplubix](https://github.com/tavplubix))
+* When determining shards of a `Distributed` table to be covered by a read query (for `optimize_skip_unused_shards` = 1) ClickHouse now checks conditions from both the `prewhere` and `where` clauses of the select statement. [#6521](https://github.com/ClickHouse/ClickHouse/pull/6521) ([Alexander Kazakov](https://github.com/Akazz))
+* Enabled `SIMDJSON` for machines without AVX2 but with SSE 4.2 and PCLMUL instruction set. [#6285](https://github.com/ClickHouse/ClickHouse/issues/6285) [#6320](https://github.com/ClickHouse/ClickHouse/pull/6320) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* ClickHouse can work on filesystems without `O_DIRECT` support (such as ZFS and BtrFS) without additional tuning. [#4449](https://github.com/ClickHouse/ClickHouse/issues/4449) [#6730](https://github.com/ClickHouse/ClickHouse/pull/6730) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Support predicate push down for the final subquery. [#6120](https://github.com/ClickHouse/ClickHouse/pull/6120) ([TCeason](https://github.com/TCeason)) [#6162](https://github.com/ClickHouse/ClickHouse/pull/6162) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Better `JOIN ON` keys extraction. [#6131](https://github.com/ClickHouse/ClickHouse/pull/6131) ([Artem Zuikov](https://github.com/4ertus2))
+* Updated `SIMDJSON`. [#6285](https://github.com/ClickHouse/ClickHouse/issues/6285). [#6306](https://github.com/ClickHouse/ClickHouse/pull/6306) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Optimize selection of the smallest column for a `SELECT count()` query. [#6344](https://github.com/ClickHouse/ClickHouse/pull/6344) ([Amos Bird](https://github.com/amosbird))
+* Added `strict` parameter in `windowFunnel()`. When `strict` is set, `windowFunnel()` applies conditions only to unique values. [#6548](https://github.com/ClickHouse/ClickHouse/pull/6548) ([achimbab](https://github.com/achimbab))
+* Safer interface of `mysqlxx::Pool`. [#6150](https://github.com/ClickHouse/ClickHouse/pull/6150) ([avasiliev](https://github.com/avasiliev))
+* The width of option lines printed with the `--help` option now corresponds to the terminal size. [#6590](https://github.com/ClickHouse/ClickHouse/pull/6590) ([dimarub2000](https://github.com/dimarub2000))
+* Disable "read in order" optimization for aggregation without keys. 
[#6599](https://github.com/ClickHouse/ClickHouse/pull/6599) ([Anton Popov](https://github.com/CurtizJ)) +* HTTP status code for `INCORRECT_DATA` and `TYPE_MISMATCH` error codes was changed from default `500 Internal Server Error` to `400 Bad Request`. [#6271](https://github.com/ClickHouse/ClickHouse/pull/6271) ([Alexander Rodin](https://github.com/a-rodin)) +* Move Join object from `ExpressionAction` into `AnalyzedJoin`. `ExpressionAnalyzer` and `ExpressionAction` do not know about `Join` class anymore. Its logic is hidden by `AnalyzedJoin` iface. [#6801](https://github.com/ClickHouse/ClickHouse/pull/6801) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed possible deadlock of distributed queries when one of shards is localhost but the query is sent via network connection. [#6759](https://github.com/ClickHouse/ClickHouse/pull/6759) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Changed semantic of multiple tables `RENAME` to avoid possible deadlocks. [#6757](https://github.com/ClickHouse/ClickHouse/issues/6757). [#6756](https://github.com/ClickHouse/ClickHouse/pull/6756) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Rewritten MySQL compatibility server to prevent loading full packet payload in memory. Decreased memory consumption for each connection to approximately `2 * DBMS_DEFAULT_BUFFER_SIZE` (read/write buffers). [#5811](https://github.com/ClickHouse/ClickHouse/pull/5811) ([Yuriy Baranov](https://github.com/yurriy)) +* Move AST alias interpreting logic out of parser that doesn't have to know anything about query semantics. [#6108](https://github.com/ClickHouse/ClickHouse/pull/6108) ([Artem Zuikov](https://github.com/4ertus2)) +* Slightly more safe parsing of `NamesAndTypesList`. [#6408](https://github.com/ClickHouse/ClickHouse/issues/6408). [#6410](https://github.com/ClickHouse/ClickHouse/pull/6410) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* `clickhouse-copier`: Allow use `where_condition` from config with `partition_key` alias in query for checking partition existence (Earlier it was used only in reading data queries). [#6577](https://github.com/ClickHouse/ClickHouse/pull/6577) ([proller](https://github.com/proller)) +* Added optional message argument in `throwIf`. ([#5772](https://github.com/ClickHouse/ClickHouse/issues/5772)) [#6329](https://github.com/ClickHouse/ClickHouse/pull/6329) ([Vdimir](https://github.com/Vdimir)) +* Server exception got while sending insertion data is now being processed in client as well. [#5891](https://github.com/ClickHouse/ClickHouse/issues/5891) [#6711](https://github.com/ClickHouse/ClickHouse/pull/6711) ([dimarub2000](https://github.com/dimarub2000)) +* Added a metric `DistributedFilesToInsert` that shows the total number of files in filesystem that are selected to send to remote servers by Distributed tables. The number is summed across all shards. [#6600](https://github.com/ClickHouse/ClickHouse/pull/6600) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Move most of JOINs prepare logic from `ExpressionAction/ExpressionAnalyzer` to `AnalyzedJoin`. [#6785](https://github.com/ClickHouse/ClickHouse/pull/6785) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix TSan [warning](https://clickhouse-test-reports.s3.yandex.net/6399/c1c1d1daa98e199e620766f1bd06a5921050a00d/functional_stateful_tests_(thread).html) 'lock-order-inversion'. 
[#6740](https://github.com/ClickHouse/ClickHouse/pull/6740) ([Vasily Nemkov](https://github.com/Enmk))
+* Better information messages about lack of Linux capabilities. Logging fatal errors with "fatal" level, which will make them easier to find in `system.text_log`. [#6441](https://github.com/ClickHouse/ClickHouse/pull/6441) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* When dumping of temporary data to the disk was enabled to restrict memory usage during `GROUP BY`, `ORDER BY`, the free disk space was not checked. The fix adds a new setting `min_free_disk_space`: when the free disk space is smaller than the threshold, the query will stop and throw `ErrorCodes::NOT_ENOUGH_SPACE`. [#6678](https://github.com/ClickHouse/ClickHouse/pull/6678) ([Weiqing Xu](https://github.com/weiqxu)) [#6691](https://github.com/ClickHouse/ClickHouse/pull/6691) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Removed recursive rwlock by thread. It makes no sense, because threads are reused between queries. A `SELECT` query may acquire a lock in one thread, hold a lock from another thread and exit from the first thread. At the same time, the first thread can be reused by a `DROP` query. This leads to false "Attempt to acquire exclusive lock recursively" messages. [#6771](https://github.com/ClickHouse/ClickHouse/pull/6771) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Split `ExpressionAnalyzer.appendJoin()`. Prepare a place in `ExpressionAnalyzer` for `MergeJoin`. [#6524](https://github.com/ClickHouse/ClickHouse/pull/6524) ([Artem Zuikov](https://github.com/4ertus2))
+* Added `mysql_native_password` authentication plugin to the MySQL compatibility server. [#6194](https://github.com/ClickHouse/ClickHouse/pull/6194) ([Yuriy Baranov](https://github.com/yurriy))
+* Fewer `clock_gettime` calls; fixed ABI compatibility between debug/release in `Allocator` (insignificant issue). [#6197](https://github.com/ClickHouse/ClickHouse/pull/6197) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Move `collectUsedColumns` from `ExpressionAnalyzer` to `SyntaxAnalyzer`. `SyntaxAnalyzer` makes `required_source_columns` itself now. [#6416](https://github.com/ClickHouse/ClickHouse/pull/6416) ([Artem Zuikov](https://github.com/4ertus2))
+* Add setting `joined_subquery_requires_alias` to require aliases for subselects and table functions in `FROM` when more than one table is present (i.e. queries with JOINs). [#6733](https://github.com/ClickHouse/ClickHouse/pull/6733) ([Artem Zuikov](https://github.com/4ertus2))
+* Extract `GetAggregatesVisitor` class from `ExpressionAnalyzer`. [#6458](https://github.com/ClickHouse/ClickHouse/pull/6458) ([Artem Zuikov](https://github.com/4ertus2))
+* `system.query_log`: change data type of `type` column to `Enum`. [#6265](https://github.com/ClickHouse/ClickHouse/pull/6265) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov))
+* Static linking of `sha256_password` authentication plugin. [#6512](https://github.com/ClickHouse/ClickHouse/pull/6512) ([Yuriy Baranov](https://github.com/yurriy))
+* Avoid extra dependency for the setting `compile` to work. In previous versions, the user may get errors like `cannot open crti.o`, `unable to find library -lc`, etc. [#6309](https://github.com/ClickHouse/ClickHouse/pull/6309) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* More validation of the input that may come from a malicious replica. 
[#6303](https://github.com/ClickHouse/ClickHouse/pull/6303) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Now the `clickhouse-obfuscator` file is available in the `clickhouse-client` package. In previous versions it was available as `clickhouse obfuscator` (with whitespace). [#5816](https://github.com/ClickHouse/ClickHouse/issues/5816) [#6609](https://github.com/ClickHouse/ClickHouse/pull/6609) ([dimarub2000](https://github.com/dimarub2000))
+* Fixed deadlock when we have at least two queries that read at least two tables in a different order and another query that performs a DDL operation on one of the tables. Fixed another very rare deadlock. [#6764](https://github.com/ClickHouse/ClickHouse/pull/6764) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Added `os_thread_ids` column to `system.processes` and `system.query_log` for better debugging possibilities. [#6763](https://github.com/ClickHouse/ClickHouse/pull/6763) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* A workaround for PHP mysqlnd extension bugs which occur when `sha256_password` is used as a default authentication plugin (described in [#6031](https://github.com/ClickHouse/ClickHouse/issues/6031)). [#6113](https://github.com/ClickHouse/ClickHouse/pull/6113) ([Yuriy Baranov](https://github.com/yurriy))
+* Remove unneeded place with changed nullability columns. [#6693](https://github.com/ClickHouse/ClickHouse/pull/6693) ([Artem Zuikov](https://github.com/4ertus2))
+* Set default value of `queue_max_wait_ms` to zero, because the current value (five seconds) makes no sense. There are rare circumstances when this setting has any use. Added settings `replace_running_query_max_wait_ms`, `kafka_max_wait_ms` and `connection_pool_max_wait_ms` for disambiguation. [#6692](https://github.com/ClickHouse/ClickHouse/pull/6692) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Extract `SelectQueryExpressionAnalyzer` from `ExpressionAnalyzer`. Keep the last one for non-select queries. [#6499](https://github.com/ClickHouse/ClickHouse/pull/6499) ([Artem Zuikov](https://github.com/4ertus2))
+* Removed duplicating input and output formats. [#6239](https://github.com/ClickHouse/ClickHouse/pull/6239) ([Nikolai Kochetov](https://github.com/KochetovNicolai))
+* Allow users to override the `poll_interval` and `idle_connection_timeout` settings on connection. [#6230](https://github.com/ClickHouse/ClickHouse/pull/6230) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* `MergeTree` now has an additional option `ttl_only_drop_parts` (disabled by default) to avoid partial pruning of parts, so that they are dropped completely when all the rows in a part are expired. [#6191](https://github.com/ClickHouse/ClickHouse/pull/6191) ([Sergi Vladykin](https://github.com/svladykin))
+* Type checks for set index functions. Throw an exception if a function got a wrong type. This fixes the fuzz test with UBSan. [#6511](https://github.com/ClickHouse/ClickHouse/pull/6511) ([Nikita Vasilev](https://github.com/nikvas0))

### Performance Improvement
-* Optimize queries with `ORDER BY expressions` clause, where `expressions` have coinciding prefix with sorting key in `MergeTree` tables. This optimization is controlled by `optimize_read_in_order` setting. [#6054](https://github.com/yandex/ClickHouse/pull/6054) [#6629](https://github.com/yandex/ClickHouse/pull/6629) ([Anton Popov](https://github.com/CurtizJ))
-* Allow to use multiple threads during parts loading and removal. 
[#6372](https://github.com/yandex/ClickHouse/issues/6372) [#6074](https://github.com/yandex/ClickHouse/issues/6074) [#6438](https://github.com/yandex/ClickHouse/pull/6438) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Implemented batch variant of updating aggregate function states. It may lead to performance benefits. [#6435](https://github.com/yandex/ClickHouse/pull/6435) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Using `FastOps` library for functions `exp`, `log`, `sigmoid`, `tanh`. FastOps is a fast vector math library from Michael Parakhin (Yandex CTO). Improved performance of `exp` and `log` functions more than 6 times. The functions `exp` and `log` from `Float32` argument will return `Float32` (in previous versions they always return `Float64`). Now `exp(nan)` may return `inf`. The result of `exp` and `log` functions may be not the nearest machine representable number to the true answer. [#6254](https://github.com/yandex/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov)) Using Danila Kutenin variant to make fastops working [#6317](https://github.com/yandex/ClickHouse/pull/6317) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Disable consecutive key optimization for `UInt8/16`. [#6298](https://github.com/yandex/ClickHouse/pull/6298) [#6701](https://github.com/yandex/ClickHouse/pull/6701) ([akuzm](https://github.com/akuzm)) -* Improved performance of `simdjson` library by getting rid of dynamic allocation in `ParsedJson::Iterator`. [#6479](https://github.com/yandex/ClickHouse/pull/6479) ([Vitaly Baranov](https://github.com/vitlibar)) -* Pre-fault pages when allocating memory with `mmap()`. [#6667](https://github.com/yandex/ClickHouse/pull/6667) ([akuzm](https://github.com/akuzm)) -* Fix performance bug in `Decimal` comparison. [#6380](https://github.com/yandex/ClickHouse/pull/6380) ([Artem Zuikov](https://github.com/4ertus2)) +* Optimize queries with `ORDER BY expressions` clause, where `expressions` have coinciding prefix with sorting key in `MergeTree` tables. This optimization is controlled by `optimize_read_in_order` setting. [#6054](https://github.com/ClickHouse/ClickHouse/pull/6054) [#6629](https://github.com/ClickHouse/ClickHouse/pull/6629) ([Anton Popov](https://github.com/CurtizJ)) +* Allow to use multiple threads during parts loading and removal. [#6372](https://github.com/ClickHouse/ClickHouse/issues/6372) [#6074](https://github.com/ClickHouse/ClickHouse/issues/6074) [#6438](https://github.com/ClickHouse/ClickHouse/pull/6438) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Implemented batch variant of updating aggregate function states. It may lead to performance benefits. [#6435](https://github.com/ClickHouse/ClickHouse/pull/6435) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Using `FastOps` library for functions `exp`, `log`, `sigmoid`, `tanh`. FastOps is a fast vector math library from Michael Parakhin (Yandex CTO). Improved performance of `exp` and `log` functions more than 6 times. The functions `exp` and `log` from `Float32` argument will return `Float32` (in previous versions they always return `Float64`). Now `exp(nan)` may return `inf`. The result of `exp` and `log` functions may be not the nearest machine representable number to the true answer. 
[#6254](https://github.com/ClickHouse/ClickHouse/pull/6254) ([alexey-milovidov](https://github.com/alexey-milovidov)) Using Danila Kutenin variant to make fastops working [#6317](https://github.com/ClickHouse/ClickHouse/pull/6317) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Disable consecutive key optimization for `UInt8/16`. [#6298](https://github.com/ClickHouse/ClickHouse/pull/6298) [#6701](https://github.com/ClickHouse/ClickHouse/pull/6701) ([akuzm](https://github.com/akuzm)) +* Improved performance of `simdjson` library by getting rid of dynamic allocation in `ParsedJson::Iterator`. [#6479](https://github.com/ClickHouse/ClickHouse/pull/6479) ([Vitaly Baranov](https://github.com/vitlibar)) +* Pre-fault pages when allocating memory with `mmap()`. [#6667](https://github.com/ClickHouse/ClickHouse/pull/6667) ([akuzm](https://github.com/akuzm)) +* Fix performance bug in `Decimal` comparison. [#6380](https://github.com/ClickHouse/ClickHouse/pull/6380) ([Artem Zuikov](https://github.com/4ertus2)) ### Build/Testing/Packaging Improvement -* Remove Compiler (runtime template instantiation) because we've win over it's performance. [#6646](https://github.com/yandex/ClickHouse/pull/6646) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added performance test to show degradation of performance in gcc-9 in more isolated way. [#6302](https://github.com/yandex/ClickHouse/pull/6302) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added table function `numbers_mt`, which is multithreaded version of `numbers`. Updated performance tests with hash functions. [#6554](https://github.com/yandex/ClickHouse/pull/6554) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Comparison mode in `clickhouse-benchmark` [#6220](https://github.com/yandex/ClickHouse/issues/6220) [#6343](https://github.com/yandex/ClickHouse/pull/6343) ([dimarub2000](https://github.com/dimarub2000)) -* Best effort for printing stack traces. Also added `SIGPROF` as a debugging signal to print stack trace of a running thread. [#6529](https://github.com/yandex/ClickHouse/pull/6529) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Every function in its own file, part 10. [#6321](https://github.com/yandex/ClickHouse/pull/6321) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Remove doubled const `TABLE_IS_READ_ONLY`. [#6566](https://github.com/yandex/ClickHouse/pull/6566) ([filimonov](https://github.com/filimonov)) -* Formatting changes for `StringHashMap` PR [#5417](https://github.com/yandex/ClickHouse/issues/5417). [#6700](https://github.com/yandex/ClickHouse/pull/6700) ([akuzm](https://github.com/akuzm)) -* Better subquery for join creation in `ExpressionAnalyzer`. [#6824](https://github.com/yandex/ClickHouse/pull/6824) ([Artem Zuikov](https://github.com/4ertus2)) -* Remove a redundant condition (found by PVS Studio). [#6775](https://github.com/yandex/ClickHouse/pull/6775) ([akuzm](https://github.com/akuzm)) -* Separate the hash table interface for `ReverseIndex`. [#6672](https://github.com/yandex/ClickHouse/pull/6672) ([akuzm](https://github.com/akuzm)) -* Refactoring of settings. [#6689](https://github.com/yandex/ClickHouse/pull/6689) ([alesapin](https://github.com/alesapin)) -* Add comments for `set` index functions. [#6319](https://github.com/yandex/ClickHouse/pull/6319) ([Nikita Vasilev](https://github.com/nikvas0)) -* Increase OOM score in debug version on Linux. 
[#6152](https://github.com/yandex/ClickHouse/pull/6152) ([akuzm](https://github.com/akuzm)) -* HDFS HA now work in debug build. [#6650](https://github.com/yandex/ClickHouse/pull/6650) ([Weiqing Xu](https://github.com/weiqxu)) -* Added a test to `transform_query_for_external_database`. [#6388](https://github.com/yandex/ClickHouse/pull/6388) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add test for multiple materialized views for Kafka table. [#6509](https://github.com/yandex/ClickHouse/pull/6509) ([Ivan](https://github.com/abyss7)) -* Make a better build scheme. [#6500](https://github.com/yandex/ClickHouse/pull/6500) ([Ivan](https://github.com/abyss7)) -* Fixed `test_external_dictionaries` integration in case it was executed under non root user. [#6507](https://github.com/yandex/ClickHouse/pull/6507) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* The bug reproduces when total size of written packets exceeds `DBMS_DEFAULT_BUFFER_SIZE`. [#6204](https://github.com/yandex/ClickHouse/pull/6204) ([Yuriy Baranov](https://github.com/yurriy)) -* Added a test for `RENAME` table race condition [#6752](https://github.com/yandex/ClickHouse/pull/6752) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Avoid data race on Settings in `KILL QUERY`. [#6753](https://github.com/yandex/ClickHouse/pull/6753) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add integration test for handling errors by a cache dictionary. [#6755](https://github.com/yandex/ClickHouse/pull/6755) ([Vitaly Baranov](https://github.com/vitlibar)) -* Move `input_format_defaults_for_omitted_fields` to incompatible changes [#6573](https://github.com/yandex/ClickHouse/pull/6573) ([Artem Zuikov](https://github.com/4ertus2)) -* Disable parsing of ELF object files on Mac OS, because it makes no sense. [#6578](https://github.com/yandex/ClickHouse/pull/6578) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Attempt to make changelog generator better. [#6327](https://github.com/yandex/ClickHouse/pull/6327) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Adding `-Wshadow` switch to the GCC. [#6325](https://github.com/yandex/ClickHouse/pull/6325) ([kreuzerkrieg](https://github.com/kreuzerkrieg)) -* Removed obsolete code for `mimalloc` support. [#6715](https://github.com/yandex/ClickHouse/pull/6715) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `zlib-ng` determines x86 capabilities and saves this info to global variables. This is done in defalteInit call, which may be made by different threads simultaneously. To avoid multithreaded writes, do it on library startup. [#6141](https://github.com/yandex/ClickHouse/pull/6141) ([akuzm](https://github.com/akuzm)) -* Regression test for a bug which in join which was fixed in [#5192](https://github.com/yandex/ClickHouse/issues/5192). [#6147](https://github.com/yandex/ClickHouse/pull/6147) ([Bakhtiyor Ruziev](https://github.com/theruziev)) -* Fixed MSan report. [#6144](https://github.com/yandex/ClickHouse/pull/6144) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix flapping TTL test. [#6782](https://github.com/yandex/ClickHouse/pull/6782) ([Anton Popov](https://github.com/CurtizJ)) -* Fixed false data race in `MergeTreeDataPart::is_frozen` field. [#6583](https://github.com/yandex/ClickHouse/pull/6583) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed timeouts in fuzz test. In previous version, it managed to find false hangup in query `SELECT * FROM numbers_mt(gccMurmurHash(''))`. 
[#6582](https://github.com/yandex/ClickHouse/pull/6582) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added debug checks to `static_cast` of columns. [#6581](https://github.com/yandex/ClickHouse/pull/6581) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Support for Oracle Linux in official RPM packages. [#6356](https://github.com/yandex/ClickHouse/issues/6356) [#6585](https://github.com/yandex/ClickHouse/pull/6585) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Changed json perftests from `once` to `loop` type. [#6536](https://github.com/yandex/ClickHouse/pull/6536) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* `odbc-bridge.cpp` defines `main()` so it should not be included in `clickhouse-lib`. [#6538](https://github.com/yandex/ClickHouse/pull/6538) ([Orivej Desh](https://github.com/orivej)) -* Test for crash in `FULL|RIGHT JOIN` with nulls in right table's keys. [#6362](https://github.com/yandex/ClickHouse/pull/6362) ([Artem Zuikov](https://github.com/4ertus2)) -* Added a test for the limit on expansion of aliases just in case. [#6442](https://github.com/yandex/ClickHouse/pull/6442) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Switched from `boost::filesystem` to `std::filesystem` where appropriate. [#6253](https://github.com/yandex/ClickHouse/pull/6253) [#6385](https://github.com/yandex/ClickHouse/pull/6385) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added RPM packages to website. [#6251](https://github.com/yandex/ClickHouse/pull/6251) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add a test for fixed `Unknown identifier` exception in `IN` section. [#6708](https://github.com/yandex/ClickHouse/pull/6708) ([Artem Zuikov](https://github.com/4ertus2)) -* Simplify `shared_ptr_helper` because people facing difficulties understanding it. [#6675](https://github.com/yandex/ClickHouse/pull/6675) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added performance tests for fixed Gorilla and DoubleDelta codec. [#6179](https://github.com/yandex/ClickHouse/pull/6179) ([Vasily Nemkov](https://github.com/Enmk)) -* Split the integration test `test_dictionaries` into 4 separate tests. [#6776](https://github.com/yandex/ClickHouse/pull/6776) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix PVS-Studio warning in `PipelineExecutor`. [#6777](https://github.com/yandex/ClickHouse/pull/6777) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Allow to use `library` dictionary source with ASan. [#6482](https://github.com/yandex/ClickHouse/pull/6482) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added option to generate changelog from a list of PRs. [#6350](https://github.com/yandex/ClickHouse/pull/6350) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Lock the `TinyLog` storage when reading. [#6226](https://github.com/yandex/ClickHouse/pull/6226) ([akuzm](https://github.com/akuzm)) -* Check for broken symlinks in CI. [#6634](https://github.com/yandex/ClickHouse/pull/6634) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Increase timeout for "stack overflow" test because it may take a long time in debug build. [#6637](https://github.com/yandex/ClickHouse/pull/6637) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added a check for double whitespaces. [#6643](https://github.com/yandex/ClickHouse/pull/6643) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix `new/delete` memory tracking when build with sanitizers. 
Tracking is not clear. It only prevents memory limit exceptions in tests. [#6450](https://github.com/yandex/ClickHouse/pull/6450) ([Artem Zuikov](https://github.com/4ertus2)) -* Enable back the check of undefined symbols while linking. [#6453](https://github.com/yandex/ClickHouse/pull/6453) ([Ivan](https://github.com/abyss7)) -* Avoid rebuilding `hyperscan` every day. [#6307](https://github.com/yandex/ClickHouse/pull/6307) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed UBSan report in `ProtobufWriter`. [#6163](https://github.com/yandex/ClickHouse/pull/6163) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Don't allow to use query profiler with sanitizers because it is not compatible. [#6769](https://github.com/yandex/ClickHouse/pull/6769) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add test for reloading a dictionary after fail by timer. [#6114](https://github.com/yandex/ClickHouse/pull/6114) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix inconsistency in `PipelineExecutor::prepareProcessor` argument type. [#6494](https://github.com/yandex/ClickHouse/pull/6494) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Added a test for bad URIs. [#6493](https://github.com/yandex/ClickHouse/pull/6493) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added more checks to `CAST` function. This should get more information about segmentation fault in fuzzy test. [#6346](https://github.com/yandex/ClickHouse/pull/6346) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Added `gcc-9` support to `docker/builder` container that builds image locally. [#6333](https://github.com/yandex/ClickHouse/pull/6333) ([Gleb Novikov](https://github.com/NanoBjorn)) -* Test for primary key with `LowCardinality(String)`. [#5044](https://github.com/yandex/ClickHouse/issues/5044) [#6219](https://github.com/yandex/ClickHouse/pull/6219) ([dimarub2000](https://github.com/dimarub2000)) -* Fixed tests affected by slow stack traces printing. [#6315](https://github.com/yandex/ClickHouse/pull/6315) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add a test case for crash in `groupUniqArray` fixed in [#6029](https://github.com/yandex/ClickHouse/pull/6029). [#4402](https://github.com/yandex/ClickHouse/issues/4402) [#6129](https://github.com/yandex/ClickHouse/pull/6129) ([akuzm](https://github.com/akuzm)) -* Fixed indices mutations tests. [#6645](https://github.com/yandex/ClickHouse/pull/6645) ([Nikita Vasilev](https://github.com/nikvas0)) -* In performance test, do not read query log for queries we didn't run. [#6427](https://github.com/yandex/ClickHouse/pull/6427) ([akuzm](https://github.com/akuzm)) -* Materialized view now could be created with any low cardinality types regardless to the setting about suspicious low cardinality types. [#6428](https://github.com/yandex/ClickHouse/pull/6428) ([Olga Khvostikova](https://github.com/stavrolia)) -* Updated tests for `send_logs_level` setting. [#6207](https://github.com/yandex/ClickHouse/pull/6207) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix build under gcc-8.2. [#6196](https://github.com/yandex/ClickHouse/pull/6196) ([Max Akhmedov](https://github.com/zlobober)) -* Fix build with internal libc++. [#6724](https://github.com/yandex/ClickHouse/pull/6724) ([Ivan](https://github.com/abyss7)) -* Fix shared build with `rdkafka` library [#6101](https://github.com/yandex/ClickHouse/pull/6101) ([Ivan](https://github.com/abyss7)) -* Fixes for Mac OS build (incomplete). 
[#6390](https://github.com/yandex/ClickHouse/pull/6390) ([alexey-milovidov](https://github.com/alexey-milovidov)) [#6429](https://github.com/yandex/ClickHouse/pull/6429) ([alex-zaitsev](https://github.com/alex-zaitsev)) -* Fix "splitted" build. [#6618](https://github.com/yandex/ClickHouse/pull/6618) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Other build fixes: [#6186](https://github.com/yandex/ClickHouse/pull/6186) ([Amos Bird](https://github.com/amosbird)) [#6486](https://github.com/yandex/ClickHouse/pull/6486) [#6348](https://github.com/yandex/ClickHouse/pull/6348) ([vxider](https://github.com/Vxider)) [#6744](https://github.com/yandex/ClickHouse/pull/6744) ([Ivan](https://github.com/abyss7)) [#6016](https://github.com/yandex/ClickHouse/pull/6016) [#6421](https://github.com/yandex/ClickHouse/pull/6421) [#6491](https://github.com/yandex/ClickHouse/pull/6491) ([proller](https://github.com/proller)) +* Remove Compiler (runtime template instantiation) because we've win over it's performance. [#6646](https://github.com/ClickHouse/ClickHouse/pull/6646) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added performance test to show degradation of performance in gcc-9 in more isolated way. [#6302](https://github.com/ClickHouse/ClickHouse/pull/6302) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added table function `numbers_mt`, which is multithreaded version of `numbers`. Updated performance tests with hash functions. [#6554](https://github.com/ClickHouse/ClickHouse/pull/6554) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Comparison mode in `clickhouse-benchmark` [#6220](https://github.com/ClickHouse/ClickHouse/issues/6220) [#6343](https://github.com/ClickHouse/ClickHouse/pull/6343) ([dimarub2000](https://github.com/dimarub2000)) +* Best effort for printing stack traces. Also added `SIGPROF` as a debugging signal to print stack trace of a running thread. [#6529](https://github.com/ClickHouse/ClickHouse/pull/6529) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Every function in its own file, part 10. [#6321](https://github.com/ClickHouse/ClickHouse/pull/6321) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Remove doubled const `TABLE_IS_READ_ONLY`. [#6566](https://github.com/ClickHouse/ClickHouse/pull/6566) ([filimonov](https://github.com/filimonov)) +* Formatting changes for `StringHashMap` PR [#5417](https://github.com/ClickHouse/ClickHouse/issues/5417). [#6700](https://github.com/ClickHouse/ClickHouse/pull/6700) ([akuzm](https://github.com/akuzm)) +* Better subquery for join creation in `ExpressionAnalyzer`. [#6824](https://github.com/ClickHouse/ClickHouse/pull/6824) ([Artem Zuikov](https://github.com/4ertus2)) +* Remove a redundant condition (found by PVS Studio). [#6775](https://github.com/ClickHouse/ClickHouse/pull/6775) ([akuzm](https://github.com/akuzm)) +* Separate the hash table interface for `ReverseIndex`. [#6672](https://github.com/ClickHouse/ClickHouse/pull/6672) ([akuzm](https://github.com/akuzm)) +* Refactoring of settings. [#6689](https://github.com/ClickHouse/ClickHouse/pull/6689) ([alesapin](https://github.com/alesapin)) +* Add comments for `set` index functions. [#6319](https://github.com/ClickHouse/ClickHouse/pull/6319) ([Nikita Vasilev](https://github.com/nikvas0)) +* Increase OOM score in debug version on Linux. [#6152](https://github.com/ClickHouse/ClickHouse/pull/6152) ([akuzm](https://github.com/akuzm)) +* HDFS HA now work in debug build. 
[#6650](https://github.com/ClickHouse/ClickHouse/pull/6650) ([Weiqing Xu](https://github.com/weiqxu)) +* Added a test to `transform_query_for_external_database`. [#6388](https://github.com/ClickHouse/ClickHouse/pull/6388) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Add test for multiple materialized views for Kafka table. [#6509](https://github.com/ClickHouse/ClickHouse/pull/6509) ([Ivan](https://github.com/abyss7)) +* Make a better build scheme. [#6500](https://github.com/ClickHouse/ClickHouse/pull/6500) ([Ivan](https://github.com/abyss7)) +* Fixed `test_external_dictionaries` integration in case it was executed under non root user. [#6507](https://github.com/ClickHouse/ClickHouse/pull/6507) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* The bug reproduces when total size of written packets exceeds `DBMS_DEFAULT_BUFFER_SIZE`. [#6204](https://github.com/ClickHouse/ClickHouse/pull/6204) ([Yuriy Baranov](https://github.com/yurriy)) +* Added a test for `RENAME` table race condition [#6752](https://github.com/ClickHouse/ClickHouse/pull/6752) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Avoid data race on Settings in `KILL QUERY`. [#6753](https://github.com/ClickHouse/ClickHouse/pull/6753) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Add integration test for handling errors by a cache dictionary. [#6755](https://github.com/ClickHouse/ClickHouse/pull/6755) ([Vitaly Baranov](https://github.com/vitlibar)) +* Move `input_format_defaults_for_omitted_fields` to incompatible changes [#6573](https://github.com/ClickHouse/ClickHouse/pull/6573) ([Artem Zuikov](https://github.com/4ertus2)) +* Disable parsing of ELF object files on Mac OS, because it makes no sense. [#6578](https://github.com/ClickHouse/ClickHouse/pull/6578) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Attempt to make changelog generator better. [#6327](https://github.com/ClickHouse/ClickHouse/pull/6327) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Adding `-Wshadow` switch to the GCC. [#6325](https://github.com/ClickHouse/ClickHouse/pull/6325) ([kreuzerkrieg](https://github.com/kreuzerkrieg)) +* Removed obsolete code for `mimalloc` support. [#6715](https://github.com/ClickHouse/ClickHouse/pull/6715) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* `zlib-ng` determines x86 capabilities and saves this info to global variables. This is done in defalteInit call, which may be made by different threads simultaneously. To avoid multithreaded writes, do it on library startup. [#6141](https://github.com/ClickHouse/ClickHouse/pull/6141) ([akuzm](https://github.com/akuzm)) +* Regression test for a bug which in join which was fixed in [#5192](https://github.com/ClickHouse/ClickHouse/issues/5192). [#6147](https://github.com/ClickHouse/ClickHouse/pull/6147) ([Bakhtiyor Ruziev](https://github.com/theruziev)) +* Fixed MSan report. [#6144](https://github.com/ClickHouse/ClickHouse/pull/6144) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix flapping TTL test. [#6782](https://github.com/ClickHouse/ClickHouse/pull/6782) ([Anton Popov](https://github.com/CurtizJ)) +* Fixed false data race in `MergeTreeDataPart::is_frozen` field. [#6583](https://github.com/ClickHouse/ClickHouse/pull/6583) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed timeouts in fuzz test. In previous version, it managed to find false hangup in query `SELECT * FROM numbers_mt(gccMurmurHash(''))`. 
[#6582](https://github.com/ClickHouse/ClickHouse/pull/6582) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added debug checks to `static_cast` of columns. [#6581](https://github.com/ClickHouse/ClickHouse/pull/6581) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Support for Oracle Linux in official RPM packages. [#6356](https://github.com/ClickHouse/ClickHouse/issues/6356) [#6585](https://github.com/ClickHouse/ClickHouse/pull/6585) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Changed json perftests from `once` to `loop` type. [#6536](https://github.com/ClickHouse/ClickHouse/pull/6536) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* `odbc-bridge.cpp` defines `main()` so it should not be included in `clickhouse-lib`. [#6538](https://github.com/ClickHouse/ClickHouse/pull/6538) ([Orivej Desh](https://github.com/orivej)) +* Test for crash in `FULL|RIGHT JOIN` with nulls in right table's keys. [#6362](https://github.com/ClickHouse/ClickHouse/pull/6362) ([Artem Zuikov](https://github.com/4ertus2)) +* Added a test for the limit on expansion of aliases just in case. [#6442](https://github.com/ClickHouse/ClickHouse/pull/6442) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Switched from `boost::filesystem` to `std::filesystem` where appropriate. [#6253](https://github.com/ClickHouse/ClickHouse/pull/6253) [#6385](https://github.com/ClickHouse/ClickHouse/pull/6385) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added RPM packages to website. [#6251](https://github.com/ClickHouse/ClickHouse/pull/6251) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Add a test for fixed `Unknown identifier` exception in `IN` section. [#6708](https://github.com/ClickHouse/ClickHouse/pull/6708) ([Artem Zuikov](https://github.com/4ertus2)) +* Simplify `shared_ptr_helper` because people facing difficulties understanding it. [#6675](https://github.com/ClickHouse/ClickHouse/pull/6675) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added performance tests for fixed Gorilla and DoubleDelta codec. [#6179](https://github.com/ClickHouse/ClickHouse/pull/6179) ([Vasily Nemkov](https://github.com/Enmk)) +* Split the integration test `test_dictionaries` into 4 separate tests. [#6776](https://github.com/ClickHouse/ClickHouse/pull/6776) ([Vitaly Baranov](https://github.com/vitlibar)) +* Fix PVS-Studio warning in `PipelineExecutor`. [#6777](https://github.com/ClickHouse/ClickHouse/pull/6777) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Allow to use `library` dictionary source with ASan. [#6482](https://github.com/ClickHouse/ClickHouse/pull/6482) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added option to generate changelog from a list of PRs. [#6350](https://github.com/ClickHouse/ClickHouse/pull/6350) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Lock the `TinyLog` storage when reading. [#6226](https://github.com/ClickHouse/ClickHouse/pull/6226) ([akuzm](https://github.com/akuzm)) +* Check for broken symlinks in CI. [#6634](https://github.com/ClickHouse/ClickHouse/pull/6634) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Increase timeout for "stack overflow" test because it may take a long time in debug build. [#6637](https://github.com/ClickHouse/ClickHouse/pull/6637) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added a check for double whitespaces. 
[#6643](https://github.com/ClickHouse/ClickHouse/pull/6643) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix `new/delete` memory tracking when build with sanitizers. Tracking is not clear. It only prevents memory limit exceptions in tests. [#6450](https://github.com/ClickHouse/ClickHouse/pull/6450) ([Artem Zuikov](https://github.com/4ertus2)) +* Enable back the check of undefined symbols while linking. [#6453](https://github.com/ClickHouse/ClickHouse/pull/6453) ([Ivan](https://github.com/abyss7)) +* Avoid rebuilding `hyperscan` every day. [#6307](https://github.com/ClickHouse/ClickHouse/pull/6307) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed UBSan report in `ProtobufWriter`. [#6163](https://github.com/ClickHouse/ClickHouse/pull/6163) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Don't allow to use query profiler with sanitizers because it is not compatible. [#6769](https://github.com/ClickHouse/ClickHouse/pull/6769) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Add test for reloading a dictionary after fail by timer. [#6114](https://github.com/ClickHouse/ClickHouse/pull/6114) ([Vitaly Baranov](https://github.com/vitlibar)) +* Fix inconsistency in `PipelineExecutor::prepareProcessor` argument type. [#6494](https://github.com/ClickHouse/ClickHouse/pull/6494) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Added a test for bad URIs. [#6493](https://github.com/ClickHouse/ClickHouse/pull/6493) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added more checks to `CAST` function. This should get more information about segmentation fault in fuzzy test. [#6346](https://github.com/ClickHouse/ClickHouse/pull/6346) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Added `gcc-9` support to `docker/builder` container that builds image locally. [#6333](https://github.com/ClickHouse/ClickHouse/pull/6333) ([Gleb Novikov](https://github.com/NanoBjorn)) +* Test for primary key with `LowCardinality(String)`. [#5044](https://github.com/ClickHouse/ClickHouse/issues/5044) [#6219](https://github.com/ClickHouse/ClickHouse/pull/6219) ([dimarub2000](https://github.com/dimarub2000)) +* Fixed tests affected by slow stack traces printing. [#6315](https://github.com/ClickHouse/ClickHouse/pull/6315) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Add a test case for crash in `groupUniqArray` fixed in [#6029](https://github.com/ClickHouse/ClickHouse/pull/6029). [#4402](https://github.com/ClickHouse/ClickHouse/issues/4402) [#6129](https://github.com/ClickHouse/ClickHouse/pull/6129) ([akuzm](https://github.com/akuzm)) +* Fixed indices mutations tests. [#6645](https://github.com/ClickHouse/ClickHouse/pull/6645) ([Nikita Vasilev](https://github.com/nikvas0)) +* In performance test, do not read query log for queries we didn't run. [#6427](https://github.com/ClickHouse/ClickHouse/pull/6427) ([akuzm](https://github.com/akuzm)) +* Materialized view now could be created with any low cardinality types regardless to the setting about suspicious low cardinality types. [#6428](https://github.com/ClickHouse/ClickHouse/pull/6428) ([Olga Khvostikova](https://github.com/stavrolia)) +* Updated tests for `send_logs_level` setting. [#6207](https://github.com/ClickHouse/ClickHouse/pull/6207) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Fix build under gcc-8.2. [#6196](https://github.com/ClickHouse/ClickHouse/pull/6196) ([Max Akhmedov](https://github.com/zlobober)) +* Fix build with internal libc++. 
[#6724](https://github.com/ClickHouse/ClickHouse/pull/6724) ([Ivan](https://github.com/abyss7)) +* Fix shared build with `rdkafka` library [#6101](https://github.com/ClickHouse/ClickHouse/pull/6101) ([Ivan](https://github.com/abyss7)) +* Fixes for Mac OS build (incomplete). [#6390](https://github.com/ClickHouse/ClickHouse/pull/6390) ([alexey-milovidov](https://github.com/alexey-milovidov)) [#6429](https://github.com/ClickHouse/ClickHouse/pull/6429) ([alex-zaitsev](https://github.com/alex-zaitsev)) +* Fix "splitted" build. [#6618](https://github.com/ClickHouse/ClickHouse/pull/6618) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Other build fixes: [#6186](https://github.com/ClickHouse/ClickHouse/pull/6186) ([Amos Bird](https://github.com/amosbird)) [#6486](https://github.com/ClickHouse/ClickHouse/pull/6486) [#6348](https://github.com/ClickHouse/ClickHouse/pull/6348) ([vxider](https://github.com/Vxider)) [#6744](https://github.com/ClickHouse/ClickHouse/pull/6744) ([Ivan](https://github.com/abyss7)) [#6016](https://github.com/ClickHouse/ClickHouse/pull/6016) [#6421](https://github.com/ClickHouse/ClickHouse/pull/6421) [#6491](https://github.com/ClickHouse/ClickHouse/pull/6491) ([proller](https://github.com/proller)) ### Backward Incompatible Change -* Removed rarely used table function `catBoostPool` and storage `CatBoostPool`. If you have used this table function, please write email to `clickhouse-feedback@yandex-team.com`. Note that CatBoost integration remains and will be supported. [#6279](https://github.com/yandex/ClickHouse/pull/6279) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Disable `ANY RIGHT JOIN` and `ANY FULL JOIN` by default. Set `any_join_get_any_from_right_table` setting to enable them. [#5126](https://github.com/yandex/ClickHouse/issues/5126) [#6351](https://github.com/yandex/ClickHouse/pull/6351) ([Artem Zuikov](https://github.com/4ertus2)) +* Removed rarely used table function `catBoostPool` and storage `CatBoostPool`. If you have used this table function, please write email to `clickhouse-feedback@yandex-team.com`. Note that CatBoost integration remains and will be supported. [#6279](https://github.com/ClickHouse/ClickHouse/pull/6279) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Disable `ANY RIGHT JOIN` and `ANY FULL JOIN` by default. Set `any_join_get_any_from_right_table` setting to enable them. [#5126](https://github.com/ClickHouse/ClickHouse/issues/5126) [#6351](https://github.com/ClickHouse/ClickHouse/pull/6351) ([Artem Zuikov](https://github.com/4ertus2)) ## ClickHouse release 19.11.11.57, 2019-09-13 -* Fix logical error causing segfaults when selecting from Kafka empty topic. [#6902](https://github.com/yandex/ClickHouse/issues/6902) [#6909](https://github.com/yandex/ClickHouse/pull/6909) ([Ivan](https://github.com/abyss7)) -* Fix for function `АrrayEnumerateUniqRanked` with empty arrays in params. [#6928](https://github.com/yandex/ClickHouse/pull/6928) ([proller](https://github.com/proller)) +* Fix logical error causing segfaults when selecting from Kafka empty topic. [#6902](https://github.com/ClickHouse/ClickHouse/issues/6902) [#6909](https://github.com/ClickHouse/ClickHouse/pull/6909) ([Ivan](https://github.com/abyss7)) +* Fix for function `АrrayEnumerateUniqRanked` with empty arrays in params. 
[#6928](https://github.com/ClickHouse/ClickHouse/pull/6928) ([proller](https://github.com/proller)) ## ClickHouse release 19.13.4.32, 2019-09-10 ### Bug Fix * This release also contains all bug security fixes from 19.11.9.52 and 19.11.10.54. -* Fixed data race in `system.parts` table and `ALTER` query. [#6245](https://github.com/yandex/ClickHouse/issues/6245) [#6513](https://github.com/yandex/ClickHouse/pull/6513) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed mismatched header in streams happened in case of reading from empty distributed table with sample and prewhere. [#6167](https://github.com/yandex/ClickHouse/issues/6167) ([Lixiang Qian](https://github.com/fancyqlx)) [#6823](https://github.com/yandex/ClickHouse/pull/6823) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fixed crash when using `IN` clause with a subquery with a tuple. [#6125](https://github.com/yandex/ClickHouse/issues/6125) [#6550](https://github.com/yandex/ClickHouse/pull/6550) ([tavplubix](https://github.com/tavplubix)) -* Fix case with same column names in `GLOBAL JOIN ON` section. [#6181](https://github.com/yandex/ClickHouse/pull/6181) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix crash when casting types to `Decimal` that do not support it. Throw exception instead. [#6297](https://github.com/yandex/ClickHouse/pull/6297) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed crash in `extractAll()` function. [#6644](https://github.com/yandex/ClickHouse/pull/6644) ([Artem Zuikov](https://github.com/4ertus2)) -* Query transformation for `MySQL`, `ODBC`, `JDBC` table functions now works properly for `SELECT WHERE` queries with multiple `AND` expressions. [#6381](https://github.com/yandex/ClickHouse/issues/6381) [#6676](https://github.com/yandex/ClickHouse/pull/6676) ([dimarub2000](https://github.com/dimarub2000)) -* Added previous declaration checks for MySQL 8 integration. [#6569](https://github.com/yandex/ClickHouse/pull/6569) ([Rafael David Tinoco](https://github.com/rafaeldtinoco)) +* Fixed data race in `system.parts` table and `ALTER` query. [#6245](https://github.com/ClickHouse/ClickHouse/issues/6245) [#6513](https://github.com/ClickHouse/ClickHouse/pull/6513) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed mismatched header in streams happened in case of reading from empty distributed table with sample and prewhere. [#6167](https://github.com/ClickHouse/ClickHouse/issues/6167) ([Lixiang Qian](https://github.com/fancyqlx)) [#6823](https://github.com/ClickHouse/ClickHouse/pull/6823) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Fixed crash when using `IN` clause with a subquery with a tuple. [#6125](https://github.com/ClickHouse/ClickHouse/issues/6125) [#6550](https://github.com/ClickHouse/ClickHouse/pull/6550) ([tavplubix](https://github.com/tavplubix)) +* Fix case with same column names in `GLOBAL JOIN ON` section. [#6181](https://github.com/ClickHouse/ClickHouse/pull/6181) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix crash when casting types to `Decimal` that do not support it. Throw exception instead. [#6297](https://github.com/ClickHouse/ClickHouse/pull/6297) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed crash in `extractAll()` function. [#6644](https://github.com/ClickHouse/ClickHouse/pull/6644) ([Artem Zuikov](https://github.com/4ertus2)) +* Query transformation for `MySQL`, `ODBC`, `JDBC` table functions now works properly for `SELECT WHERE` queries with multiple `AND` expressions. 
[#6381](https://github.com/ClickHouse/ClickHouse/issues/6381) [#6676](https://github.com/ClickHouse/ClickHouse/pull/6676) ([dimarub2000](https://github.com/dimarub2000)) +* Added previous declaration checks for MySQL 8 integration. [#6569](https://github.com/ClickHouse/ClickHouse/pull/6569) ([Rafael David Tinoco](https://github.com/rafaeldtinoco)) ### Security Fix -* Fix two vulnerabilities in codecs in decompression phase (malicious user can fabricate compressed data that will lead to buffer overflow in decompression). [#6670](https://github.com/yandex/ClickHouse/pull/6670) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix two vulnerabilities in codecs in decompression phase (malicious user can fabricate compressed data that will lead to buffer overflow in decompression). [#6670](https://github.com/ClickHouse/ClickHouse/pull/6670) ([Artem Zuikov](https://github.com/4ertus2)) ## ClickHouse release 19.11.10.54, 2019-09-10 ### Bug Fix -* Do store offsets for Kafka messages manually to be able to commit them all at once for all partitions. Fixes potential duplication in "one consumer - many partitions" scenario. [#6872](https://github.com/yandex/ClickHouse/pull/6872) ([Ivan](https://github.com/abyss7)) +* Do store offsets for Kafka messages manually to be able to commit them all at once for all partitions. Fixes potential duplication in "one consumer - many partitions" scenario. [#6872](https://github.com/ClickHouse/ClickHouse/pull/6872) ([Ivan](https://github.com/abyss7)) ## ClickHouse release 19.11.9.52, 2019-09-6 -* Improve error handling in cache dictionaries. [#6737](https://github.com/yandex/ClickHouse/pull/6737) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fixed bug in function `arrayEnumerateUniqRanked`. [#6779](https://github.com/yandex/ClickHouse/pull/6779) ([proller](https://github.com/proller)) -* Fix `JSONExtract` function while extracting a `Tuple` from JSON. [#6718](https://github.com/yandex/ClickHouse/pull/6718) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fixed possible data loss after `ALTER DELETE` query on table with skipping index. [#6224](https://github.com/yandex/ClickHouse/issues/6224) [#6282](https://github.com/yandex/ClickHouse/pull/6282) ([Nikita Vasilev](https://github.com/nikvas0)) -* Fixed performance test. [#6392](https://github.com/yandex/ClickHouse/pull/6392) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Parquet: Fix reading boolean columns. [#6579](https://github.com/yandex/ClickHouse/pull/6579) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed wrong behaviour of `nullIf` function for constant arguments. [#6518](https://github.com/yandex/ClickHouse/pull/6518) ([Guillaume Tassery](https://github.com/YiuRULE)) [#6580](https://github.com/yandex/ClickHouse/pull/6580) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix Kafka messages duplication problem on normal server restart. [#6597](https://github.com/yandex/ClickHouse/pull/6597) ([Ivan](https://github.com/abyss7)) -* Fixed an issue when long `ALTER UPDATE` or `ALTER DELETE` may prevent regular merges to run. Prevent mutations from executing if there is no enough free threads available. [#6502](https://github.com/yandex/ClickHouse/issues/6502) [#6617](https://github.com/yandex/ClickHouse/pull/6617) ([tavplubix](https://github.com/tavplubix)) -* Fixed error with processing "timezone" in server configuration file. [#6709](https://github.com/yandex/ClickHouse/pull/6709) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix kafka tests. 
[#6805](https://github.com/yandex/ClickHouse/pull/6805) ([Ivan](https://github.com/abyss7))
+* Improve error handling in cache dictionaries. [#6737](https://github.com/ClickHouse/ClickHouse/pull/6737) ([Vitaly Baranov](https://github.com/vitlibar))
+* Fixed a bug in the function `arrayEnumerateUniqRanked`. [#6779](https://github.com/ClickHouse/ClickHouse/pull/6779) ([proller](https://github.com/proller))
+* Fix `JSONExtract` function while extracting a `Tuple` from JSON. [#6718](https://github.com/ClickHouse/ClickHouse/pull/6718) ([Vitaly Baranov](https://github.com/vitlibar))
+* Fixed possible data loss after `ALTER DELETE` query on a table with a skipping index. [#6224](https://github.com/ClickHouse/ClickHouse/issues/6224) [#6282](https://github.com/ClickHouse/ClickHouse/pull/6282) ([Nikita Vasilev](https://github.com/nikvas0))
+* Fixed performance test. [#6392](https://github.com/ClickHouse/ClickHouse/pull/6392) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Parquet: Fix reading boolean columns. [#6579](https://github.com/ClickHouse/ClickHouse/pull/6579) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fixed wrong behaviour of `nullIf` function for constant arguments. [#6518](https://github.com/ClickHouse/ClickHouse/pull/6518) ([Guillaume Tassery](https://github.com/YiuRULE)) [#6580](https://github.com/ClickHouse/ClickHouse/pull/6580) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fix Kafka messages duplication problem on normal server restart. [#6597](https://github.com/ClickHouse/ClickHouse/pull/6597) ([Ivan](https://github.com/abyss7))
+* Fixed an issue when a long `ALTER UPDATE` or `ALTER DELETE` may prevent regular merges from running. Prevent mutations from executing if there are not enough free threads available. [#6502](https://github.com/ClickHouse/ClickHouse/issues/6502) [#6617](https://github.com/ClickHouse/ClickHouse/pull/6617) ([tavplubix](https://github.com/tavplubix))
+* Fixed error with processing "timezone" in the server configuration file. [#6709](https://github.com/ClickHouse/ClickHouse/pull/6709) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* Fix kafka tests. [#6805](https://github.com/ClickHouse/ClickHouse/pull/6805) ([Ivan](https://github.com/abyss7))

### Security Fix
-* If the attacker has write access to ZooKeeper and is able to run custom server available from the network where ClickHouse run, it can create custom-built malicious server that will act as ClickHouse replica and register it in ZooKeeper. When another replica will fetch data part from malicious replica, it can force clickhouse-server to write to arbitrary path on filesystem. Found by Eldar Zaitov, information security team at Yandex. [#6247](https://github.com/yandex/ClickHouse/pull/6247) ([alexey-milovidov](https://github.com/alexey-milovidov))
+* If the attacker has write access to ZooKeeper and is able to run a custom server available from the network where ClickHouse runs, it can create a custom-built malicious server that will act as a ClickHouse replica and register it in ZooKeeper. When another replica fetches a data part from the malicious replica, it can force clickhouse-server to write to an arbitrary path on the filesystem. Found by Eldar Zaitov, information security team at Yandex. [#6247](https://github.com/ClickHouse/ClickHouse/pull/6247) ([alexey-milovidov](https://github.com/alexey-milovidov))

## ClickHouse release 19.13.3.26, 2019-08-22

### Bug Fix
-* Fix `ALTER TABLE ... UPDATE` query for tables with `enable_mixed_granularity_parts=1`. 
[#6543](https://github.com/yandex/ClickHouse/pull/6543) ([alesapin](https://github.com/alesapin)) -* Fix NPE when using IN clause with a subquery with a tuple. [#6125](https://github.com/yandex/ClickHouse/issues/6125) [#6550](https://github.com/yandex/ClickHouse/pull/6550) ([tavplubix](https://github.com/tavplubix)) -* Fixed an issue that if a stale replica becomes alive, it may still have data parts that were removed by DROP PARTITION. [#6522](https://github.com/yandex/ClickHouse/issues/6522) [#6523](https://github.com/yandex/ClickHouse/pull/6523) ([tavplubix](https://github.com/tavplubix)) -* Fixed issue with parsing CSV [#6426](https://github.com/yandex/ClickHouse/issues/6426) [#6559](https://github.com/yandex/ClickHouse/pull/6559) ([tavplubix](https://github.com/tavplubix)) -* Fixed data race in system.parts table and ALTER query. This fixes [#6245](https://github.com/yandex/ClickHouse/issues/6245). [#6513](https://github.com/yandex/ClickHouse/pull/6513) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed wrong code in mutations that may lead to memory corruption. Fixed segfault with read of address `0x14c0` that may happed due to concurrent `DROP TABLE` and `SELECT` from `system.parts` or `system.parts_columns`. Fixed race condition in preparation of mutation queries. Fixed deadlock caused by `OPTIMIZE` of Replicated tables and concurrent modification operations like ALTERs. [#6514](https://github.com/yandex/ClickHouse/pull/6514) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed possible data loss after `ALTER DELETE` query on table with skipping index. [#6224](https://github.com/yandex/ClickHouse/issues/6224) [#6282](https://github.com/yandex/ClickHouse/pull/6282) ([Nikita Vasilev](https://github.com/nikvas0)) +* Fix `ALTER TABLE ... UPDATE` query for tables with `enable_mixed_granularity_parts=1`. [#6543](https://github.com/ClickHouse/ClickHouse/pull/6543) ([alesapin](https://github.com/alesapin)) +* Fix NPE when using IN clause with a subquery with a tuple. [#6125](https://github.com/ClickHouse/ClickHouse/issues/6125) [#6550](https://github.com/ClickHouse/ClickHouse/pull/6550) ([tavplubix](https://github.com/tavplubix)) +* Fixed an issue that if a stale replica becomes alive, it may still have data parts that were removed by DROP PARTITION. [#6522](https://github.com/ClickHouse/ClickHouse/issues/6522) [#6523](https://github.com/ClickHouse/ClickHouse/pull/6523) ([tavplubix](https://github.com/tavplubix)) +* Fixed issue with parsing CSV [#6426](https://github.com/ClickHouse/ClickHouse/issues/6426) [#6559](https://github.com/ClickHouse/ClickHouse/pull/6559) ([tavplubix](https://github.com/tavplubix)) +* Fixed data race in system.parts table and ALTER query. This fixes [#6245](https://github.com/ClickHouse/ClickHouse/issues/6245). [#6513](https://github.com/ClickHouse/ClickHouse/pull/6513) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed wrong code in mutations that may lead to memory corruption. Fixed segfault with read of address `0x14c0` that may happed due to concurrent `DROP TABLE` and `SELECT` from `system.parts` or `system.parts_columns`. Fixed race condition in preparation of mutation queries. Fixed deadlock caused by `OPTIMIZE` of Replicated tables and concurrent modification operations like ALTERs. [#6514](https://github.com/ClickHouse/ClickHouse/pull/6514) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed possible data loss after `ALTER DELETE` query on table with skipping index. 
[#6224](https://github.com/ClickHouse/ClickHouse/issues/6224) [#6282](https://github.com/ClickHouse/ClickHouse/pull/6282) ([Nikita Vasilev](https://github.com/nikvas0)) ### Security Fix -* If the attacker has write access to ZooKeeper and is able to run custom server available from the network where ClickHouse run, it can create custom-built malicious server that will act as ClickHouse replica and register it in ZooKeeper. When another replica will fetch data part from malicious replica, it can force clickhouse-server to write to arbitrary path on filesystem. Found by Eldar Zaitov, information security team at Yandex. [#6247](https://github.com/yandex/ClickHouse/pull/6247) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* If the attacker has write access to ZooKeeper and is able to run custom server available from the network where ClickHouse run, it can create custom-built malicious server that will act as ClickHouse replica and register it in ZooKeeper. When another replica will fetch data part from malicious replica, it can force clickhouse-server to write to arbitrary path on filesystem. Found by Eldar Zaitov, information security team at Yandex. [#6247](https://github.com/ClickHouse/ClickHouse/pull/6247) ([alexey-milovidov](https://github.com/alexey-milovidov)) ## ClickHouse release 19.13.2.19, 2019-08-14 ### New Feature -* Sampling profiler on query level. [Example](https://gist.github.com/alexey-milovidov/92758583dd41c24c360fdb8d6a4da194). [#4247](https://github.com/yandex/ClickHouse/issues/4247) ([laplab](https://github.com/laplab)) [#6124](https://github.com/yandex/ClickHouse/pull/6124) ([alexey-milovidov](https://github.com/alexey-milovidov)) [#6250](https://github.com/yandex/ClickHouse/pull/6250) [#6283](https://github.com/yandex/ClickHouse/pull/6283) [#6386](https://github.com/yandex/ClickHouse/pull/6386) -* Allow to specify a list of columns with `COLUMNS('regexp')` expression that works like a more sophisticated variant of `*` asterisk. [#5951](https://github.com/yandex/ClickHouse/pull/5951) ([mfridental](https://github.com/mfridental)), ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `CREATE TABLE AS table_function()` is now possible [#6057](https://github.com/yandex/ClickHouse/pull/6057) ([dimarub2000](https://github.com/dimarub2000)) -* Adam optimizer for stochastic gradient descent is used by default in `stochasticLinearRegression()` and `stochasticLogisticRegression()` aggregate functions, because it shows good quality without almost any tuning. [#6000](https://github.com/yandex/ClickHouse/pull/6000) ([Quid37](https://github.com/Quid37)) -* Added functions for working with the сustom week number [#5212](https://github.com/yandex/ClickHouse/pull/5212) ([Andy Yang](https://github.com/andyyzh)) -* `RENAME` queries now work with all storages. [#5953](https://github.com/yandex/ClickHouse/pull/5953) ([Ivan](https://github.com/abyss7)) -* Now client receive logs from server with any desired level by setting `send_logs_level` regardless to the log level specified in server settings. [#5964](https://github.com/yandex/ClickHouse/pull/5964) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) +* Sampling profiler on query level. [Example](https://gist.github.com/alexey-milovidov/92758583dd41c24c360fdb8d6a4da194). 
[#4247](https://github.com/ClickHouse/ClickHouse/issues/4247) ([laplab](https://github.com/laplab)) [#6124](https://github.com/ClickHouse/ClickHouse/pull/6124) ([alexey-milovidov](https://github.com/alexey-milovidov)) [#6250](https://github.com/ClickHouse/ClickHouse/pull/6250) [#6283](https://github.com/ClickHouse/ClickHouse/pull/6283) [#6386](https://github.com/ClickHouse/ClickHouse/pull/6386) +* Allow to specify a list of columns with `COLUMNS('regexp')` expression that works like a more sophisticated variant of the `*` asterisk. [#5951](https://github.com/ClickHouse/ClickHouse/pull/5951) ([mfridental](https://github.com/mfridental)), ([alexey-milovidov](https://github.com/alexey-milovidov)) +* `CREATE TABLE AS table_function()` is now possible [#6057](https://github.com/ClickHouse/ClickHouse/pull/6057) ([dimarub2000](https://github.com/dimarub2000)) +* Adam optimizer for stochastic gradient descent is used by default in `stochasticLinearRegression()` and `stochasticLogisticRegression()` aggregate functions, because it shows good quality with almost no tuning. [#6000](https://github.com/ClickHouse/ClickHouse/pull/6000) ([Quid37](https://github.com/Quid37)) +* Added functions for working with the custom week number [#5212](https://github.com/ClickHouse/ClickHouse/pull/5212) ([Andy Yang](https://github.com/andyyzh)) +* `RENAME` queries now work with all storages. [#5953](https://github.com/ClickHouse/ClickHouse/pull/5953) ([Ivan](https://github.com/abyss7)) +* Now the client receives logs from the server with any desired level by setting `send_logs_level`, regardless of the log level specified in the server settings. [#5964](https://github.com/ClickHouse/ClickHouse/pull/5964) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) ### Backward Incompatible Change -* The setting `input_format_defaults_for_omitted_fields` is enabled by default. Inserts in Distibuted tables need this setting to be the same on cluster (you need to set it before rolling update). It enables calculation of complex default expressions for omitted fields in `JSONEachRow` and `CSV*` formats. It should be the expected behaviour but may lead to negligible performance difference. [#6043](https://github.com/yandex/ClickHouse/pull/6043) ([Artem Zuikov](https://github.com/4ertus2)), [#5625](https://github.com/yandex/ClickHouse/pull/5625) ([akuzm](https://github.com/akuzm)) +* The setting `input_format_defaults_for_omitted_fields` is enabled by default. Inserts in Distributed tables need this setting to be the same across the cluster (you need to set it before a rolling update). It enables calculation of complex default expressions for omitted fields in `JSONEachRow` and `CSV*` formats. It should be the expected behaviour but may lead to a negligible performance difference. [#6043](https://github.com/ClickHouse/ClickHouse/pull/6043) ([Artem Zuikov](https://github.com/4ertus2)), [#5625](https://github.com/ClickHouse/ClickHouse/pull/5625) ([akuzm](https://github.com/akuzm)) ### Experimental features -* New query processing pipeline. Use `experimental_use_processors=1` option to enable it. Use for your own trouble. [#4914](https://github.com/yandex/ClickHouse/pull/4914) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* New query processing pipeline. Use the `experimental_use_processors=1` option to enable it. Use at your own risk. [#4914](https://github.com/ClickHouse/ClickHouse/pull/4914) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) ### Bug Fix * Kafka integration has been fixed in this version. 
-* Fixed `DoubleDelta` encoding of `Int64` for large `DoubleDelta` values, improved `DoubleDelta` encoding for random data for `Int32`. [#5998](https://github.com/yandex/ClickHouse/pull/5998) ([Vasily Nemkov](https://github.com/Enmk)) -* Fixed overestimation of `max_rows_to_read` if the setting `merge_tree_uniform_read_distribution` is set to 0. [#6019](https://github.com/yandex/ClickHouse/pull/6019) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed `DoubleDelta` encoding of `Int64` for large `DoubleDelta` values, improved `DoubleDelta` encoding for random data for `Int32`. [#5998](https://github.com/ClickHouse/ClickHouse/pull/5998) ([Vasily Nemkov](https://github.com/Enmk)) +* Fixed overestimation of `max_rows_to_read` if the setting `merge_tree_uniform_read_distribution` is set to 0. [#6019](https://github.com/ClickHouse/ClickHouse/pull/6019) ([alexey-milovidov](https://github.com/alexey-milovidov)) ### Improvement -* Throws an exception if `config.d` file doesn't have the corresponding root element as the config file [#6123](https://github.com/yandex/ClickHouse/pull/6123) ([dimarub2000](https://github.com/dimarub2000)) +* Throws an exception if `config.d` file doesn't have the corresponding root element as the config file [#6123](https://github.com/ClickHouse/ClickHouse/pull/6123) ([dimarub2000](https://github.com/dimarub2000)) ### Performance Improvement -* Optimize `count()`. Now it uses the smallest column (if possible). [#6028](https://github.com/yandex/ClickHouse/pull/6028) ([Amos Bird](https://github.com/amosbird)) +* Optimize `count()`. Now it uses the smallest column (if possible). [#6028](https://github.com/ClickHouse/ClickHouse/pull/6028) ([Amos Bird](https://github.com/amosbird)) ### Build/Testing/Packaging Improvement -* Report memory usage in performance tests. [#5899](https://github.com/yandex/ClickHouse/pull/5899) ([akuzm](https://github.com/akuzm)) -* Fix build with external `libcxx` [#6010](https://github.com/yandex/ClickHouse/pull/6010) ([Ivan](https://github.com/abyss7)) -* Fix shared build with `rdkafka` library [#6101](https://github.com/yandex/ClickHouse/pull/6101) ([Ivan](https://github.com/abyss7)) +* Report memory usage in performance tests. [#5899](https://github.com/ClickHouse/ClickHouse/pull/5899) ([akuzm](https://github.com/akuzm)) +* Fix build with external `libcxx` [#6010](https://github.com/ClickHouse/ClickHouse/pull/6010) ([Ivan](https://github.com/abyss7)) +* Fix shared build with `rdkafka` library [#6101](https://github.com/ClickHouse/ClickHouse/pull/6101) ([Ivan](https://github.com/abyss7)) ## ClickHouse release 19.11.8.46, 2019-08-22 ### Bug Fix -* Fix `ALTER TABLE ... UPDATE` query for tables with `enable_mixed_granularity_parts=1`. [#6543](https://github.com/yandex/ClickHouse/pull/6543) ([alesapin](https://github.com/alesapin)) -* Fix NPE when using IN clause with a subquery with a tuple. [#6125](https://github.com/yandex/ClickHouse/issues/6125) [#6550](https://github.com/yandex/ClickHouse/pull/6550) ([tavplubix](https://github.com/tavplubix)) -* Fixed an issue that if a stale replica becomes alive, it may still have data parts that were removed by DROP PARTITION. 
[#6522](https://github.com/yandex/ClickHouse/issues/6522) [#6523](https://github.com/yandex/ClickHouse/pull/6523) ([tavplubix](https://github.com/tavplubix)) -* Fixed issue with parsing CSV [#6426](https://github.com/yandex/ClickHouse/issues/6426) [#6559](https://github.com/yandex/ClickHouse/pull/6559) ([tavplubix](https://github.com/tavplubix)) -* Fixed data race in system.parts table and ALTER query. This fixes [#6245](https://github.com/yandex/ClickHouse/issues/6245). [#6513](https://github.com/yandex/ClickHouse/pull/6513) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed wrong code in mutations that may lead to memory corruption. Fixed segfault with read of address `0x14c0` that may happed due to concurrent `DROP TABLE` and `SELECT` from `system.parts` or `system.parts_columns`. Fixed race condition in preparation of mutation queries. Fixed deadlock caused by `OPTIMIZE` of Replicated tables and concurrent modification operations like ALTERs. [#6514](https://github.com/yandex/ClickHouse/pull/6514) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix `ALTER TABLE ... UPDATE` query for tables with `enable_mixed_granularity_parts=1`. [#6543](https://github.com/ClickHouse/ClickHouse/pull/6543) ([alesapin](https://github.com/alesapin)) +* Fix NPE when using IN clause with a subquery with a tuple. [#6125](https://github.com/ClickHouse/ClickHouse/issues/6125) [#6550](https://github.com/ClickHouse/ClickHouse/pull/6550) ([tavplubix](https://github.com/tavplubix)) +* Fixed an issue that if a stale replica becomes alive, it may still have data parts that were removed by DROP PARTITION. [#6522](https://github.com/ClickHouse/ClickHouse/issues/6522) [#6523](https://github.com/ClickHouse/ClickHouse/pull/6523) ([tavplubix](https://github.com/tavplubix)) +* Fixed issue with parsing CSV [#6426](https://github.com/ClickHouse/ClickHouse/issues/6426) [#6559](https://github.com/ClickHouse/ClickHouse/pull/6559) ([tavplubix](https://github.com/tavplubix)) +* Fixed data race in system.parts table and ALTER query. This fixes [#6245](https://github.com/ClickHouse/ClickHouse/issues/6245). [#6513](https://github.com/ClickHouse/ClickHouse/pull/6513) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed wrong code in mutations that may lead to memory corruption. Fixed segfault with read of address `0x14c0` that may happed due to concurrent `DROP TABLE` and `SELECT` from `system.parts` or `system.parts_columns`. Fixed race condition in preparation of mutation queries. Fixed deadlock caused by `OPTIMIZE` of Replicated tables and concurrent modification operations like ALTERs. [#6514](https://github.com/ClickHouse/ClickHouse/pull/6514) ([alexey-milovidov](https://github.com/alexey-milovidov)) ## ClickHouse release 19.11.7.40, 2019-08-14 ### Bug fix * Kafka integration has been fixed in this version. -* Fix segfault when using `arrayReduce` for constant arguments. [#6326](https://github.com/yandex/ClickHouse/pull/6326) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed `toFloat()` monotonicity. [#6374](https://github.com/yandex/ClickHouse/pull/6374) ([dimarub2000](https://github.com/dimarub2000)) -* Fix segfault with enabled `optimize_skip_unused_shards` and missing sharding key. [#6384](https://github.com/yandex/ClickHouse/pull/6384) ([CurtizJ](https://github.com/CurtizJ)) -* Fixed logic of `arrayEnumerateUniqRanked` function. 
[#6423](https://github.com/yandex/ClickHouse/pull/6423) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Removed extra verbose logging from MySQL handler. [#6389](https://github.com/yandex/ClickHouse/pull/6389) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix wrong behavior and possible segfaults in `topK` and `topKWeighted` aggregated functions. [#6404](https://github.com/yandex/ClickHouse/pull/6404) ([CurtizJ](https://github.com/CurtizJ)) -* Do not expose virtual columns in `system.columns` table. This is required for backward compatibility. [#6406](https://github.com/yandex/ClickHouse/pull/6406) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix bug with memory allocation for string fields in complex key cache dictionary. [#6447](https://github.com/yandex/ClickHouse/pull/6447) ([alesapin](https://github.com/alesapin)) -* Fix bug with enabling adaptive granularity when creating new replica for `Replicated*MergeTree` table. [#6452](https://github.com/yandex/ClickHouse/pull/6452) ([alesapin](https://github.com/alesapin)) -* Fix infinite loop when reading Kafka messages. [#6354](https://github.com/yandex/ClickHouse/pull/6354) ([abyss7](https://github.com/abyss7)) -* Fixed the possibility of a fabricated query to cause server crash due to stack overflow in SQL parser and possibility of stack overflow in `Merge` and `Distributed` tables [#6433](https://github.com/yandex/ClickHouse/pull/6433) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed Gorilla encoding error on small sequences. [#6444](https://github.com/yandex/ClickHouse/pull/6444) ([Enmk](https://github.com/Enmk)) +* Fix segfault when using `arrayReduce` for constant arguments. [#6326](https://github.com/ClickHouse/ClickHouse/pull/6326) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed `toFloat()` monotonicity. [#6374](https://github.com/ClickHouse/ClickHouse/pull/6374) ([dimarub2000](https://github.com/dimarub2000)) +* Fix segfault with enabled `optimize_skip_unused_shards` and missing sharding key. [#6384](https://github.com/ClickHouse/ClickHouse/pull/6384) ([CurtizJ](https://github.com/CurtizJ)) +* Fixed logic of `arrayEnumerateUniqRanked` function. [#6423](https://github.com/ClickHouse/ClickHouse/pull/6423) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Removed extra verbose logging from MySQL handler. [#6389](https://github.com/ClickHouse/ClickHouse/pull/6389) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix wrong behavior and possible segfaults in `topK` and `topKWeighted` aggregated functions. [#6404](https://github.com/ClickHouse/ClickHouse/pull/6404) ([CurtizJ](https://github.com/CurtizJ)) +* Do not expose virtual columns in `system.columns` table. This is required for backward compatibility. [#6406](https://github.com/ClickHouse/ClickHouse/pull/6406) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix bug with memory allocation for string fields in complex key cache dictionary. [#6447](https://github.com/ClickHouse/ClickHouse/pull/6447) ([alesapin](https://github.com/alesapin)) +* Fix bug with enabling adaptive granularity when creating new replica for `Replicated*MergeTree` table. [#6452](https://github.com/ClickHouse/ClickHouse/pull/6452) ([alesapin](https://github.com/alesapin)) +* Fix infinite loop when reading Kafka messages. 
[#6354](https://github.com/ClickHouse/ClickHouse/pull/6354) ([abyss7](https://github.com/abyss7)) +* Fixed the possibility of a fabricated query to cause server crash due to stack overflow in SQL parser and possibility of stack overflow in `Merge` and `Distributed` tables [#6433](https://github.com/ClickHouse/ClickHouse/pull/6433) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed Gorilla encoding error on small sequences. [#6444](https://github.com/ClickHouse/ClickHouse/pull/6444) ([Enmk](https://github.com/Enmk)) ### Improvement -* Allow user to override `poll_interval` and `idle_connection_timeout` settings on connection. [#6230](https://github.com/yandex/ClickHouse/pull/6230) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Allow user to override `poll_interval` and `idle_connection_timeout` settings on connection. [#6230](https://github.com/ClickHouse/ClickHouse/pull/6230) ([alexey-milovidov](https://github.com/alexey-milovidov)) ## ClickHouse release 19.11.5.28, 2019-08-05 ### Bug fix -* Fixed the possibility of hanging queries when server is overloaded. [#6301](https://github.com/yandex/ClickHouse/pull/6301) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix FPE in yandexConsistentHash function. This fixes [#6304](https://github.com/yandex/ClickHouse/issues/6304). [#6126](https://github.com/yandex/ClickHouse/pull/6126) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed bug in conversion of `LowCardinality` types in `AggregateFunctionFactory`. This fixes [#6257](https://github.com/yandex/ClickHouse/issues/6257). [#6281](https://github.com/yandex/ClickHouse/pull/6281) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix parsing of `bool` settings from `true` and `false` strings in configuration files. [#6278](https://github.com/yandex/ClickHouse/pull/6278) ([alesapin](https://github.com/alesapin)) -* Fix rare bug with incompatible stream headers in queries to `Distributed` table over `MergeTree` table when part of `WHERE` moves to `PREWHERE`. [#6236](https://github.com/yandex/ClickHouse/pull/6236) ([alesapin](https://github.com/alesapin)) -* Fixed overflow in integer division of signed type to unsigned type. This fixes [#6214](https://github.com/yandex/ClickHouse/issues/6214). [#6233](https://github.com/yandex/ClickHouse/pull/6233) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed the possibility of hanging queries when server is overloaded. [#6301](https://github.com/ClickHouse/ClickHouse/pull/6301) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix FPE in yandexConsistentHash function. This fixes [#6304](https://github.com/ClickHouse/ClickHouse/issues/6304). [#6126](https://github.com/ClickHouse/ClickHouse/pull/6126) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed bug in conversion of `LowCardinality` types in `AggregateFunctionFactory`. This fixes [#6257](https://github.com/ClickHouse/ClickHouse/issues/6257). [#6281](https://github.com/ClickHouse/ClickHouse/pull/6281) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Fix parsing of `bool` settings from `true` and `false` strings in configuration files. [#6278](https://github.com/ClickHouse/ClickHouse/pull/6278) ([alesapin](https://github.com/alesapin)) +* Fix rare bug with incompatible stream headers in queries to `Distributed` table over `MergeTree` table when part of `WHERE` moves to `PREWHERE`. 
[#6236](https://github.com/ClickHouse/ClickHouse/pull/6236) ([alesapin](https://github.com/alesapin)) +* Fixed overflow in integer division of signed type to unsigned type. This fixes [#6214](https://github.com/ClickHouse/ClickHouse/issues/6214). [#6233](https://github.com/ClickHouse/ClickHouse/pull/6233) ([alexey-milovidov](https://github.com/alexey-milovidov)) ### Backward Incompatible Change * `Kafka` still broken. @@ -404,24 +404,24 @@ ## ClickHouse release 19.11.4.24, 2019-08-01 ### Bug Fix -* Fix bug with writing secondary indices marks with adaptive granularity. [#6126](https://github.com/yandex/ClickHouse/pull/6126) ([alesapin](https://github.com/alesapin)) -* Fix `WITH ROLLUP` and `WITH CUBE` modifiers of `GROUP BY` with two-level aggregation. [#6225](https://github.com/yandex/ClickHouse/pull/6225) ([Anton Popov](https://github.com/CurtizJ)) -* Fixed hang in `JSONExtractRaw` function. Fixed [#6195](https://github.com/yandex/ClickHouse/issues/6195) [#6198](https://github.com/yandex/ClickHouse/pull/6198) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix segfault in ExternalLoader::reloadOutdated(). [#6082](https://github.com/yandex/ClickHouse/pull/6082) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fixed the case when server may close listening sockets but not shutdown and continue serving remaining queries. You may end up with two running clickhouse-server processes. Sometimes, the server may return an error `bad_function_call` for remaining queries. [#6231](https://github.com/yandex/ClickHouse/pull/6231) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed useless and incorrect condition on update field for initial loading of external dictionaries via ODBC, MySQL, ClickHouse and HTTP. This fixes [#6069](https://github.com/yandex/ClickHouse/issues/6069) [#6083](https://github.com/yandex/ClickHouse/pull/6083) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed irrelevant exception in cast of `LowCardinality(Nullable)` to not-Nullable column in case if it doesn't contain Nulls (e.g. in query like `SELECT CAST(CAST('Hello' AS LowCardinality(Nullable(String))) AS String)`. [#6094](https://github.com/yandex/ClickHouse/issues/6094) [#6119](https://github.com/yandex/ClickHouse/pull/6119) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix non-deterministic result of "uniq" aggregate function in extreme rare cases. The bug was present in all ClickHouse versions. [#6058](https://github.com/yandex/ClickHouse/pull/6058) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Segfault when we set a little bit too high CIDR on the function `IPv6CIDRToRange`. [#6068](https://github.com/yandex/ClickHouse/pull/6068) ([Guillaume Tassery](https://github.com/YiuRULE)) -* Fixed small memory leak when server throw many exceptions from many different contexts. [#6144](https://github.com/yandex/ClickHouse/pull/6144) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix the situation when consumer got paused before subscription and not resumed afterwards. [#6075](https://github.com/yandex/ClickHouse/pull/6075) ([Ivan](https://github.com/abyss7)) Note that Kafka is broken in this version. -* Clearing the Kafka data buffer from the previous read operation that was completed with an error [#6026](https://github.com/yandex/ClickHouse/pull/6026) ([Nikolay](https://github.com/bopohaa)) Note that Kafka is broken in this version. 
-* Since `StorageMergeTree::background_task_handle` is initialized in `startup()` the `MergeTreeBlockOutputStream::write()` may try to use it before initialization. Just check if it is initialized. [#6080](https://github.com/yandex/ClickHouse/pull/6080) ([Ivan](https://github.com/abyss7)) +* Fix bug with writing secondary indices marks with adaptive granularity. [#6126](https://github.com/ClickHouse/ClickHouse/pull/6126) ([alesapin](https://github.com/alesapin)) +* Fix `WITH ROLLUP` and `WITH CUBE` modifiers of `GROUP BY` with two-level aggregation. [#6225](https://github.com/ClickHouse/ClickHouse/pull/6225) ([Anton Popov](https://github.com/CurtizJ)) +* Fixed hang in `JSONExtractRaw` function. Fixed [#6195](https://github.com/ClickHouse/ClickHouse/issues/6195) [#6198](https://github.com/ClickHouse/ClickHouse/pull/6198) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix segfault in ExternalLoader::reloadOutdated(). [#6082](https://github.com/ClickHouse/ClickHouse/pull/6082) ([Vitaly Baranov](https://github.com/vitlibar)) +* Fixed the case when server may close listening sockets but not shutdown and continue serving remaining queries. You may end up with two running clickhouse-server processes. Sometimes, the server may return an error `bad_function_call` for remaining queries. [#6231](https://github.com/ClickHouse/ClickHouse/pull/6231) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed useless and incorrect condition on update field for initial loading of external dictionaries via ODBC, MySQL, ClickHouse and HTTP. This fixes [#6069](https://github.com/ClickHouse/ClickHouse/issues/6069) [#6083](https://github.com/ClickHouse/ClickHouse/pull/6083) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed irrelevant exception in cast of `LowCardinality(Nullable)` to not-Nullable column in case if it doesn't contain Nulls (e.g. in query like `SELECT CAST(CAST('Hello' AS LowCardinality(Nullable(String))) AS String)`. [#6094](https://github.com/ClickHouse/ClickHouse/issues/6094) [#6119](https://github.com/ClickHouse/ClickHouse/pull/6119) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Fix non-deterministic result of "uniq" aggregate function in extreme rare cases. The bug was present in all ClickHouse versions. [#6058](https://github.com/ClickHouse/ClickHouse/pull/6058) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Segfault when we set a little bit too high CIDR on the function `IPv6CIDRToRange`. [#6068](https://github.com/ClickHouse/ClickHouse/pull/6068) ([Guillaume Tassery](https://github.com/YiuRULE)) +* Fixed small memory leak when server throw many exceptions from many different contexts. [#6144](https://github.com/ClickHouse/ClickHouse/pull/6144) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix the situation when consumer got paused before subscription and not resumed afterwards. [#6075](https://github.com/ClickHouse/ClickHouse/pull/6075) ([Ivan](https://github.com/abyss7)) Note that Kafka is broken in this version. +* Clearing the Kafka data buffer from the previous read operation that was completed with an error [#6026](https://github.com/ClickHouse/ClickHouse/pull/6026) ([Nikolay](https://github.com/bopohaa)) Note that Kafka is broken in this version. +* Since `StorageMergeTree::background_task_handle` is initialized in `startup()` the `MergeTreeBlockOutputStream::write()` may try to use it before initialization. Just check if it is initialized. 
[#6080](https://github.com/ClickHouse/ClickHouse/pull/6080) ([Ivan](https://github.com/abyss7)) ### Build/Testing/Packaging Improvement -* Added official `rpm` packages. [#5740](https://github.com/yandex/ClickHouse/pull/5740) ([proller](https://github.com/proller)) ([alesapin](https://github.com/alesapin)) -* Add an ability to build `.rpm` and `.tgz` packages with `packager` script. [#5769](https://github.com/yandex/ClickHouse/pull/5769) ([alesapin](https://github.com/alesapin)) -* Fixes for "Arcadia" build system. [#6223](https://github.com/yandex/ClickHouse/pull/6223) ([proller](https://github.com/proller)) +* Added official `rpm` packages. [#5740](https://github.com/ClickHouse/ClickHouse/pull/5740) ([proller](https://github.com/proller)) ([alesapin](https://github.com/alesapin)) +* Add an ability to build `.rpm` and `.tgz` packages with `packager` script. [#5769](https://github.com/ClickHouse/ClickHouse/pull/5769) ([alesapin](https://github.com/alesapin)) +* Fixes for "Arcadia" build system. [#6223](https://github.com/ClickHouse/ClickHouse/pull/6223) ([proller](https://github.com/proller)) ### Backward Incompatible Change * `Kafka` is broken in this version. @@ -430,841 +430,841 @@ ## ClickHouse release 19.11.3.11, 2019-07-18 ### New Feature -* Added support for prepared statements. [#5331](https://github.com/yandex/ClickHouse/pull/5331/) ([Alexander](https://github.com/sanych73)) [#5630](https://github.com/yandex/ClickHouse/pull/5630) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `DoubleDelta` and `Gorilla` column codecs [#5600](https://github.com/yandex/ClickHouse/pull/5600) ([Vasily Nemkov](https://github.com/Enmk)) -* Added `os_thread_priority` setting that allows to control the "nice" value of query processing threads that is used by OS to adjust dynamic scheduling priority. It requires `CAP_SYS_NICE` capabilities to work. This implements [#5858](https://github.com/yandex/ClickHouse/issues/5858) [#5909](https://github.com/yandex/ClickHouse/pull/5909) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Implement `_topic`, `_offset`, `_key` columns for Kafka engine [#5382](https://github.com/yandex/ClickHouse/pull/5382) ([Ivan](https://github.com/abyss7)) Note that Kafka is broken in this version. -* Add aggregate function combinator `-Resample` [#5590](https://github.com/yandex/ClickHouse/pull/5590) ([hcz](https://github.com/hczhcz)) -* Aggregate functions `groupArrayMovingSum(win_size)(x)` and `groupArrayMovingAvg(win_size)(x)`, which calculate moving sum/avg with or without window-size limitation. [#5595](https://github.com/yandex/ClickHouse/pull/5595) ([inv2004](https://github.com/inv2004)) -* Add synonim `arrayFlatten` <-> `flatten` [#5764](https://github.com/yandex/ClickHouse/pull/5764) ([hcz](https://github.com/hczhcz)) -* Intergate H3 function `geoToH3` from Uber. [#4724](https://github.com/yandex/ClickHouse/pull/4724) ([Remen Ivan](https://github.com/BHYCHIK)) [#5805](https://github.com/yandex/ClickHouse/pull/5805) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added support for prepared statements. 
[#5331](https://github.com/ClickHouse/ClickHouse/pull/5331/) ([Alexander](https://github.com/sanych73)) [#5630](https://github.com/ClickHouse/ClickHouse/pull/5630) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* `DoubleDelta` and `Gorilla` column codecs [#5600](https://github.com/ClickHouse/ClickHouse/pull/5600) ([Vasily Nemkov](https://github.com/Enmk)) +* Added `os_thread_priority` setting that allows to control the "nice" value of query processing threads that is used by the OS to adjust dynamic scheduling priority. It requires `CAP_SYS_NICE` capabilities to work. This implements [#5858](https://github.com/ClickHouse/ClickHouse/issues/5858) [#5909](https://github.com/ClickHouse/ClickHouse/pull/5909) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Implement `_topic`, `_offset`, `_key` columns for Kafka engine [#5382](https://github.com/ClickHouse/ClickHouse/pull/5382) ([Ivan](https://github.com/abyss7)) Note that Kafka is broken in this version. +* Add aggregate function combinator `-Resample` [#5590](https://github.com/ClickHouse/ClickHouse/pull/5590) ([hcz](https://github.com/hczhcz)) +* Added aggregate functions `groupArrayMovingSum(win_size)(x)` and `groupArrayMovingAvg(win_size)(x)`, which calculate moving sum/avg with or without window-size limitation. [#5595](https://github.com/ClickHouse/ClickHouse/pull/5595) ([inv2004](https://github.com/inv2004)) +* Add synonym `arrayFlatten` <-> `flatten` [#5764](https://github.com/ClickHouse/ClickHouse/pull/5764) ([hcz](https://github.com/hczhcz)) +* Integrate H3 function `geoToH3` from Uber. [#4724](https://github.com/ClickHouse/ClickHouse/pull/4724) ([Remen Ivan](https://github.com/BHYCHIK)) [#5805](https://github.com/ClickHouse/ClickHouse/pull/5805) ([alexey-milovidov](https://github.com/alexey-milovidov)) ### Bug Fix -* Implement DNS cache with asynchronous update. Separate thread resolves all hosts and updates DNS cache with period (setting `dns_cache_update_period`). It should help, when ip of hosts changes frequently. [#5857](https://github.com/yandex/ClickHouse/pull/5857) ([Anton Popov](https://github.com/CurtizJ)) -* Fix segfault in `Delta` codec which affects columns with values less than 32 bits size. The bug led to random memory corruption. [#5786](https://github.com/yandex/ClickHouse/pull/5786) ([alesapin](https://github.com/alesapin)) -* Fix segfault in TTL merge with non-physical columns in block. [#5819](https://github.com/yandex/ClickHouse/pull/5819) ([Anton Popov](https://github.com/CurtizJ)) -* Fix rare bug in checking of part with `LowCardinality` column. Previously `checkDataPart` always fails for part with `LowCardinality` column. [#5832](https://github.com/yandex/ClickHouse/pull/5832) ([alesapin](https://github.com/alesapin)) -* Avoid hanging connections when server thread pool is full. It is important for connections from `remote` table function or connections to a shard without replicas when there is long connection timeout. This fixes [#5878](https://github.com/yandex/ClickHouse/issues/5878) [#5881](https://github.com/yandex/ClickHouse/pull/5881) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Support for constant arguments to `evalMLModel` function. This fixes [#5817](https://github.com/yandex/ClickHouse/issues/5817) [#5820](https://github.com/yandex/ClickHouse/pull/5820) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed the issue when ClickHouse determines default time zone as `UCT` instead of `UTC`. This fixes [#5804](https://github.com/yandex/ClickHouse/issues/5804). 
[#5828](https://github.com/yandex/ClickHouse/pull/5828) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed buffer underflow in `visitParamExtractRaw`. This fixes [#5901](https://github.com/yandex/ClickHouse/issues/5901) [#5902](https://github.com/yandex/ClickHouse/pull/5902) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Now distributed `DROP/ALTER/TRUNCATE/OPTIMIZE ON CLUSTER` queries will be executed directly on leader replica. [#5757](https://github.com/yandex/ClickHouse/pull/5757) ([alesapin](https://github.com/alesapin)) -* Fix `coalesce` for `ColumnConst` with `ColumnNullable` + related changes. [#5755](https://github.com/yandex/ClickHouse/pull/5755) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix the `ReadBufferFromKafkaConsumer` so that it keeps reading new messages after `commit()` even if it was stalled before [#5852](https://github.com/yandex/ClickHouse/pull/5852) ([Ivan](https://github.com/abyss7)) -* Fix `FULL` and `RIGHT` JOIN results when joining on `Nullable` keys in right table. [#5859](https://github.com/yandex/ClickHouse/pull/5859) ([Artem Zuikov](https://github.com/4ertus2)) -* Possible fix of infinite sleeping of low-priority queries. [#5842](https://github.com/yandex/ClickHouse/pull/5842) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix race condition, which cause that some queries may not appear in query_log after `SYSTEM FLUSH LOGS` query. [#5456](https://github.com/yandex/ClickHouse/issues/5456) [#5685](https://github.com/yandex/ClickHouse/pull/5685) ([Anton Popov](https://github.com/CurtizJ)) -* Fixed `heap-use-after-free` ASan warning in ClusterCopier caused by watch which try to use already removed copier object. [#5871](https://github.com/yandex/ClickHouse/pull/5871) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fixed wrong `StringRef` pointer returned by some implementations of `IColumn::deserializeAndInsertFromArena`. This bug affected only unit-tests. [#5973](https://github.com/yandex/ClickHouse/pull/5973) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Prevent source and intermediate array join columns of masking same name columns. [#5941](https://github.com/yandex/ClickHouse/pull/5941) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix insert and select query to MySQL engine with MySQL style identifier quoting. [#5704](https://github.com/yandex/ClickHouse/pull/5704) ([Winter Zhang](https://github.com/zhang2014)) -* Now `CHECK TABLE` query can work with MergeTree engine family. It returns check status and message if any for each part (or file in case of simplier engines). Also, fix bug in fetch of a broken part. [#5865](https://github.com/yandex/ClickHouse/pull/5865) ([alesapin](https://github.com/alesapin)) -* Fix SPLIT_SHARED_LIBRARIES runtime [#5793](https://github.com/yandex/ClickHouse/pull/5793) ([Danila Kutenin](https://github.com/danlark1)) -* Fixed time zone initialization when `/etc/localtime` is a relative symlink like `../usr/share/zoneinfo/Europe/Moscow` [#5922](https://github.com/yandex/ClickHouse/pull/5922) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* clickhouse-copier: Fix use-after free on shutdown [#5752](https://github.com/yandex/ClickHouse/pull/5752) ([proller](https://github.com/proller)) -* Updated `simdjson`. Fixed the issue that some invalid JSONs with zero bytes successfully parse. 
[#5938](https://github.com/yandex/ClickHouse/pull/5938) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix shutdown of SystemLogs [#5802](https://github.com/yandex/ClickHouse/pull/5802) ([Anton Popov](https://github.com/CurtizJ)) -* Fix hanging when condition in invalidate_query depends on a dictionary. [#6011](https://github.com/yandex/ClickHouse/pull/6011) ([Vitaly Baranov](https://github.com/vitlibar)) +* Implement DNS cache with asynchronous update. Separate thread resolves all hosts and updates DNS cache with period (setting `dns_cache_update_period`). It should help, when ip of hosts changes frequently. [#5857](https://github.com/ClickHouse/ClickHouse/pull/5857) ([Anton Popov](https://github.com/CurtizJ)) +* Fix segfault in `Delta` codec which affects columns with values less than 32 bits size. The bug led to random memory corruption. [#5786](https://github.com/ClickHouse/ClickHouse/pull/5786) ([alesapin](https://github.com/alesapin)) +* Fix segfault in TTL merge with non-physical columns in block. [#5819](https://github.com/ClickHouse/ClickHouse/pull/5819) ([Anton Popov](https://github.com/CurtizJ)) +* Fix rare bug in checking of part with `LowCardinality` column. Previously `checkDataPart` always fails for part with `LowCardinality` column. [#5832](https://github.com/ClickHouse/ClickHouse/pull/5832) ([alesapin](https://github.com/alesapin)) +* Avoid hanging connections when server thread pool is full. It is important for connections from `remote` table function or connections to a shard without replicas when there is long connection timeout. This fixes [#5878](https://github.com/ClickHouse/ClickHouse/issues/5878) [#5881](https://github.com/ClickHouse/ClickHouse/pull/5881) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Support for constant arguments to `evalMLModel` function. This fixes [#5817](https://github.com/ClickHouse/ClickHouse/issues/5817) [#5820](https://github.com/ClickHouse/ClickHouse/pull/5820) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed the issue when ClickHouse determines default time zone as `UCT` instead of `UTC`. This fixes [#5804](https://github.com/ClickHouse/ClickHouse/issues/5804). [#5828](https://github.com/ClickHouse/ClickHouse/pull/5828) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed buffer underflow in `visitParamExtractRaw`. This fixes [#5901](https://github.com/ClickHouse/ClickHouse/issues/5901) [#5902](https://github.com/ClickHouse/ClickHouse/pull/5902) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Now distributed `DROP/ALTER/TRUNCATE/OPTIMIZE ON CLUSTER` queries will be executed directly on leader replica. [#5757](https://github.com/ClickHouse/ClickHouse/pull/5757) ([alesapin](https://github.com/alesapin)) +* Fix `coalesce` for `ColumnConst` with `ColumnNullable` + related changes. [#5755](https://github.com/ClickHouse/ClickHouse/pull/5755) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix the `ReadBufferFromKafkaConsumer` so that it keeps reading new messages after `commit()` even if it was stalled before [#5852](https://github.com/ClickHouse/ClickHouse/pull/5852) ([Ivan](https://github.com/abyss7)) +* Fix `FULL` and `RIGHT` JOIN results when joining on `Nullable` keys in right table. [#5859](https://github.com/ClickHouse/ClickHouse/pull/5859) ([Artem Zuikov](https://github.com/4ertus2)) +* Possible fix of infinite sleeping of low-priority queries. 
[#5842](https://github.com/ClickHouse/ClickHouse/pull/5842) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix a race condition that could cause some queries not to appear in query_log after a `SYSTEM FLUSH LOGS` query. [#5456](https://github.com/ClickHouse/ClickHouse/issues/5456) [#5685](https://github.com/ClickHouse/ClickHouse/pull/5685) ([Anton Popov](https://github.com/CurtizJ)) +* Fixed `heap-use-after-free` ASan warning in ClusterCopier caused by a watch that tried to use an already removed copier object. [#5871](https://github.com/ClickHouse/ClickHouse/pull/5871) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Fixed wrong `StringRef` pointer returned by some implementations of `IColumn::deserializeAndInsertFromArena`. This bug affected only unit-tests. [#5973](https://github.com/ClickHouse/ClickHouse/pull/5973) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Prevent source and intermediate array join columns from masking same-name columns. [#5941](https://github.com/ClickHouse/ClickHouse/pull/5941) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix insert and select queries to the MySQL engine with MySQL-style identifier quoting. [#5704](https://github.com/ClickHouse/ClickHouse/pull/5704) ([Winter Zhang](https://github.com/zhang2014)) +* Now `CHECK TABLE` query can work with the MergeTree engine family. It returns check status and message if any for each part (or file in case of simpler engines). Also, fix a bug in the fetch of a broken part. [#5865](https://github.com/ClickHouse/ClickHouse/pull/5865) ([alesapin](https://github.com/alesapin)) +* Fix SPLIT_SHARED_LIBRARIES runtime [#5793](https://github.com/ClickHouse/ClickHouse/pull/5793) ([Danila Kutenin](https://github.com/danlark1)) +* Fixed time zone initialization when `/etc/localtime` is a relative symlink like `../usr/share/zoneinfo/Europe/Moscow` [#5922](https://github.com/ClickHouse/ClickHouse/pull/5922) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* clickhouse-copier: Fix use-after-free on shutdown [#5752](https://github.com/ClickHouse/ClickHouse/pull/5752) ([proller](https://github.com/proller)) +* Updated `simdjson`. Fixed the issue that some invalid JSONs with zero bytes successfully parse. [#5938](https://github.com/ClickHouse/ClickHouse/pull/5938) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix shutdown of SystemLogs [#5802](https://github.com/ClickHouse/ClickHouse/pull/5802) ([Anton Popov](https://github.com/CurtizJ)) +* Fix hanging when the condition in invalidate_query depends on a dictionary. [#6011](https://github.com/ClickHouse/ClickHouse/pull/6011) ([Vitaly Baranov](https://github.com/vitlibar)) ### Improvement -* Allow unresolvable addresses in cluster configuration. They will be considered unavailable and tried to resolve at every connection attempt. This is especially useful for Kubernetes. This fixes [#5714](https://github.com/yandex/ClickHouse/issues/5714) [#5924](https://github.com/yandex/ClickHouse/pull/5924) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Close idle TCP connections (with one hour timeout by default). This is especially important for large clusters with multiple distributed tables on every server, because every server can possibly keep a connection pool to every other server, and after peak query concurrency, connections will stall. 
This fixes [#5879](https://github.com/yandex/ClickHouse/issues/5879) [#5880](https://github.com/yandex/ClickHouse/pull/5880) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Better quality of `topK` function. Changed the SavingSpace set behavior to remove the last element if the new element have a bigger weight. [#5833](https://github.com/yandex/ClickHouse/issues/5833) [#5850](https://github.com/yandex/ClickHouse/pull/5850) ([Guillaume Tassery](https://github.com/YiuRULE)) -* URL functions to work with domains now can work for incomplete URLs without scheme [#5725](https://github.com/yandex/ClickHouse/pull/5725) ([alesapin](https://github.com/alesapin)) -* Checksums added to the `system.parts_columns` table. [#5874](https://github.com/yandex/ClickHouse/pull/5874) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) -* Added `Enum` data type as a synonim for `Enum8` or `Enum16`. [#5886](https://github.com/yandex/ClickHouse/pull/5886) ([dimarub2000](https://github.com/dimarub2000)) -* Full bit transpose variant for `T64` codec. Could lead to better compression with `zstd`. [#5742](https://github.com/yandex/ClickHouse/pull/5742) ([Artem Zuikov](https://github.com/4ertus2)) -* Condition on `startsWith` function now can uses primary key. This fixes [#5310](https://github.com/yandex/ClickHouse/issues/5310) and [#5882](https://github.com/yandex/ClickHouse/issues/5882) [#5919](https://github.com/yandex/ClickHouse/pull/5919) ([dimarub2000](https://github.com/dimarub2000)) -* Allow to use `clickhouse-copier` with cross-replication cluster topology by permitting empty database name. [#5745](https://github.com/yandex/ClickHouse/pull/5745) ([nvartolomei](https://github.com/nvartolomei)) -* Use `UTC` as default timezone on a system without `tzdata` (e.g. bare Docker container). Before this patch, error message `Could not determine local time zone` was printed and server or client refused to start. [#5827](https://github.com/yandex/ClickHouse/pull/5827) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Returned back support for floating point argument in function `quantileTiming` for backward compatibility. [#5911](https://github.com/yandex/ClickHouse/pull/5911) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Show which table is missing column in error messages. [#5768](https://github.com/yandex/ClickHouse/pull/5768) ([Ivan](https://github.com/abyss7)) -* Disallow run query with same query_id by various users [#5430](https://github.com/yandex/ClickHouse/pull/5430) ([proller](https://github.com/proller)) -* More robust code for sending metrics to Graphite. It will work even during long multiple `RENAME TABLE` operation. [#5875](https://github.com/yandex/ClickHouse/pull/5875) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* More informative error messages will be displayed when ThreadPool cannot schedule a task for execution. 
This fixes [#5305](https://github.com/yandex/ClickHouse/issues/5305) [#5801](https://github.com/yandex/ClickHouse/pull/5801) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Inverting ngramSearch to be more intuitive [#5807](https://github.com/yandex/ClickHouse/pull/5807) ([Danila Kutenin](https://github.com/danlark1)) -* Add user parsing in HDFS engine builder [#5946](https://github.com/yandex/ClickHouse/pull/5946) ([akonyaev90](https://github.com/akonyaev90)) -* Update default value of `max_ast_elements parameter` [#5933](https://github.com/yandex/ClickHouse/pull/5933) ([Artem Konovalov](https://github.com/izebit)) -* Added a notion of obsolete settings. The obsolete setting `allow_experimental_low_cardinality_type` can be used with no effect. [0f15c01c6802f7ce1a1494c12c846be8c98944cd](https://github.com/yandex/ClickHouse/commit/0f15c01c6802f7ce1a1494c12c846be8c98944cd) [Alexey Milovidov](https://github.com/alexey-milovidov) +* Allow unresolvable addresses in cluster configuration. They will be considered unavailable, and resolution will be retried at every connection attempt. This is especially useful for Kubernetes. This fixes [#5714](https://github.com/ClickHouse/ClickHouse/issues/5714) [#5924](https://github.com/ClickHouse/ClickHouse/pull/5924) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Close idle TCP connections (with one hour timeout by default). This is especially important for large clusters with multiple distributed tables on every server, because every server can possibly keep a connection pool to every other server, and after peak query concurrency, connections will stall. This fixes [#5879](https://github.com/ClickHouse/ClickHouse/issues/5879) [#5880](https://github.com/ClickHouse/ClickHouse/pull/5880) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Better quality of the `topK` function. Changed the SpaceSaving set behavior to remove the last element if the new element has a bigger weight. [#5833](https://github.com/ClickHouse/ClickHouse/issues/5833) [#5850](https://github.com/ClickHouse/ClickHouse/pull/5850) ([Guillaume Tassery](https://github.com/YiuRULE)) +* URL functions that work with domains can now handle incomplete URLs without a scheme [#5725](https://github.com/ClickHouse/ClickHouse/pull/5725) ([alesapin](https://github.com/alesapin)) +* Checksums added to the `system.parts_columns` table. [#5874](https://github.com/ClickHouse/ClickHouse/pull/5874) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)) +* Added `Enum` data type as a synonym for `Enum8` or `Enum16`. [#5886](https://github.com/ClickHouse/ClickHouse/pull/5886) ([dimarub2000](https://github.com/dimarub2000)) +* Full bit transpose variant for `T64` codec. Could lead to better compression with `zstd`. [#5742](https://github.com/ClickHouse/ClickHouse/pull/5742) ([Artem Zuikov](https://github.com/4ertus2)) +* A condition on the `startsWith` function can now use the primary key. This fixes [#5310](https://github.com/ClickHouse/ClickHouse/issues/5310) and [#5882](https://github.com/ClickHouse/ClickHouse/issues/5882) [#5919](https://github.com/ClickHouse/ClickHouse/pull/5919) ([dimarub2000](https://github.com/dimarub2000)) +* Allow to use `clickhouse-copier` with cross-replication cluster topology by permitting an empty database name. [#5745](https://github.com/ClickHouse/ClickHouse/pull/5745) ([nvartolomei](https://github.com/nvartolomei)) +* Use `UTC` as default timezone on a system without `tzdata` (e.g. bare Docker container). 
Before this patch, error message `Could not determine local time zone` was printed and server or client refused to start. [#5827](https://github.com/ClickHouse/ClickHouse/pull/5827) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Restored support for floating point argument in function `quantileTiming` for backward compatibility. [#5911](https://github.com/ClickHouse/ClickHouse/pull/5911) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Show which table is missing a column in error messages. [#5768](https://github.com/ClickHouse/ClickHouse/pull/5768) ([Ivan](https://github.com/abyss7)) +* Disallow running queries with the same query_id by various users [#5430](https://github.com/ClickHouse/ClickHouse/pull/5430) ([proller](https://github.com/proller)) +* More robust code for sending metrics to Graphite. It will work even during long multiple `RENAME TABLE` operations. [#5875](https://github.com/ClickHouse/ClickHouse/pull/5875) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* More informative error messages will be displayed when ThreadPool cannot schedule a task for execution. This fixes [#5305](https://github.com/ClickHouse/ClickHouse/issues/5305) [#5801](https://github.com/ClickHouse/ClickHouse/pull/5801) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Inverting ngramSearch to be more intuitive [#5807](https://github.com/ClickHouse/ClickHouse/pull/5807) ([Danila Kutenin](https://github.com/danlark1)) +* Add user parsing in HDFS engine builder [#5946](https://github.com/ClickHouse/ClickHouse/pull/5946) ([akonyaev90](https://github.com/akonyaev90)) +* Update default value of the `max_ast_elements` parameter [#5933](https://github.com/ClickHouse/ClickHouse/pull/5933) ([Artem Konovalov](https://github.com/izebit)) +* Added a notion of obsolete settings. The obsolete setting `allow_experimental_low_cardinality_type` can be used with no effect. [0f15c01c6802f7ce1a1494c12c846be8c98944cd](https://github.com/ClickHouse/ClickHouse/commit/0f15c01c6802f7ce1a1494c12c846be8c98944cd) [Alexey Milovidov](https://github.com/alexey-milovidov) ### Performance Improvement -* Increase number of streams to SELECT from Merge table for more uniform distribution of threads. Added setting `max_streams_multiplier_for_merge_tables`. This fixes [#5797](https://github.com/yandex/ClickHouse/issues/5797) [#5915](https://github.com/yandex/ClickHouse/pull/5915) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Increase number of streams to SELECT from Merge table for more uniform distribution of threads. Added setting `max_streams_multiplier_for_merge_tables`. This fixes [#5797](https://github.com/ClickHouse/ClickHouse/issues/5797) [#5915](https://github.com/ClickHouse/ClickHouse/pull/5915) ([alexey-milovidov](https://github.com/alexey-milovidov)) ### Build/Testing/Packaging Improvement -* Add a backward compatibility test for client-server interaction with different versions of clickhouse. [#5868](https://github.com/yandex/ClickHouse/pull/5868) ([alesapin](https://github.com/alesapin)) -* Test coverage information in every commit and pull request. [#5896](https://github.com/yandex/ClickHouse/pull/5896) ([alesapin](https://github.com/alesapin)) -* Cooperate with address sanitizer to support our custom allocators (`Arena` and `ArenaWithFreeLists`) for better debugging of "use-after-free" errors. 
[#5728](https://github.com/yandex/ClickHouse/pull/5728) ([akuzm](https://github.com/akuzm)) -* Switch to [LLVM libunwind implementation](https://github.com/llvm-mirror/libunwind) for C++ exception handling and for stack traces printing [#4828](https://github.com/yandex/ClickHouse/pull/4828) ([Nikita Lapkov](https://github.com/laplab)) -* Add two more warnings from -Weverything [#5923](https://github.com/yandex/ClickHouse/pull/5923) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Allow to build ClickHouse with Memory Sanitizer. [#3949](https://github.com/yandex/ClickHouse/pull/3949) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed ubsan report about `bitTest` function in fuzz test. [#5943](https://github.com/yandex/ClickHouse/pull/5943) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Docker: added possibility to init a ClickHouse instance which requires authentication. [#5727](https://github.com/yandex/ClickHouse/pull/5727) ([Korviakov Andrey](https://github.com/shurshun)) -* Update librdkafka to version 1.1.0 [#5872](https://github.com/yandex/ClickHouse/pull/5872) ([Ivan](https://github.com/abyss7)) -* Add global timeout for integration tests and disable some of them in tests code. [#5741](https://github.com/yandex/ClickHouse/pull/5741) ([alesapin](https://github.com/alesapin)) -* Fix some ThreadSanitizer failures. [#5854](https://github.com/yandex/ClickHouse/pull/5854) ([akuzm](https://github.com/akuzm)) -* The `--no-undefined` option forces the linker to check all external names for existence while linking. It's very useful to track real dependencies between libraries in the split build mode. [#5855](https://github.com/yandex/ClickHouse/pull/5855) ([Ivan](https://github.com/abyss7)) -* Added performance test for [#5797](https://github.com/yandex/ClickHouse/issues/5797) [#5914](https://github.com/yandex/ClickHouse/pull/5914) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed compatibility with gcc-7. [#5840](https://github.com/yandex/ClickHouse/pull/5840) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added support for gcc-9. This fixes [#5717](https://github.com/yandex/ClickHouse/issues/5717) [#5774](https://github.com/yandex/ClickHouse/pull/5774) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed error when libunwind can be linked incorrectly. [#5948](https://github.com/yandex/ClickHouse/pull/5948) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed a few warnings found by PVS-Studio. [#5921](https://github.com/yandex/ClickHouse/pull/5921) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added initial support for `clang-tidy` static analyzer. [#5806](https://github.com/yandex/ClickHouse/pull/5806) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Convert BSD/Linux endian macros( 'be64toh' and 'htobe64') to the Mac OS X equivalents [#5785](https://github.com/yandex/ClickHouse/pull/5785) ([Fu Chen](https://github.com/fredchenbj)) -* Improved integration tests guide. [#5796](https://github.com/yandex/ClickHouse/pull/5796) ([Vladimir Chebotarev](https://github.com/excitoon)) -* Fixing build at macosx + gcc9 [#5822](https://github.com/yandex/ClickHouse/pull/5822) ([filimonov](https://github.com/filimonov)) -* Fix a hard-to-spot typo: aggreAGte -> aggregate. 
[#5753](https://github.com/yandex/ClickHouse/pull/5753) ([akuzm](https://github.com/akuzm)) -* Fix freebsd build [#5760](https://github.com/yandex/ClickHouse/pull/5760) ([proller](https://github.com/proller)) -* Add link to experimental YouTube channel to website [#5845](https://github.com/yandex/ClickHouse/pull/5845) ([Ivan Blinkov](https://github.com/blinkov)) -* CMake: add option for coverage flags: WITH_COVERAGE [#5776](https://github.com/yandex/ClickHouse/pull/5776) ([proller](https://github.com/proller)) -* Fix initial size of some inline PODArray's. [#5787](https://github.com/yandex/ClickHouse/pull/5787) ([akuzm](https://github.com/akuzm)) -* clickhouse-server.postinst: fix os detection for centos 6 [#5788](https://github.com/yandex/ClickHouse/pull/5788) ([proller](https://github.com/proller)) -* Added Arch linux package generation. [#5719](https://github.com/yandex/ClickHouse/pull/5719) ([Vladimir Chebotarev](https://github.com/excitoon)) -* Split Common/config.h by libs (dbms) [#5715](https://github.com/yandex/ClickHouse/pull/5715) ([proller](https://github.com/proller)) -* Fixes for "Arcadia" build platform [#5795](https://github.com/yandex/ClickHouse/pull/5795) ([proller](https://github.com/proller)) -* Fixes for unconventional build (gcc9, no submodules) [#5792](https://github.com/yandex/ClickHouse/pull/5792) ([proller](https://github.com/proller)) -* Require explicit type in unalignedStore because it was proven to be bug-prone [#5791](https://github.com/yandex/ClickHouse/pull/5791) ([akuzm](https://github.com/akuzm)) -* Fixes MacOS build [#5830](https://github.com/yandex/ClickHouse/pull/5830) ([filimonov](https://github.com/filimonov)) -* Performance test concerning the new JIT feature with bigger dataset, as requested here [#5263](https://github.com/yandex/ClickHouse/issues/5263) [#5887](https://github.com/yandex/ClickHouse/pull/5887) ([Guillaume Tassery](https://github.com/YiuRULE)) -* Run stateful tests in stress test [12693e568722f11e19859742f56428455501fd2a](https://github.com/yandex/ClickHouse/commit/12693e568722f11e19859742f56428455501fd2a) ([alesapin](https://github.com/alesapin)) +* Add a backward compatibility test for client-server interaction with different versions of clickhouse. [#5868](https://github.com/ClickHouse/ClickHouse/pull/5868) ([alesapin](https://github.com/alesapin)) +* Test coverage information in every commit and pull request. [#5896](https://github.com/ClickHouse/ClickHouse/pull/5896) ([alesapin](https://github.com/alesapin)) +* Cooperate with address sanitizer to support our custom allocators (`Arena` and `ArenaWithFreeLists`) for better debugging of "use-after-free" errors. [#5728](https://github.com/ClickHouse/ClickHouse/pull/5728) ([akuzm](https://github.com/akuzm)) +* Switch to [LLVM libunwind implementation](https://github.com/llvm-mirror/libunwind) for C++ exception handling and for stack traces printing [#4828](https://github.com/ClickHouse/ClickHouse/pull/4828) ([Nikita Lapkov](https://github.com/laplab)) +* Add two more warnings from -Weverything [#5923](https://github.com/ClickHouse/ClickHouse/pull/5923) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Allow to build ClickHouse with Memory Sanitizer. [#3949](https://github.com/ClickHouse/ClickHouse/pull/3949) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed ubsan report about `bitTest` function in fuzz test. 
[#5943](https://github.com/ClickHouse/ClickHouse/pull/5943) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Docker: added possibility to init a ClickHouse instance which requires authentication. [#5727](https://github.com/ClickHouse/ClickHouse/pull/5727) ([Korviakov Andrey](https://github.com/shurshun)) +* Update librdkafka to version 1.1.0 [#5872](https://github.com/ClickHouse/ClickHouse/pull/5872) ([Ivan](https://github.com/abyss7)) +* Add global timeout for integration tests and disable some of them in tests code. [#5741](https://github.com/ClickHouse/ClickHouse/pull/5741) ([alesapin](https://github.com/alesapin)) +* Fix some ThreadSanitizer failures. [#5854](https://github.com/ClickHouse/ClickHouse/pull/5854) ([akuzm](https://github.com/akuzm)) +* The `--no-undefined` option forces the linker to check all external names for existence while linking. It's very useful to track real dependencies between libraries in the split build mode. [#5855](https://github.com/ClickHouse/ClickHouse/pull/5855) ([Ivan](https://github.com/abyss7)) +* Added performance test for [#5797](https://github.com/ClickHouse/ClickHouse/issues/5797) [#5914](https://github.com/ClickHouse/ClickHouse/pull/5914) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed compatibility with gcc-7. [#5840](https://github.com/ClickHouse/ClickHouse/pull/5840) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added support for gcc-9. This fixes [#5717](https://github.com/ClickHouse/ClickHouse/issues/5717) [#5774](https://github.com/ClickHouse/ClickHouse/pull/5774) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed error when libunwind can be linked incorrectly. [#5948](https://github.com/ClickHouse/ClickHouse/pull/5948) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed a few warnings found by PVS-Studio. [#5921](https://github.com/ClickHouse/ClickHouse/pull/5921) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added initial support for `clang-tidy` static analyzer. [#5806](https://github.com/ClickHouse/ClickHouse/pull/5806) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Convert BSD/Linux endian macros( 'be64toh' and 'htobe64') to the Mac OS X equivalents [#5785](https://github.com/ClickHouse/ClickHouse/pull/5785) ([Fu Chen](https://github.com/fredchenbj)) +* Improved integration tests guide. [#5796](https://github.com/ClickHouse/ClickHouse/pull/5796) ([Vladimir Chebotarev](https://github.com/excitoon)) +* Fixing build at macosx + gcc9 [#5822](https://github.com/ClickHouse/ClickHouse/pull/5822) ([filimonov](https://github.com/filimonov)) +* Fix a hard-to-spot typo: aggreAGte -> aggregate. [#5753](https://github.com/ClickHouse/ClickHouse/pull/5753) ([akuzm](https://github.com/akuzm)) +* Fix freebsd build [#5760](https://github.com/ClickHouse/ClickHouse/pull/5760) ([proller](https://github.com/proller)) +* Add link to experimental YouTube channel to website [#5845](https://github.com/ClickHouse/ClickHouse/pull/5845) ([Ivan Blinkov](https://github.com/blinkov)) +* CMake: add option for coverage flags: WITH_COVERAGE [#5776](https://github.com/ClickHouse/ClickHouse/pull/5776) ([proller](https://github.com/proller)) +* Fix initial size of some inline PODArray's. 
[#5787](https://github.com/ClickHouse/ClickHouse/pull/5787) ([akuzm](https://github.com/akuzm)) +* clickhouse-server.postinst: fix os detection for centos 6 [#5788](https://github.com/ClickHouse/ClickHouse/pull/5788) ([proller](https://github.com/proller)) +* Added Arch linux package generation. [#5719](https://github.com/ClickHouse/ClickHouse/pull/5719) ([Vladimir Chebotarev](https://github.com/excitoon)) +* Split Common/config.h by libs (dbms) [#5715](https://github.com/ClickHouse/ClickHouse/pull/5715) ([proller](https://github.com/proller)) +* Fixes for "Arcadia" build platform [#5795](https://github.com/ClickHouse/ClickHouse/pull/5795) ([proller](https://github.com/proller)) +* Fixes for unconventional build (gcc9, no submodules) [#5792](https://github.com/ClickHouse/ClickHouse/pull/5792) ([proller](https://github.com/proller)) +* Require explicit type in unalignedStore because it was proven to be bug-prone [#5791](https://github.com/ClickHouse/ClickHouse/pull/5791) ([akuzm](https://github.com/akuzm)) +* Fixes MacOS build [#5830](https://github.com/ClickHouse/ClickHouse/pull/5830) ([filimonov](https://github.com/filimonov)) +* Performance test concerning the new JIT feature with bigger dataset, as requested here [#5263](https://github.com/ClickHouse/ClickHouse/issues/5263) [#5887](https://github.com/ClickHouse/ClickHouse/pull/5887) ([Guillaume Tassery](https://github.com/YiuRULE)) +* Run stateful tests in stress test [12693e568722f11e19859742f56428455501fd2a](https://github.com/ClickHouse/ClickHouse/commit/12693e568722f11e19859742f56428455501fd2a) ([alesapin](https://github.com/alesapin)) ### Backward Incompatible Change * `Kafka` is broken in this version. -* Enable `adaptive_index_granularity` = 10MB by default for new `MergeTree` tables. If you created new MergeTree tables on version 19.11+, downgrade to versions prior to 19.6 will be impossible. [#5628](https://github.com/yandex/ClickHouse/pull/5628) ([alesapin](https://github.com/alesapin)) -* Removed obsolete undocumented embedded dictionaries that were used by Yandex.Metrica. The functions `OSIn`, `SEIn`, `OSToRoot`, `SEToRoot`, `OSHierarchy`, `SEHierarchy` are no longer available. If you are using these functions, write email to clickhouse-feedback@yandex-team.com. Note: at the last moment we decided to keep these functions for a while. [#5780](https://github.com/yandex/ClickHouse/pull/5780) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Enable `adaptive_index_granularity` = 10MB by default for new `MergeTree` tables. If you created new MergeTree tables on version 19.11+, downgrade to versions prior to 19.6 will be impossible. [#5628](https://github.com/ClickHouse/ClickHouse/pull/5628) ([alesapin](https://github.com/alesapin)) +* Removed obsolete undocumented embedded dictionaries that were used by Yandex.Metrica. The functions `OSIn`, `SEIn`, `OSToRoot`, `SEToRoot`, `OSHierarchy`, `SEHierarchy` are no longer available. If you are using these functions, write email to clickhouse-feedback@yandex-team.com. Note: at the last moment we decided to keep these functions for a while. [#5780](https://github.com/ClickHouse/ClickHouse/pull/5780) ([alexey-milovidov](https://github.com/alexey-milovidov)) ## ClickHouse release 19.10.1.5, 2019-07-12 ### New Feature -* Add new column codec: `T64`. Made for (U)IntX/EnumX/Data(Time)/DecimalX columns. It should be good for columns with constant or small range values. Codec itself allows enlarge or shrink data type without re-compression. 
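To make the `T64` codec entry above concrete, here is a minimal sketch of attaching the codec to a column. The table name `t64_demo` and the chaining with `LZ4` are illustrative assumptions, not part of the changelog entry:

```sql
-- Hypothetical table: T64 bit-transposes integer values before the
-- general-purpose codec, which suits columns with a small value range.
CREATE TABLE t64_demo
(
    event_date Date,
    status UInt16 CODEC(T64, LZ4)
)
ENGINE = MergeTree
ORDER BY event_date;

INSERT INTO t64_demo VALUES ('2019-07-01', 200), ('2019-07-01', 404);
```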
[#5557](https://github.com/yandex/ClickHouse/pull/5557) ([Artem Zuikov](https://github.com/4ertus2)) -* Add database engine `MySQL` that allow to view all the tables in remote MySQL server [#5599](https://github.com/yandex/ClickHouse/pull/5599) ([Winter Zhang](https://github.com/zhang2014)) -* `bitmapContains` implementation. It's 2x faster than `bitmapHasAny` if the second bitmap contains one element. [#5535](https://github.com/yandex/ClickHouse/pull/5535) ([Zhichang Yu](https://github.com/yuzhichang)) -* Support for `crc32` function (with behaviour exactly as in MySQL or PHP). Do not use it if you need a hash function. [#5661](https://github.com/yandex/ClickHouse/pull/5661) ([Remen Ivan](https://github.com/BHYCHIK)) -* Implemented `SYSTEM START/STOP DISTRIBUTED SENDS` queries to control asynchronous inserts into `Distributed` tables. [#4935](https://github.com/yandex/ClickHouse/pull/4935) ([Winter Zhang](https://github.com/zhang2014)) +* Add new column codec: `T64`. Made for (U)IntX/EnumX/Data(Time)/DecimalX columns. It should be good for columns with constant or small range values. Codec itself allows enlarge or shrink data type without re-compression. [#5557](https://github.com/ClickHouse/ClickHouse/pull/5557) ([Artem Zuikov](https://github.com/4ertus2)) +* Add database engine `MySQL` that allow to view all the tables in remote MySQL server [#5599](https://github.com/ClickHouse/ClickHouse/pull/5599) ([Winter Zhang](https://github.com/zhang2014)) +* `bitmapContains` implementation. It's 2x faster than `bitmapHasAny` if the second bitmap contains one element. [#5535](https://github.com/ClickHouse/ClickHouse/pull/5535) ([Zhichang Yu](https://github.com/yuzhichang)) +* Support for `crc32` function (with behaviour exactly as in MySQL or PHP). Do not use it if you need a hash function. [#5661](https://github.com/ClickHouse/ClickHouse/pull/5661) ([Remen Ivan](https://github.com/BHYCHIK)) +* Implemented `SYSTEM START/STOP DISTRIBUTED SENDS` queries to control asynchronous inserts into `Distributed` tables. [#4935](https://github.com/ClickHouse/ClickHouse/pull/4935) ([Winter Zhang](https://github.com/zhang2014)) ### Bug Fix -* Ignore query execution limits and max parts size for merge limits while executing mutations. [#5659](https://github.com/yandex/ClickHouse/pull/5659) ([Anton Popov](https://github.com/CurtizJ)) -* Fix bug which may lead to deduplication of normal blocks (extremely rare) and insertion of duplicate blocks (more often). [#5549](https://github.com/yandex/ClickHouse/pull/5549) ([alesapin](https://github.com/alesapin)) -* Fix of function `arrayEnumerateUniqRanked` for arguments with empty arrays [#5559](https://github.com/yandex/ClickHouse/pull/5559) ([proller](https://github.com/proller)) -* Don't subscribe to Kafka topics without intent to poll any messages. [#5698](https://github.com/yandex/ClickHouse/pull/5698) ([Ivan](https://github.com/abyss7)) -* Make setting `join_use_nulls` get no effect for types that cannot be inside Nullable [#5700](https://github.com/yandex/ClickHouse/pull/5700) ([Olga Khvostikova](https://github.com/stavrolia)) -* Fixed `Incorrect size of index granularity` errors [#5720](https://github.com/yandex/ClickHouse/pull/5720) ([coraxster](https://github.com/coraxster)) -* Fix Float to Decimal convert overflow [#5607](https://github.com/yandex/ClickHouse/pull/5607) ([coraxster](https://github.com/coraxster)) -* Flush buffer when `WriteBufferFromHDFS`'s destructor is called. This fixes writing into `HDFS`. 
[#5684](https://github.com/yandex/ClickHouse/pull/5684) ([Xindong Peng](https://github.com/eejoin)) +* Ignore query execution limits and max parts size for merge limits while executing mutations. [#5659](https://github.com/ClickHouse/ClickHouse/pull/5659) ([Anton Popov](https://github.com/CurtizJ)) +* Fix bug which may lead to deduplication of normal blocks (extremely rare) and insertion of duplicate blocks (more often). [#5549](https://github.com/ClickHouse/ClickHouse/pull/5549) ([alesapin](https://github.com/alesapin)) +* Fix of function `arrayEnumerateUniqRanked` for arguments with empty arrays [#5559](https://github.com/ClickHouse/ClickHouse/pull/5559) ([proller](https://github.com/proller)) +* Don't subscribe to Kafka topics without intent to poll any messages. [#5698](https://github.com/ClickHouse/ClickHouse/pull/5698) ([Ivan](https://github.com/abyss7)) +* Make setting `join_use_nulls` get no effect for types that cannot be inside Nullable [#5700](https://github.com/ClickHouse/ClickHouse/pull/5700) ([Olga Khvostikova](https://github.com/stavrolia)) +* Fixed `Incorrect size of index granularity` errors [#5720](https://github.com/ClickHouse/ClickHouse/pull/5720) ([coraxster](https://github.com/coraxster)) +* Fix Float to Decimal convert overflow [#5607](https://github.com/ClickHouse/ClickHouse/pull/5607) ([coraxster](https://github.com/coraxster)) +* Flush buffer when `WriteBufferFromHDFS`'s destructor is called. This fixes writing into `HDFS`. [#5684](https://github.com/ClickHouse/ClickHouse/pull/5684) ([Xindong Peng](https://github.com/eejoin)) ### Improvement -* Treat empty cells in `CSV` as default values when the setting `input_format_defaults_for_omitted_fields` is enabled. [#5625](https://github.com/yandex/ClickHouse/pull/5625) ([akuzm](https://github.com/akuzm)) -* Non-blocking loading of external dictionaries. [#5567](https://github.com/yandex/ClickHouse/pull/5567) ([Vitaly Baranov](https://github.com/vitlibar)) -* Network timeouts can be dynamically changed for already established connections according to the settings. [#4558](https://github.com/yandex/ClickHouse/pull/4558) ([Konstantin Podshumok](https://github.com/podshumok)) -* Using "public_suffix_list" for functions `firstSignificantSubdomain`, `cutToFirstSignificantSubdomain`. It's using a perfect hash table generated by `gperf` with a list generated from the file: [https://publicsuffix.org/list/public_suffix_list.dat](https://publicsuffix.org/list/public_suffix_list.dat). (for example, now we recognize the domain `ac.uk` as non-significant). [#5030](https://github.com/yandex/ClickHouse/pull/5030) ([Guillaume Tassery](https://github.com/YiuRULE)) -* Adopted `IPv6` data type in system tables; unified client info columns in `system.processes` and `system.query_log` [#5640](https://github.com/yandex/ClickHouse/pull/5640) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Using sessions for connections with MySQL compatibility protocol. #5476 [#5646](https://github.com/yandex/ClickHouse/pull/5646) ([Yuriy Baranov](https://github.com/yurriy)) -* Support more `ALTER` queries `ON CLUSTER`. [#5593](https://github.com/yandex/ClickHouse/pull/5593) [#5613](https://github.com/yandex/ClickHouse/pull/5613) ([sundyli](https://github.com/sundy-li)) -* Support `` section in `clickhouse-local` config file. 
[#5540](https://github.com/yandex/ClickHouse/pull/5540) ([proller](https://github.com/proller)) -* Allow run query with `remote` table function in `clickhouse-local` [#5627](https://github.com/yandex/ClickHouse/pull/5627) ([proller](https://github.com/proller)) +* Treat empty cells in `CSV` as default values when the setting `input_format_defaults_for_omitted_fields` is enabled. [#5625](https://github.com/ClickHouse/ClickHouse/pull/5625) ([akuzm](https://github.com/akuzm)) +* Non-blocking loading of external dictionaries. [#5567](https://github.com/ClickHouse/ClickHouse/pull/5567) ([Vitaly Baranov](https://github.com/vitlibar)) +* Network timeouts can be dynamically changed for already established connections according to the settings. [#4558](https://github.com/ClickHouse/ClickHouse/pull/4558) ([Konstantin Podshumok](https://github.com/podshumok)) +* Using "public_suffix_list" for functions `firstSignificantSubdomain`, `cutToFirstSignificantSubdomain`. It's using a perfect hash table generated by `gperf` with a list generated from the file: [https://publicsuffix.org/list/public_suffix_list.dat](https://publicsuffix.org/list/public_suffix_list.dat). (for example, now we recognize the domain `ac.uk` as non-significant). [#5030](https://github.com/ClickHouse/ClickHouse/pull/5030) ([Guillaume Tassery](https://github.com/YiuRULE)) +* Adopted `IPv6` data type in system tables; unified client info columns in `system.processes` and `system.query_log` [#5640](https://github.com/ClickHouse/ClickHouse/pull/5640) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Using sessions for connections with MySQL compatibility protocol. #5476 [#5646](https://github.com/ClickHouse/ClickHouse/pull/5646) ([Yuriy Baranov](https://github.com/yurriy)) +* Support more `ALTER` queries `ON CLUSTER`. [#5593](https://github.com/ClickHouse/ClickHouse/pull/5593) [#5613](https://github.com/ClickHouse/ClickHouse/pull/5613) ([sundyli](https://github.com/sundy-li)) +* Support `` section in `clickhouse-local` config file. [#5540](https://github.com/ClickHouse/ClickHouse/pull/5540) ([proller](https://github.com/proller)) +* Allow run query with `remote` table function in `clickhouse-local` [#5627](https://github.com/ClickHouse/ClickHouse/pull/5627) ([proller](https://github.com/proller)) ### Performance Improvement -* Add the possibility to write the final mark at the end of MergeTree columns. It allows to avoid useless reads for keys that are out of table data range. It is enabled only if adaptive index granularity is in use. [#5624](https://github.com/yandex/ClickHouse/pull/5624) ([alesapin](https://github.com/alesapin)) -* Improved performance of MergeTree tables on very slow filesystems by reducing number of `stat` syscalls. [#5648](https://github.com/yandex/ClickHouse/pull/5648) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed performance degradation in reading from MergeTree tables that was introduced in version 19.6. Fixes #5631. [#5633](https://github.com/yandex/ClickHouse/pull/5633) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Add the possibility to write the final mark at the end of MergeTree columns. It allows to avoid useless reads for keys that are out of table data range. It is enabled only if adaptive index granularity is in use. [#5624](https://github.com/ClickHouse/ClickHouse/pull/5624) ([alesapin](https://github.com/alesapin)) +* Improved performance of MergeTree tables on very slow filesystems by reducing number of `stat` syscalls. 
[#5648](https://github.com/ClickHouse/ClickHouse/pull/5648) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed performance degradation in reading from MergeTree tables that was introduced in version 19.6. Fixes #5631. [#5633](https://github.com/ClickHouse/ClickHouse/pull/5633) ([alexey-milovidov](https://github.com/alexey-milovidov)) ### Build/Testing/Packaging Improvement -* Implemented `TestKeeper` as an implementation of ZooKeeper interface used for testing [#5643](https://github.com/yandex/ClickHouse/pull/5643) ([alexey-milovidov](https://github.com/alexey-milovidov)) ([levushkin aleksej](https://github.com/alexey-milovidov)) -* From now on `.sql` tests can be run isolated by server, in parallel, with random database. It allows to run them faster, add new tests with custom server configurations, and be sure that different tests doesn't affect each other. [#5554](https://github.com/yandex/ClickHouse/pull/5554) ([Ivan](https://github.com/abyss7)) -* Remove `` and `` from performance tests [#5672](https://github.com/yandex/ClickHouse/pull/5672) ([Olga Khvostikova](https://github.com/stavrolia)) -* Fixed "select_format" performance test for `Pretty` formats [#5642](https://github.com/yandex/ClickHouse/pull/5642) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Implemented `TestKeeper` as an implementation of ZooKeeper interface used for testing [#5643](https://github.com/ClickHouse/ClickHouse/pull/5643) ([alexey-milovidov](https://github.com/alexey-milovidov)) ([levushkin aleksej](https://github.com/alexey-milovidov)) +* From now on `.sql` tests can be run isolated by server, in parallel, with random database. It allows to run them faster, add new tests with custom server configurations, and be sure that different tests doesn't affect each other. [#5554](https://github.com/ClickHouse/ClickHouse/pull/5554) ([Ivan](https://github.com/abyss7)) +* Remove `` and `` from performance tests [#5672](https://github.com/ClickHouse/ClickHouse/pull/5672) ([Olga Khvostikova](https://github.com/stavrolia)) +* Fixed "select_format" performance test for `Pretty` formats [#5642](https://github.com/ClickHouse/ClickHouse/pull/5642) ([alexey-milovidov](https://github.com/alexey-milovidov)) ## ClickHouse release 19.9.3.31, 2019-07-05 ### Bug Fix -* Fix segfault in Delta codec which affects columns with values less than 32 bits size. The bug led to random memory corruption. [#5786](https://github.com/yandex/ClickHouse/pull/5786) ([alesapin](https://github.com/alesapin)) -* Fix rare bug in checking of part with LowCardinality column. [#5832](https://github.com/yandex/ClickHouse/pull/5832) ([alesapin](https://github.com/alesapin)) -* Fix segfault in TTL merge with non-physical columns in block. [#5819](https://github.com/yandex/ClickHouse/pull/5819) ([Anton Popov](https://github.com/CurtizJ)) -* Fix potential infinite sleeping of low-priority queries. [#5842](https://github.com/yandex/ClickHouse/pull/5842) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix how ClickHouse determines default time zone as UCT instead of UTC. [#5828](https://github.com/yandex/ClickHouse/pull/5828) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix bug about executing distributed DROP/ALTER/TRUNCATE/OPTIMIZE ON CLUSTER queries on follower replica before leader replica. Now they will be executed directly on leader replica. 
[#5757](https://github.com/yandex/ClickHouse/pull/5757) ([alesapin](https://github.com/alesapin)) -* Fix race condition, which cause that some queries may not appear in query_log instantly after SYSTEM FLUSH LOGS query. [#5685](https://github.com/yandex/ClickHouse/pull/5685) ([Anton Popov](https://github.com/CurtizJ)) -* Added missing support for constant arguments to `evalMLModel` function. [#5820](https://github.com/yandex/ClickHouse/pull/5820) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix segfault in Delta codec which affects columns with values less than 32 bits size. The bug led to random memory corruption. [#5786](https://github.com/ClickHouse/ClickHouse/pull/5786) ([alesapin](https://github.com/alesapin)) +* Fix rare bug in checking of part with LowCardinality column. [#5832](https://github.com/ClickHouse/ClickHouse/pull/5832) ([alesapin](https://github.com/alesapin)) +* Fix segfault in TTL merge with non-physical columns in block. [#5819](https://github.com/ClickHouse/ClickHouse/pull/5819) ([Anton Popov](https://github.com/CurtizJ)) +* Fix potential infinite sleeping of low-priority queries. [#5842](https://github.com/ClickHouse/ClickHouse/pull/5842) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix how ClickHouse determines default time zone as UCT instead of UTC. [#5828](https://github.com/ClickHouse/ClickHouse/pull/5828) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix bug about executing distributed DROP/ALTER/TRUNCATE/OPTIMIZE ON CLUSTER queries on follower replica before leader replica. Now they will be executed directly on leader replica. [#5757](https://github.com/ClickHouse/ClickHouse/pull/5757) ([alesapin](https://github.com/alesapin)) +* Fix race condition, which cause that some queries may not appear in query_log instantly after SYSTEM FLUSH LOGS query. [#5685](https://github.com/ClickHouse/ClickHouse/pull/5685) ([Anton Popov](https://github.com/CurtizJ)) +* Added missing support for constant arguments to `evalMLModel` function. [#5820](https://github.com/ClickHouse/ClickHouse/pull/5820) ([alexey-milovidov](https://github.com/alexey-milovidov)) ## ClickHouse release 19.7.5.29, 2019-07-05 ### Bug Fix -* Fix performance regression in some queries with JOIN. [#5192](https://github.com/yandex/ClickHouse/pull/5192) ([Winter Zhang](https://github.com/zhang2014)) +* Fix performance regression in some queries with JOIN. [#5192](https://github.com/ClickHouse/ClickHouse/pull/5192) ([Winter Zhang](https://github.com/zhang2014)) ## ClickHouse release 19.9.2.4, 2019-06-24 ### New Feature -* Print information about frozen parts in `system.parts` table. [#5471](https://github.com/yandex/ClickHouse/pull/5471) ([proller](https://github.com/proller)) -* Ask client password on clickhouse-client start on tty if not set in arguments [#5092](https://github.com/yandex/ClickHouse/pull/5092) ([proller](https://github.com/proller)) -* Implement `dictGet` and `dictGetOrDefault` functions for Decimal types. [#5394](https://github.com/yandex/ClickHouse/pull/5394) ([Artem Zuikov](https://github.com/4ertus2)) +* Print information about frozen parts in `system.parts` table. [#5471](https://github.com/ClickHouse/ClickHouse/pull/5471) ([proller](https://github.com/proller)) +* Ask client password on clickhouse-client start on tty if not set in arguments [#5092](https://github.com/ClickHouse/ClickHouse/pull/5092) ([proller](https://github.com/proller)) +* Implement `dictGet` and `dictGetOrDefault` functions for Decimal types. 
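As a rough usage sketch for the `dictGet`/`dictGetOrDefault` entry above: assuming an external dictionary named `prices_dict` with a `Decimal(18, 2)` attribute `price` is already configured on the server (both names and the type are hypothetical), the new Decimal support could be exercised like this:

```sql
-- Looks up a Decimal attribute by key; the fallback value for a missing key
-- must have a matching Decimal type.
SELECT
    dictGet('prices_dict', 'price', toUInt64(42)) AS price,
    dictGetOrDefault('prices_dict', 'price', toUInt64(9999), toDecimal64(0, 2)) AS price_or_default;
```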
[#5394](https://github.com/ClickHouse/ClickHouse/pull/5394) ([Artem Zuikov](https://github.com/4ertus2)) ### Improvement -* Debian init: Add service stop timeout [#5522](https://github.com/yandex/ClickHouse/pull/5522) ([proller](https://github.com/proller)) -* Add setting forbidden by default to create table with suspicious types for LowCardinality [#5448](https://github.com/yandex/ClickHouse/pull/5448) ([Olga Khvostikova](https://github.com/stavrolia)) -* Regression functions return model weights when not used as State in function `evalMLMethod`. [#5411](https://github.com/yandex/ClickHouse/pull/5411) ([Quid37](https://github.com/Quid37)) -* Rename and improve regression methods. [#5492](https://github.com/yandex/ClickHouse/pull/5492) ([Quid37](https://github.com/Quid37)) -* Clearer interfaces of string searchers. [#5586](https://github.com/yandex/ClickHouse/pull/5586) ([Danila Kutenin](https://github.com/danlark1)) +* Debian init: Add service stop timeout [#5522](https://github.com/ClickHouse/ClickHouse/pull/5522) ([proller](https://github.com/proller)) +* Add setting forbidden by default to create table with suspicious types for LowCardinality [#5448](https://github.com/ClickHouse/ClickHouse/pull/5448) ([Olga Khvostikova](https://github.com/stavrolia)) +* Regression functions return model weights when not used as State in function `evalMLMethod`. [#5411](https://github.com/ClickHouse/ClickHouse/pull/5411) ([Quid37](https://github.com/Quid37)) +* Rename and improve regression methods. [#5492](https://github.com/ClickHouse/ClickHouse/pull/5492) ([Quid37](https://github.com/Quid37)) +* Clearer interfaces of string searchers. [#5586](https://github.com/ClickHouse/ClickHouse/pull/5586) ([Danila Kutenin](https://github.com/danlark1)) ### Bug Fix -* Fix potential data loss in Kafka [#5445](https://github.com/yandex/ClickHouse/pull/5445) ([Ivan](https://github.com/abyss7)) -* Fix potential infinite loop in `PrettySpace` format when called with zero columns [#5560](https://github.com/yandex/ClickHouse/pull/5560) ([Olga Khvostikova](https://github.com/stavrolia)) -* Fixed UInt32 overflow bug in linear models. Allow eval ML model for non-const model argument. [#5516](https://github.com/yandex/ClickHouse/pull/5516) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* `ALTER TABLE ... DROP INDEX IF EXISTS ...` should not raise an exception if provided index does not exist [#5524](https://github.com/yandex/ClickHouse/pull/5524) ([Gleb Novikov](https://github.com/NanoBjorn)) -* Fix segfault with `bitmapHasAny` in scalar subquery [#5528](https://github.com/yandex/ClickHouse/pull/5528) ([Zhichang Yu](https://github.com/yuzhichang)) -* Fixed error when replication connection pool doesn't retry to resolve host, even when DNS cache was dropped. [#5534](https://github.com/yandex/ClickHouse/pull/5534) ([alesapin](https://github.com/alesapin)) -* Fixed `ALTER ... MODIFY TTL` on ReplicatedMergeTree. [#5539](https://github.com/yandex/ClickHouse/pull/5539) ([Anton Popov](https://github.com/CurtizJ)) -* Fix INSERT into Distributed table with MATERIALIZED column [#5429](https://github.com/yandex/ClickHouse/pull/5429) ([Azat Khuzhin](https://github.com/azat)) -* Fix bad alloc when truncate Join storage [#5437](https://github.com/yandex/ClickHouse/pull/5437) ([TCeason](https://github.com/TCeason)) -* In recent versions of package tzdata some of files are symlinks now. The current mechanism for detecting default timezone gets broken and gives wrong names for some timezones. 
Now at least we force the timezone name to the contents of TZ if provided. [#5443](https://github.com/yandex/ClickHouse/pull/5443) ([Ivan](https://github.com/abyss7)) -* Fix some extremely rare cases with MultiVolnitsky searcher when the constant needles in sum are at least 16KB long. The algorithm missed or overwrote the previous results which can lead to the incorrect result of `multiSearchAny`. [#5588](https://github.com/yandex/ClickHouse/pull/5588) ([Danila Kutenin](https://github.com/danlark1)) -* Fix the issue when settings for ExternalData requests couldn't use ClickHouse settings. Also, for now, settings `date_time_input_format` and `low_cardinality_allow_in_native_format` cannot be used because of the ambiguity of names (in external data it can be interpreted as table format and in the query it can be a setting). [#5455](https://github.com/yandex/ClickHouse/pull/5455) ([Danila Kutenin](https://github.com/danlark1)) -* Fix bug when parts were removed only from FS without dropping them from Zookeeper. [#5520](https://github.com/yandex/ClickHouse/pull/5520) ([alesapin](https://github.com/alesapin)) -* Remove debug logging from MySQL protocol [#5478](https://github.com/yandex/ClickHouse/pull/5478) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Skip ZNONODE during DDL query processing [#5489](https://github.com/yandex/ClickHouse/pull/5489) ([Azat Khuzhin](https://github.com/azat)) -* Fix mix `UNION ALL` result column type. There were cases with inconsistent data and column types of resulting columns. [#5503](https://github.com/yandex/ClickHouse/pull/5503) ([Artem Zuikov](https://github.com/4ertus2)) -* Throw an exception on wrong integers in `dictGetT` functions instead of crash. [#5446](https://github.com/yandex/ClickHouse/pull/5446) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix wrong element_count and load_factor for hashed dictionary in `system.dictionaries` table. [#5440](https://github.com/yandex/ClickHouse/pull/5440) ([Azat Khuzhin](https://github.com/azat)) +* Fix potential data loss in Kafka [#5445](https://github.com/ClickHouse/ClickHouse/pull/5445) ([Ivan](https://github.com/abyss7)) +* Fix potential infinite loop in `PrettySpace` format when called with zero columns [#5560](https://github.com/ClickHouse/ClickHouse/pull/5560) ([Olga Khvostikova](https://github.com/stavrolia)) +* Fixed UInt32 overflow bug in linear models. Allow eval ML model for non-const model argument. [#5516](https://github.com/ClickHouse/ClickHouse/pull/5516) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* `ALTER TABLE ... DROP INDEX IF EXISTS ...` should not raise an exception if provided index does not exist [#5524](https://github.com/ClickHouse/ClickHouse/pull/5524) ([Gleb Novikov](https://github.com/NanoBjorn)) +* Fix segfault with `bitmapHasAny` in scalar subquery [#5528](https://github.com/ClickHouse/ClickHouse/pull/5528) ([Zhichang Yu](https://github.com/yuzhichang)) +* Fixed error when replication connection pool doesn't retry to resolve host, even when DNS cache was dropped. [#5534](https://github.com/ClickHouse/ClickHouse/pull/5534) ([alesapin](https://github.com/alesapin)) +* Fixed `ALTER ... MODIFY TTL` on ReplicatedMergeTree. 
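A minimal sketch of the statement shape this fix concerns; the table below uses a plain `MergeTree` and a hypothetical name for brevity, while the fix itself targets the `ReplicatedMergeTree` variant (which additionally takes ZooKeeper path and replica arguments):

```sql
CREATE TABLE ttl_demo
(
    d Date,
    x UInt64
)
ENGINE = MergeTree
ORDER BY d
TTL d + INTERVAL 3 MONTH;

-- The kind of statement that previously misbehaved on Replicated tables.
ALTER TABLE ttl_demo MODIFY TTL d + INTERVAL 1 MONTH;
```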
[#5539](https://github.com/ClickHouse/ClickHouse/pull/5539) ([Anton Popov](https://github.com/CurtizJ)) +* Fix INSERT into Distributed table with MATERIALIZED column [#5429](https://github.com/ClickHouse/ClickHouse/pull/5429) ([Azat Khuzhin](https://github.com/azat)) +* Fix bad alloc when truncate Join storage [#5437](https://github.com/ClickHouse/ClickHouse/pull/5437) ([TCeason](https://github.com/TCeason)) +* In recent versions of package tzdata some of files are symlinks now. The current mechanism for detecting default timezone gets broken and gives wrong names for some timezones. Now at least we force the timezone name to the contents of TZ if provided. [#5443](https://github.com/ClickHouse/ClickHouse/pull/5443) ([Ivan](https://github.com/abyss7)) +* Fix some extremely rare cases with MultiVolnitsky searcher when the constant needles in sum are at least 16KB long. The algorithm missed or overwrote the previous results which can lead to the incorrect result of `multiSearchAny`. [#5588](https://github.com/ClickHouse/ClickHouse/pull/5588) ([Danila Kutenin](https://github.com/danlark1)) +* Fix the issue when settings for ExternalData requests couldn't use ClickHouse settings. Also, for now, settings `date_time_input_format` and `low_cardinality_allow_in_native_format` cannot be used because of the ambiguity of names (in external data it can be interpreted as table format and in the query it can be a setting). [#5455](https://github.com/ClickHouse/ClickHouse/pull/5455) ([Danila Kutenin](https://github.com/danlark1)) +* Fix bug when parts were removed only from FS without dropping them from Zookeeper. [#5520](https://github.com/ClickHouse/ClickHouse/pull/5520) ([alesapin](https://github.com/alesapin)) +* Remove debug logging from MySQL protocol [#5478](https://github.com/ClickHouse/ClickHouse/pull/5478) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Skip ZNONODE during DDL query processing [#5489](https://github.com/ClickHouse/ClickHouse/pull/5489) ([Azat Khuzhin](https://github.com/azat)) +* Fix mix `UNION ALL` result column type. There were cases with inconsistent data and column types of resulting columns. [#5503](https://github.com/ClickHouse/ClickHouse/pull/5503) ([Artem Zuikov](https://github.com/4ertus2)) +* Throw an exception on wrong integers in `dictGetT` functions instead of crash. [#5446](https://github.com/ClickHouse/ClickHouse/pull/5446) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix wrong element_count and load_factor for hashed dictionary in `system.dictionaries` table. [#5440](https://github.com/ClickHouse/ClickHouse/pull/5440) ([Azat Khuzhin](https://github.com/azat)) ### Build/Testing/Packaging Improvement -* Fixed build without `Brotli` HTTP compression support (`ENABLE_BROTLI=OFF` cmake variable). [#5521](https://github.com/yandex/ClickHouse/pull/5521) ([Anton Yuzhaninov](https://github.com/citrin)) -* Include roaring.h as roaring/roaring.h [#5523](https://github.com/yandex/ClickHouse/pull/5523) ([Orivej Desh](https://github.com/orivej)) -* Fix gcc9 warnings in hyperscan (#line directive is evil!) [#5546](https://github.com/yandex/ClickHouse/pull/5546) ([Danila Kutenin](https://github.com/danlark1)) -* Fix all warnings when compiling with gcc-9. Fix some contrib issues. Fix gcc9 ICE and submit it to bugzilla. 
[#5498](https://github.com/yandex/ClickHouse/pull/5498) ([Danila Kutenin](https://github.com/danlark1)) -* Fixed linking with lld [#5477](https://github.com/yandex/ClickHouse/pull/5477) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Remove unused specializations in dictionaries [#5452](https://github.com/yandex/ClickHouse/pull/5452) ([Artem Zuikov](https://github.com/4ertus2)) -* Improvement performance tests for formatting and parsing tables for different types of files [#5497](https://github.com/yandex/ClickHouse/pull/5497) ([Olga Khvostikova](https://github.com/stavrolia)) -* Fixes for parallel test run [#5506](https://github.com/yandex/ClickHouse/pull/5506) ([proller](https://github.com/proller)) -* Docker: use configs from clickhouse-test [#5531](https://github.com/yandex/ClickHouse/pull/5531) ([proller](https://github.com/proller)) -* Fix compile for FreeBSD [#5447](https://github.com/yandex/ClickHouse/pull/5447) ([proller](https://github.com/proller)) -* Upgrade boost to 1.70 [#5570](https://github.com/yandex/ClickHouse/pull/5570) ([proller](https://github.com/proller)) -* Fix build clickhouse as submodule [#5574](https://github.com/yandex/ClickHouse/pull/5574) ([proller](https://github.com/proller)) -* Improve JSONExtract performance tests [#5444](https://github.com/yandex/ClickHouse/pull/5444) ([Vitaly Baranov](https://github.com/vitlibar)) +* Fixed build without `Brotli` HTTP compression support (`ENABLE_BROTLI=OFF` cmake variable). [#5521](https://github.com/ClickHouse/ClickHouse/pull/5521) ([Anton Yuzhaninov](https://github.com/citrin)) +* Include roaring.h as roaring/roaring.h [#5523](https://github.com/ClickHouse/ClickHouse/pull/5523) ([Orivej Desh](https://github.com/orivej)) +* Fix gcc9 warnings in hyperscan (#line directive is evil!) [#5546](https://github.com/ClickHouse/ClickHouse/pull/5546) ([Danila Kutenin](https://github.com/danlark1)) +* Fix all warnings when compiling with gcc-9. Fix some contrib issues. Fix gcc9 ICE and submit it to bugzilla. 
[#5498](https://github.com/ClickHouse/ClickHouse/pull/5498) ([Danila Kutenin](https://github.com/danlark1)) +* Fixed linking with lld [#5477](https://github.com/ClickHouse/ClickHouse/pull/5477) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Remove unused specializations in dictionaries [#5452](https://github.com/ClickHouse/ClickHouse/pull/5452) ([Artem Zuikov](https://github.com/4ertus2)) +* Improvement performance tests for formatting and parsing tables for different types of files [#5497](https://github.com/ClickHouse/ClickHouse/pull/5497) ([Olga Khvostikova](https://github.com/stavrolia)) +* Fixes for parallel test run [#5506](https://github.com/ClickHouse/ClickHouse/pull/5506) ([proller](https://github.com/proller)) +* Docker: use configs from clickhouse-test [#5531](https://github.com/ClickHouse/ClickHouse/pull/5531) ([proller](https://github.com/proller)) +* Fix compile for FreeBSD [#5447](https://github.com/ClickHouse/ClickHouse/pull/5447) ([proller](https://github.com/proller)) +* Upgrade boost to 1.70 [#5570](https://github.com/ClickHouse/ClickHouse/pull/5570) ([proller](https://github.com/proller)) +* Fix build clickhouse as submodule [#5574](https://github.com/ClickHouse/ClickHouse/pull/5574) ([proller](https://github.com/proller)) +* Improve JSONExtract performance tests [#5444](https://github.com/ClickHouse/ClickHouse/pull/5444) ([Vitaly Baranov](https://github.com/vitlibar)) ## ClickHouse release 19.8.3.8, 2019-06-11 ### New Features -* Added functions to work with JSON [#4686](https://github.com/yandex/ClickHouse/pull/4686) ([hcz](https://github.com/hczhcz)) [#5124](https://github.com/yandex/ClickHouse/pull/5124). ([Vitaly Baranov](https://github.com/vitlibar)) -* Add a function basename, with a similar behaviour to a basename function, which exists in a lot of languages (`os.path.basename` in python, `basename` in PHP, etc...). Work with both an UNIX-like path or a Windows path. [#5136](https://github.com/yandex/ClickHouse/pull/5136) ([Guillaume Tassery](https://github.com/YiuRULE)) -* Added `LIMIT n, m BY` or `LIMIT m OFFSET n BY` syntax to set offset of n for LIMIT BY clause. [#5138](https://github.com/yandex/ClickHouse/pull/5138) ([Anton Popov](https://github.com/CurtizJ)) -* Added new data type `SimpleAggregateFunction`, which allows to have columns with light aggregation in an `AggregatingMergeTree`. This can only be used with simple functions like `any`, `anyLast`, `sum`, `min`, `max`. [#4629](https://github.com/yandex/ClickHouse/pull/4629) ([Boris Granveaud](https://github.com/bgranvea)) -* Added support for non-constant arguments in function `ngramDistance` [#5198](https://github.com/yandex/ClickHouse/pull/5198) ([Danila Kutenin](https://github.com/danlark1)) -* Added functions `skewPop`, `skewSamp`, `kurtPop` and `kurtSamp` to compute for sequence skewness, sample skewness, kurtosis and sample kurtosis respectively. [#5200](https://github.com/yandex/ClickHouse/pull/5200) ([hcz](https://github.com/hczhcz)) -* Support rename operation for `MaterializeView` storage. [#5209](https://github.com/yandex/ClickHouse/pull/5209) ([Guillaume Tassery](https://github.com/YiuRULE)) -* Added server which allows connecting to ClickHouse using MySQL client. [#4715](https://github.com/yandex/ClickHouse/pull/4715) ([Yuriy Baranov](https://github.com/yurriy)) -* Add `toDecimal*OrZero` and `toDecimal*OrNull` functions. 
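A short sketch of the behaviour described in the `toDecimal*OrZero`/`toDecimal*OrNull` entry above (the literals are illustrative): the `OrZero` variant returns zero with the requested scale for unparsable input, while the `OrNull` variant returns `NULL`:

```sql
SELECT
    toDecimal32OrZero('3.14', 2)         AS parsed,         -- 3.14
    toDecimal32OrZero('not a number', 2) AS zero_fallback,  -- 0.00
    toDecimal32OrNull('not a number', 2) AS null_fallback;  -- NULL
```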
[#5291](https://github.com/yandex/ClickHouse/pull/5291) ([Artem Zuikov](https://github.com/4ertus2)) -* Support Decimal types in functions: `quantile`, `quantiles`, `median`, `quantileExactWeighted`, `quantilesExactWeighted`, medianExactWeighted. [#5304](https://github.com/yandex/ClickHouse/pull/5304) ([Artem Zuikov](https://github.com/4ertus2)) -* Added `toValidUTF8` function, which replaces all invalid UTF-8 characters by replacement character � (U+FFFD). [#5322](https://github.com/yandex/ClickHouse/pull/5322) ([Danila Kutenin](https://github.com/danlark1)) -* Added `format` function. Formatting constant pattern (simplified Python format pattern) with the strings listed in the arguments. [#5330](https://github.com/yandex/ClickHouse/pull/5330) ([Danila Kutenin](https://github.com/danlark1)) -* Added `system.detached_parts` table containing information about detached parts of `MergeTree` tables. [#5353](https://github.com/yandex/ClickHouse/pull/5353) ([akuzm](https://github.com/akuzm)) -* Added `ngramSearch` function to calculate the non-symmetric difference between needle and haystack. [#5418](https://github.com/yandex/ClickHouse/pull/5418)[#5422](https://github.com/yandex/ClickHouse/pull/5422) ([Danila Kutenin](https://github.com/danlark1)) -* Implementation of basic machine learning methods (stochastic linear regression and logistic regression) using aggregate functions interface. Has different strategies for updating model weights (simple gradient descent, momentum method, Nesterov method). Also supports mini-batches of custom size. [#4943](https://github.com/yandex/ClickHouse/pull/4943) ([Quid37](https://github.com/Quid37)) -* Implementation of `geohashEncode` and `geohashDecode` functions. [#5003](https://github.com/yandex/ClickHouse/pull/5003) ([Vasily Nemkov](https://github.com/Enmk)) -* Added aggregate function `timeSeriesGroupSum`, which can aggregate different time series that sample timestamp not alignment. It will use linear interpolation between two sample timestamp and then sum time-series together. Added aggregate function `timeSeriesGroupRateSum`, which calculates the rate of time-series and then sum rates together. [#4542](https://github.com/yandex/ClickHouse/pull/4542) ([Yangkuan Liu](https://github.com/LiuYangkuan)) -* Added functions `IPv4CIDRtoIPv4Range` and `IPv6CIDRtoIPv6Range` to calculate the lower and higher bounds for an IP in the subnet using a CIDR. [#5095](https://github.com/yandex/ClickHouse/pull/5095) ([Guillaume Tassery](https://github.com/YiuRULE)) -* Add a X-ClickHouse-Summary header when we send a query using HTTP with enabled setting `send_progress_in_http_headers`. Return the usual information of X-ClickHouse-Progress, with additional information like how many rows and bytes were inserted in the query. [#5116](https://github.com/yandex/ClickHouse/pull/5116) ([Guillaume Tassery](https://github.com/YiuRULE)) +* Added functions to work with JSON [#4686](https://github.com/ClickHouse/ClickHouse/pull/4686) ([hcz](https://github.com/hczhcz)) [#5124](https://github.com/ClickHouse/ClickHouse/pull/5124). ([Vitaly Baranov](https://github.com/vitlibar)) +* Add a function basename, with a similar behaviour to a basename function, which exists in a lot of languages (`os.path.basename` in python, `basename` in PHP, etc...). Work with both an UNIX-like path or a Windows path. 
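A quick sketch of the `basename` behaviour described above; the paths and aliases are made up for illustration:

```sql
SELECT
    basename('/usr/local/share/clickhouse/data.csv') AS unix_path,  -- 'data.csv'
    basename('C:\\Temp\\report.csv')                 AS win_path;   -- 'report.csv'
```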
[#5136](https://github.com/ClickHouse/ClickHouse/pull/5136) ([Guillaume Tassery](https://github.com/YiuRULE)) +* Added `LIMIT n, m BY` or `LIMIT m OFFSET n BY` syntax to set an offset of n for the LIMIT BY clause. [#5138](https://github.com/ClickHouse/ClickHouse/pull/5138) ([Anton Popov](https://github.com/CurtizJ)) +* Added new data type `SimpleAggregateFunction`, which allows to have columns with light aggregation in an `AggregatingMergeTree`. This can only be used with simple functions like `any`, `anyLast`, `sum`, `min`, `max`. [#4629](https://github.com/ClickHouse/ClickHouse/pull/4629) ([Boris Granveaud](https://github.com/bgranvea)) +* Added support for non-constant arguments in function `ngramDistance` [#5198](https://github.com/ClickHouse/ClickHouse/pull/5198) ([Danila Kutenin](https://github.com/danlark1)) +* Added functions `skewPop`, `skewSamp`, `kurtPop` and `kurtSamp` to compute sequence skewness, sample skewness, kurtosis and sample kurtosis respectively. [#5200](https://github.com/ClickHouse/ClickHouse/pull/5200) ([hcz](https://github.com/hczhcz)) +* Support rename operation for `MaterializeView` storage. [#5209](https://github.com/ClickHouse/ClickHouse/pull/5209) ([Guillaume Tassery](https://github.com/YiuRULE)) +* Added a server which allows connecting to ClickHouse using the MySQL client. [#4715](https://github.com/ClickHouse/ClickHouse/pull/4715) ([Yuriy Baranov](https://github.com/yurriy)) +* Add `toDecimal*OrZero` and `toDecimal*OrNull` functions. [#5291](https://github.com/ClickHouse/ClickHouse/pull/5291) ([Artem Zuikov](https://github.com/4ertus2)) +* Support Decimal types in functions: `quantile`, `quantiles`, `median`, `quantileExactWeighted`, `quantilesExactWeighted`, `medianExactWeighted`. [#5304](https://github.com/ClickHouse/ClickHouse/pull/5304) ([Artem Zuikov](https://github.com/4ertus2)) +* Added `toValidUTF8` function, which replaces all invalid UTF-8 characters by replacement character � (U+FFFD). [#5322](https://github.com/ClickHouse/ClickHouse/pull/5322) ([Danila Kutenin](https://github.com/danlark1)) +* Added `format` function. Formatting constant pattern (simplified Python format pattern) with the strings listed in the arguments. [#5330](https://github.com/ClickHouse/ClickHouse/pull/5330) ([Danila Kutenin](https://github.com/danlark1)) +* Added `system.detached_parts` table containing information about detached parts of `MergeTree` tables. [#5353](https://github.com/ClickHouse/ClickHouse/pull/5353) ([akuzm](https://github.com/akuzm)) +* Added `ngramSearch` function to calculate the non-symmetric difference between needle and haystack. [#5418](https://github.com/ClickHouse/ClickHouse/pull/5418)[#5422](https://github.com/ClickHouse/ClickHouse/pull/5422) ([Danila Kutenin](https://github.com/danlark1)) +* Implementation of basic machine learning methods (stochastic linear regression and logistic regression) using aggregate functions interface. Has different strategies for updating model weights (simple gradient descent, momentum method, Nesterov method). Also supports mini-batches of custom size. [#4943](https://github.com/ClickHouse/ClickHouse/pull/4943) ([Quid37](https://github.com/Quid37)) +* Implementation of `geohashEncode` and `geohashDecode` functions. [#5003](https://github.com/ClickHouse/ClickHouse/pull/5003) ([Vasily Nemkov](https://github.com/Enmk)) +* Added aggregate function `timeSeriesGroupSum`, which can aggregate different time series whose sample timestamps are not aligned.
It will use linear interpolation between two sample timestamp and then sum time-series together. Added aggregate function `timeSeriesGroupRateSum`, which calculates the rate of time-series and then sum rates together. [#4542](https://github.com/ClickHouse/ClickHouse/pull/4542) ([Yangkuan Liu](https://github.com/LiuYangkuan)) +* Added functions `IPv4CIDRtoIPv4Range` and `IPv6CIDRtoIPv6Range` to calculate the lower and higher bounds for an IP in the subnet using a CIDR. [#5095](https://github.com/ClickHouse/ClickHouse/pull/5095) ([Guillaume Tassery](https://github.com/YiuRULE)) +* Add a X-ClickHouse-Summary header when we send a query using HTTP with enabled setting `send_progress_in_http_headers`. Return the usual information of X-ClickHouse-Progress, with additional information like how many rows and bytes were inserted in the query. [#5116](https://github.com/ClickHouse/ClickHouse/pull/5116) ([Guillaume Tassery](https://github.com/YiuRULE)) ### Improvements -* Added `max_parts_in_total` setting for MergeTree family of tables (default: 100 000) that prevents unsafe specification of partition key #5166. [#5171](https://github.com/yandex/ClickHouse/pull/5171) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `clickhouse-obfuscator`: derive seed for individual columns by combining initial seed with column name, not column position. This is intended to transform datasets with multiple related tables, so that tables will remain JOINable after transformation. [#5178](https://github.com/yandex/ClickHouse/pull/5178) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added functions `JSONExtractRaw`, `JSONExtractKeyAndValues`. Renamed functions `jsonExtract` to `JSONExtract`. When something goes wrong these functions return the correspondent values, not `NULL`. Modified function `JSONExtract`, now it gets the return type from its last parameter and doesn't inject nullables. Implemented fallback to RapidJSON in case AVX2 instructions are not available. Simdjson library updated to a new version. [#5235](https://github.com/yandex/ClickHouse/pull/5235) ([Vitaly Baranov](https://github.com/vitlibar)) -* Now `if` and `multiIf` functions don't rely on the condition's `Nullable`, but rely on the branches for sql compatibility. [#5238](https://github.com/yandex/ClickHouse/pull/5238) ([Jian Wu](https://github.com/janplus)) -* `In` predicate now generates `Null` result from `Null` input like the `Equal` function. [#5152](https://github.com/yandex/ClickHouse/pull/5152) ([Jian Wu](https://github.com/janplus)) -* Check the time limit every (flush_interval / poll_timeout) number of rows from Kafka. This allows to break the reading from Kafka consumer more frequently and to check the time limits for the top-level streams [#5249](https://github.com/yandex/ClickHouse/pull/5249) ([Ivan](https://github.com/abyss7)) -* Link rdkafka with bundled SASL. It should allow to use SASL SCRAM authentication [#5253](https://github.com/yandex/ClickHouse/pull/5253) ([Ivan](https://github.com/abyss7)) -* Batched version of RowRefList for ALL JOINS. [#5267](https://github.com/yandex/ClickHouse/pull/5267) ([Artem Zuikov](https://github.com/4ertus2)) -* clickhouse-server: more informative listen error messages. 
[#5268](https://github.com/yandex/ClickHouse/pull/5268) ([proller](https://github.com/proller)) -* Support dictionaries in clickhouse-copier for functions in `` [#5270](https://github.com/yandex/ClickHouse/pull/5270) ([proller](https://github.com/proller)) +* Added `max_parts_in_total` setting for MergeTree family of tables (default: 100 000) that prevents unsafe specification of partition key #5166. [#5171](https://github.com/ClickHouse/ClickHouse/pull/5171) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* `clickhouse-obfuscator`: derive seed for individual columns by combining initial seed with column name, not column position. This is intended to transform datasets with multiple related tables, so that tables will remain JOINable after transformation. [#5178](https://github.com/ClickHouse/ClickHouse/pull/5178) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added functions `JSONExtractRaw`, `JSONExtractKeyAndValues`. Renamed functions `jsonExtract` to `JSONExtract`. When something goes wrong these functions return the correspondent values, not `NULL`. Modified function `JSONExtract`, now it gets the return type from its last parameter and doesn't inject nullables. Implemented fallback to RapidJSON in case AVX2 instructions are not available. Simdjson library updated to a new version. [#5235](https://github.com/ClickHouse/ClickHouse/pull/5235) ([Vitaly Baranov](https://github.com/vitlibar)) +* Now `if` and `multiIf` functions don't rely on the condition's `Nullable`, but rely on the branches for sql compatibility. [#5238](https://github.com/ClickHouse/ClickHouse/pull/5238) ([Jian Wu](https://github.com/janplus)) +* `In` predicate now generates `Null` result from `Null` input like the `Equal` function. [#5152](https://github.com/ClickHouse/ClickHouse/pull/5152) ([Jian Wu](https://github.com/janplus)) +* Check the time limit every (flush_interval / poll_timeout) number of rows from Kafka. This allows to break the reading from Kafka consumer more frequently and to check the time limits for the top-level streams [#5249](https://github.com/ClickHouse/ClickHouse/pull/5249) ([Ivan](https://github.com/abyss7)) +* Link rdkafka with bundled SASL. It should allow to use SASL SCRAM authentication [#5253](https://github.com/ClickHouse/ClickHouse/pull/5253) ([Ivan](https://github.com/abyss7)) +* Batched version of RowRefList for ALL JOINS. [#5267](https://github.com/ClickHouse/ClickHouse/pull/5267) ([Artem Zuikov](https://github.com/4ertus2)) +* clickhouse-server: more informative listen error messages. [#5268](https://github.com/ClickHouse/ClickHouse/pull/5268) ([proller](https://github.com/proller)) +* Support dictionaries in clickhouse-copier for functions in `` [#5270](https://github.com/ClickHouse/ClickHouse/pull/5270) ([proller](https://github.com/proller)) * Add new setting `kafka_commit_every_batch` to regulate Kafka committing policy. -It allows to set commit mode: after every batch of messages is handled, or after the whole block is written to the storage. It's a trade-off between losing some messages or reading them twice in some extreme situations. [#5308](https://github.com/yandex/ClickHouse/pull/5308) ([Ivan](https://github.com/abyss7)) -* Make `windowFunnel` support other Unsigned Integer Types. [#5320](https://github.com/yandex/ClickHouse/pull/5320) ([sundyli](https://github.com/sundy-li)) -* Allow to shadow virtual column `_table` in Merge engine. 
[#5325](https://github.com/yandex/ClickHouse/pull/5325) ([Ivan](https://github.com/abyss7)) -* Make `sequenceMatch` aggregate functions support other unsigned Integer types [#5339](https://github.com/yandex/ClickHouse/pull/5339) ([sundyli](https://github.com/sundy-li)) -* Better error messages if checksum mismatch is most likely caused by hardware failures. [#5355](https://github.com/yandex/ClickHouse/pull/5355) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Check that underlying tables support sampling for `StorageMerge` [#5366](https://github.com/yandex/ClickHouse/pull/5366) ([Ivan](https://github.com/abyss7)) -* Сlose MySQL connections after their usage in external dictionaries. It is related to issue #893. [#5395](https://github.com/yandex/ClickHouse/pull/5395) ([Clément Rodriguez](https://github.com/clemrodriguez)) -* Improvements of MySQL Wire Protocol. Changed name of format to MySQLWire. Using RAII for calling RSA_free. Disabling SSL if context cannot be created. [#5419](https://github.com/yandex/ClickHouse/pull/5419) ([Yuriy Baranov](https://github.com/yurriy)) -* clickhouse-client: allow to run with unaccessable history file (read-only, no disk space, file is directory, ...). [#5431](https://github.com/yandex/ClickHouse/pull/5431) ([proller](https://github.com/proller)) -* Respect query settings in asynchronous INSERTs into Distributed tables. [#4936](https://github.com/yandex/ClickHouse/pull/4936) ([TCeason](https://github.com/TCeason)) -* Renamed functions `leastSqr` to `simpleLinearRegression`, `LinearRegression` to `linearRegression`, `LogisticRegression` to `logisticRegression`. [#5391](https://github.com/yandex/ClickHouse/pull/5391) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +It allows to set commit mode: after every batch of messages is handled, or after the whole block is written to the storage. It's a trade-off between losing some messages or reading them twice in some extreme situations. [#5308](https://github.com/ClickHouse/ClickHouse/pull/5308) ([Ivan](https://github.com/abyss7)) +* Make `windowFunnel` support other Unsigned Integer Types. [#5320](https://github.com/ClickHouse/ClickHouse/pull/5320) ([sundyli](https://github.com/sundy-li)) +* Allow to shadow virtual column `_table` in Merge engine. [#5325](https://github.com/ClickHouse/ClickHouse/pull/5325) ([Ivan](https://github.com/abyss7)) +* Make `sequenceMatch` aggregate functions support other unsigned Integer types [#5339](https://github.com/ClickHouse/ClickHouse/pull/5339) ([sundyli](https://github.com/sundy-li)) +* Better error messages if checksum mismatch is most likely caused by hardware failures. [#5355](https://github.com/ClickHouse/ClickHouse/pull/5355) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Check that underlying tables support sampling for `StorageMerge` [#5366](https://github.com/ClickHouse/ClickHouse/pull/5366) ([Ivan](https://github.com/abyss7)) +* Сlose MySQL connections after their usage in external dictionaries. It is related to issue #893. [#5395](https://github.com/ClickHouse/ClickHouse/pull/5395) ([Clément Rodriguez](https://github.com/clemrodriguez)) +* Improvements of MySQL Wire Protocol. Changed name of format to MySQLWire. Using RAII for calling RSA_free. Disabling SSL if context cannot be created. [#5419](https://github.com/ClickHouse/ClickHouse/pull/5419) ([Yuriy Baranov](https://github.com/yurriy)) +* clickhouse-client: allow to run with unaccessable history file (read-only, no disk space, file is directory, ...). 
[#5431](https://github.com/ClickHouse/ClickHouse/pull/5431) ([proller](https://github.com/proller)) +* Respect query settings in asynchronous INSERTs into Distributed tables. [#4936](https://github.com/ClickHouse/ClickHouse/pull/4936) ([TCeason](https://github.com/TCeason)) +* Renamed functions `leastSqr` to `simpleLinearRegression`, `LinearRegression` to `linearRegression`, `LogisticRegression` to `logisticRegression`. [#5391](https://github.com/ClickHouse/ClickHouse/pull/5391) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) ### Performance Improvements -* Parallelize processing of parts of non-replicated MergeTree tables in ALTER MODIFY query. [#4639](https://github.com/yandex/ClickHouse/pull/4639) ([Ivan Kush](https://github.com/IvanKush)) -* Optimizations in regular expressions extraction. [#5193](https://github.com/yandex/ClickHouse/pull/5193) [#5191](https://github.com/yandex/ClickHouse/pull/5191) ([Danila Kutenin](https://github.com/danlark1)) -* Do not add right join key column to join result if it's used only in join on section. [#5260](https://github.com/yandex/ClickHouse/pull/5260) ([Artem Zuikov](https://github.com/4ertus2)) -* Freeze the Kafka buffer after first empty response. It avoids multiple invokations of `ReadBuffer::next()` for empty result in some row-parsing streams. [#5283](https://github.com/yandex/ClickHouse/pull/5283) ([Ivan](https://github.com/abyss7)) -* `concat` function optimization for multiple arguments. [#5357](https://github.com/yandex/ClickHouse/pull/5357) ([Danila Kutenin](https://github.com/danlark1)) -* Query optimisation. Allow push down IN statement while rewriting commа/cross join into inner one. [#5396](https://github.com/yandex/ClickHouse/pull/5396) ([Artem Zuikov](https://github.com/4ertus2)) -* Upgrade our LZ4 implementation with reference one to have faster decompression. [#5070](https://github.com/yandex/ClickHouse/pull/5070) ([Danila Kutenin](https://github.com/danlark1)) -* Implemented MSD radix sort (based on kxsort), and partial sorting. [#5129](https://github.com/yandex/ClickHouse/pull/5129) ([Evgenii Pravda](https://github.com/kvinty)) +* Parallelize processing of parts of non-replicated MergeTree tables in ALTER MODIFY query. [#4639](https://github.com/ClickHouse/ClickHouse/pull/4639) ([Ivan Kush](https://github.com/IvanKush)) +* Optimizations in regular expressions extraction. [#5193](https://github.com/ClickHouse/ClickHouse/pull/5193) [#5191](https://github.com/ClickHouse/ClickHouse/pull/5191) ([Danila Kutenin](https://github.com/danlark1)) +* Do not add right join key column to join result if it's used only in join on section. [#5260](https://github.com/ClickHouse/ClickHouse/pull/5260) ([Artem Zuikov](https://github.com/4ertus2)) +* Freeze the Kafka buffer after first empty response. It avoids multiple invokations of `ReadBuffer::next()` for empty result in some row-parsing streams. [#5283](https://github.com/ClickHouse/ClickHouse/pull/5283) ([Ivan](https://github.com/abyss7)) +* `concat` function optimization for multiple arguments. [#5357](https://github.com/ClickHouse/ClickHouse/pull/5357) ([Danila Kutenin](https://github.com/danlark1)) +* Query optimisation. Allow push down IN statement while rewriting commа/cross join into inner one. [#5396](https://github.com/ClickHouse/ClickHouse/pull/5396) ([Artem Zuikov](https://github.com/4ertus2)) +* Upgrade our LZ4 implementation with reference one to have faster decompression. 
[#5070](https://github.com/ClickHouse/ClickHouse/pull/5070) ([Danila Kutenin](https://github.com/danlark1)) +* Implemented MSD radix sort (based on kxsort), and partial sorting. [#5129](https://github.com/ClickHouse/ClickHouse/pull/5129) ([Evgenii Pravda](https://github.com/kvinty)) ### Bug Fixes -* Fix push require columns with join [#5192](https://github.com/yandex/ClickHouse/pull/5192) ([Winter Zhang](https://github.com/zhang2014)) -* Fixed bug, when ClickHouse is run by systemd, the command `sudo service clickhouse-server forcerestart` was not working as expected. [#5204](https://github.com/yandex/ClickHouse/pull/5204) ([proller](https://github.com/proller)) -* Fix http error codes in DataPartsExchange (interserver http server on 9009 port always returned code 200, even on errors). [#5216](https://github.com/yandex/ClickHouse/pull/5216) ([proller](https://github.com/proller)) -* Fix SimpleAggregateFunction for String longer than MAX_SMALL_STRING_SIZE [#5311](https://github.com/yandex/ClickHouse/pull/5311) ([Azat Khuzhin](https://github.com/azat)) -* Fix error for `Decimal` to `Nullable(Decimal)` conversion in IN. Support other Decimal to Decimal conversions (including different scales). [#5350](https://github.com/yandex/ClickHouse/pull/5350) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed FPU clobbering in simdjson library that lead to wrong calculation of `uniqHLL` and `uniqCombined` aggregate function and math functions such as `log`. [#5354](https://github.com/yandex/ClickHouse/pull/5354) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed handling mixed const/nonconst cases in JSON functions. [#5435](https://github.com/yandex/ClickHouse/pull/5435) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix `retention` function. Now all conditions that satisfy in a row of data are added to the data state. [#5119](https://github.com/yandex/ClickHouse/pull/5119) ([小路](https://github.com/nicelulu)) -* Fix result type for `quantileExact` with Decimals. [#5304](https://github.com/yandex/ClickHouse/pull/5304) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix push require columns with join [#5192](https://github.com/ClickHouse/ClickHouse/pull/5192) ([Winter Zhang](https://github.com/zhang2014)) +* Fixed bug, when ClickHouse is run by systemd, the command `sudo service clickhouse-server forcerestart` was not working as expected. [#5204](https://github.com/ClickHouse/ClickHouse/pull/5204) ([proller](https://github.com/proller)) +* Fix http error codes in DataPartsExchange (interserver http server on 9009 port always returned code 200, even on errors). [#5216](https://github.com/ClickHouse/ClickHouse/pull/5216) ([proller](https://github.com/proller)) +* Fix SimpleAggregateFunction for String longer than MAX_SMALL_STRING_SIZE [#5311](https://github.com/ClickHouse/ClickHouse/pull/5311) ([Azat Khuzhin](https://github.com/azat)) +* Fix error for `Decimal` to `Nullable(Decimal)` conversion in IN. Support other Decimal to Decimal conversions (including different scales). [#5350](https://github.com/ClickHouse/ClickHouse/pull/5350) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed FPU clobbering in simdjson library that lead to wrong calculation of `uniqHLL` and `uniqCombined` aggregate function and math functions such as `log`. [#5354](https://github.com/ClickHouse/ClickHouse/pull/5354) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed handling mixed const/nonconst cases in JSON functions. 
[#5435](https://github.com/ClickHouse/ClickHouse/pull/5435) ([Vitaly Baranov](https://github.com/vitlibar)) +* Fix `retention` function. Now all conditions that satisfy in a row of data are added to the data state. [#5119](https://github.com/ClickHouse/ClickHouse/pull/5119) ([小路](https://github.com/nicelulu)) +* Fix result type for `quantileExact` with Decimals. [#5304](https://github.com/ClickHouse/ClickHouse/pull/5304) ([Artem Zuikov](https://github.com/4ertus2)) ### Documentation -* Translate documentation for `CollapsingMergeTree` to chinese. [#5168](https://github.com/yandex/ClickHouse/pull/5168) ([张风啸](https://github.com/AlexZFX)) +* Translate documentation for `CollapsingMergeTree` to chinese. [#5168](https://github.com/ClickHouse/ClickHouse/pull/5168) ([张风啸](https://github.com/AlexZFX)) * Translate some documentation about table engines to chinese. - [#5134](https://github.com/yandex/ClickHouse/pull/5134) - [#5328](https://github.com/yandex/ClickHouse/pull/5328) + [#5134](https://github.com/ClickHouse/ClickHouse/pull/5134) + [#5328](https://github.com/ClickHouse/ClickHouse/pull/5328) ([never lee](https://github.com/neverlee)) ### Build/Testing/Packaging Improvements -* Fix some sanitizer reports that show probable use-after-free.[#5139](https://github.com/yandex/ClickHouse/pull/5139) [#5143](https://github.com/yandex/ClickHouse/pull/5143) [#5393](https://github.com/yandex/ClickHouse/pull/5393) ([Ivan](https://github.com/abyss7)) -* Move performance tests out of separate directories for convenience. [#5158](https://github.com/yandex/ClickHouse/pull/5158) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix incorrect performance tests. [#5255](https://github.com/yandex/ClickHouse/pull/5255) ([alesapin](https://github.com/alesapin)) -* Added a tool to calculate checksums caused by bit flips to debug hardware issues. [#5334](https://github.com/yandex/ClickHouse/pull/5334) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Make runner script more usable. [#5340](https://github.com/yandex/ClickHouse/pull/5340)[#5360](https://github.com/yandex/ClickHouse/pull/5360) ([filimonov](https://github.com/filimonov)) -* Add small instruction how to write performance tests. [#5408](https://github.com/yandex/ClickHouse/pull/5408) ([alesapin](https://github.com/alesapin)) -* Add ability to make substitutions in create, fill and drop query in performance tests [#5367](https://github.com/yandex/ClickHouse/pull/5367) ([Olga Khvostikova](https://github.com/stavrolia)) +* Fix some sanitizer reports that show probable use-after-free.[#5139](https://github.com/ClickHouse/ClickHouse/pull/5139) [#5143](https://github.com/ClickHouse/ClickHouse/pull/5143) [#5393](https://github.com/ClickHouse/ClickHouse/pull/5393) ([Ivan](https://github.com/abyss7)) +* Move performance tests out of separate directories for convenience. [#5158](https://github.com/ClickHouse/ClickHouse/pull/5158) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix incorrect performance tests. [#5255](https://github.com/ClickHouse/ClickHouse/pull/5255) ([alesapin](https://github.com/alesapin)) +* Added a tool to calculate checksums caused by bit flips to debug hardware issues. [#5334](https://github.com/ClickHouse/ClickHouse/pull/5334) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Make runner script more usable. 
[#5340](https://github.com/ClickHouse/ClickHouse/pull/5340)[#5360](https://github.com/ClickHouse/ClickHouse/pull/5360) ([filimonov](https://github.com/filimonov)) +* Add small instruction how to write performance tests. [#5408](https://github.com/ClickHouse/ClickHouse/pull/5408) ([alesapin](https://github.com/alesapin)) +* Add ability to make substitutions in create, fill and drop query in performance tests [#5367](https://github.com/ClickHouse/ClickHouse/pull/5367) ([Olga Khvostikova](https://github.com/stavrolia)) ## ClickHouse release 19.7.5.27, 2019-06-09 ### New features -* Added bitmap related functions `bitmapHasAny` and `bitmapHasAll` analogous to `hasAny` and `hasAll` functions for arrays. [#5279](https://github.com/yandex/ClickHouse/pull/5279) ([Sergi Vladykin](https://github.com/svladykin)) +* Added bitmap related functions `bitmapHasAny` and `bitmapHasAll` analogous to `hasAny` and `hasAll` functions for arrays. [#5279](https://github.com/ClickHouse/ClickHouse/pull/5279) ([Sergi Vladykin](https://github.com/svladykin)) ### Bug Fixes -* Fix segfault on `minmax` INDEX with Null value. [#5246](https://github.com/yandex/ClickHouse/pull/5246) ([Nikita Vasilev](https://github.com/nikvas0)) -* Mark all input columns in LIMIT BY as required output. It fixes 'Not found column' error in some distributed queries. [#5407](https://github.com/yandex/ClickHouse/pull/5407) ([Constantin S. Pan](https://github.com/kvap)) -* Fix "Column '0' already exists" error in `SELECT .. PREWHERE` on column with DEFAULT [#5397](https://github.com/yandex/ClickHouse/pull/5397) ([proller](https://github.com/proller)) -* Fix `ALTER MODIFY TTL` query on `ReplicatedMergeTree`. [#5539](https://github.com/yandex/ClickHouse/pull/5539/commits) ([Anton Popov](https://github.com/CurtizJ)) -* Don't crash the server when Kafka consumers have failed to start. [#5285](https://github.com/yandex/ClickHouse/pull/5285) ([Ivan](https://github.com/abyss7)) -* Fixed bitmap functions produce wrong result. [#5359](https://github.com/yandex/ClickHouse/pull/5359) ([Andy Yang](https://github.com/andyyzh)) -* Fix element_count for hashed dictionary (do not include duplicates) [#5440](https://github.com/yandex/ClickHouse/pull/5440) ([Azat Khuzhin](https://github.com/azat)) -* Use contents of environment variable TZ as the name for timezone. It helps to correctly detect default timezone in some cases.[#5443](https://github.com/yandex/ClickHouse/pull/5443) ([Ivan](https://github.com/abyss7)) -* Do not try to convert integers in `dictGetT` functions, because it doesn't work correctly. Throw an exception instead. [#5446](https://github.com/yandex/ClickHouse/pull/5446) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix settings in ExternalData HTTP request. [#5455](https://github.com/yandex/ClickHouse/pull/5455) ([Danila +* Fix segfault on `minmax` INDEX with Null value. [#5246](https://github.com/ClickHouse/ClickHouse/pull/5246) ([Nikita Vasilev](https://github.com/nikvas0)) +* Mark all input columns in LIMIT BY as required output. It fixes 'Not found column' error in some distributed queries. [#5407](https://github.com/ClickHouse/ClickHouse/pull/5407) ([Constantin S. Pan](https://github.com/kvap)) +* Fix "Column '0' already exists" error in `SELECT .. PREWHERE` on column with DEFAULT [#5397](https://github.com/ClickHouse/ClickHouse/pull/5397) ([proller](https://github.com/proller)) +* Fix `ALTER MODIFY TTL` query on `ReplicatedMergeTree`. 
[#5539](https://github.com/ClickHouse/ClickHouse/pull/5539/commits) ([Anton Popov](https://github.com/CurtizJ)) +* Don't crash the server when Kafka consumers have failed to start. [#5285](https://github.com/ClickHouse/ClickHouse/pull/5285) ([Ivan](https://github.com/abyss7)) +* Fixed bitmap functions produce wrong result. [#5359](https://github.com/ClickHouse/ClickHouse/pull/5359) ([Andy Yang](https://github.com/andyyzh)) +* Fix element_count for hashed dictionary (do not include duplicates) [#5440](https://github.com/ClickHouse/ClickHouse/pull/5440) ([Azat Khuzhin](https://github.com/azat)) +* Use contents of environment variable TZ as the name for timezone. It helps to correctly detect default timezone in some cases.[#5443](https://github.com/ClickHouse/ClickHouse/pull/5443) ([Ivan](https://github.com/abyss7)) +* Do not try to convert integers in `dictGetT` functions, because it doesn't work correctly. Throw an exception instead. [#5446](https://github.com/ClickHouse/ClickHouse/pull/5446) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix settings in ExternalData HTTP request. [#5455](https://github.com/ClickHouse/ClickHouse/pull/5455) ([Danila Kutenin](https://github.com/danlark1)) -* Fix bug when parts were removed only from FS without dropping them from Zookeeper. [#5520](https://github.com/yandex/ClickHouse/pull/5520) ([alesapin](https://github.com/alesapin)) -* Fix segmentation fault in `bitmapHasAny` function. [#5528](https://github.com/yandex/ClickHouse/pull/5528) ([Zhichang Yu](https://github.com/yuzhichang)) -* Fixed error when replication connection pool doesn't retry to resolve host, even when DNS cache was dropped. [#5534](https://github.com/yandex/ClickHouse/pull/5534) ([alesapin](https://github.com/alesapin)) -* Fixed `DROP INDEX IF EXISTS` query. Now `ALTER TABLE ... DROP INDEX IF EXISTS ...` query doesn't raise an exception if provided index does not exist. [#5524](https://github.com/yandex/ClickHouse/pull/5524) ([Gleb Novikov](https://github.com/NanoBjorn)) -* Fix union all supertype column. There were cases with inconsistent data and column types of resulting columns. [#5503](https://github.com/yandex/ClickHouse/pull/5503) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix bug when parts were removed only from FS without dropping them from Zookeeper. [#5520](https://github.com/ClickHouse/ClickHouse/pull/5520) ([alesapin](https://github.com/alesapin)) +* Fix segmentation fault in `bitmapHasAny` function. [#5528](https://github.com/ClickHouse/ClickHouse/pull/5528) ([Zhichang Yu](https://github.com/yuzhichang)) +* Fixed error when replication connection pool doesn't retry to resolve host, even when DNS cache was dropped. [#5534](https://github.com/ClickHouse/ClickHouse/pull/5534) ([alesapin](https://github.com/alesapin)) +* Fixed `DROP INDEX IF EXISTS` query. Now `ALTER TABLE ... DROP INDEX IF EXISTS ...` query doesn't raise an exception if provided index does not exist. [#5524](https://github.com/ClickHouse/ClickHouse/pull/5524) ([Gleb Novikov](https://github.com/NanoBjorn)) +* Fix union all supertype column. There were cases with inconsistent data and column types of resulting columns. [#5503](https://github.com/ClickHouse/ClickHouse/pull/5503) ([Artem Zuikov](https://github.com/4ertus2)) * Skip ZNONODE during DDL query processing. Before if another node removes the znode in task queue, the one that -did not process it, but already get list of children, will terminate the DDLWorker thread. 
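As a quick illustration of the `bitmapHasAny` and `bitmapHasAll` functions listed in this release, the sketch below builds two small bitmaps and compares them. It is not part of the patch and assumes a local server on the default HTTP port.

```python
import urllib.request

# bitmapHasAny returns 1 if the bitmaps share at least one element,
# bitmapHasAll returns 1 if the second bitmap is a subset of the first.
query = b"""
SELECT
    bitmapHasAny(bitmapBuild([1, 2, 3]), bitmapBuild([3, 4, 5])) AS any_common,
    bitmapHasAll(bitmapBuild([1, 2, 3]), bitmapBuild([2, 3])) AS has_all
FORMAT TSVWithNames
"""

with urllib.request.urlopen("http://localhost:8123/", data=query) as response:
    print(response.read().decode())
```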
[#5489](https://github.com/yandex/ClickHouse/pull/5489) ([Azat Khuzhin](https://github.com/azat)) -* Fix INSERT into Distributed() table with MATERIALIZED column. [#5429](https://github.com/yandex/ClickHouse/pull/5429) ([Azat Khuzhin](https://github.com/azat)) +did not process it, but already get list of children, will terminate the DDLWorker thread. [#5489](https://github.com/ClickHouse/ClickHouse/pull/5489) ([Azat Khuzhin](https://github.com/azat)) +* Fix INSERT into Distributed() table with MATERIALIZED column. [#5429](https://github.com/ClickHouse/ClickHouse/pull/5429) ([Azat Khuzhin](https://github.com/azat)) ## ClickHouse release 19.7.3.9, 2019-05-30 ### New Features * Allow to limit the range of a setting that can be specified by user. These constraints can be set up in user settings profile. -[#4931](https://github.com/yandex/ClickHouse/pull/4931) ([Vitaly +[#4931](https://github.com/ClickHouse/ClickHouse/pull/4931) ([Vitaly Baranov](https://github.com/vitlibar)) * Add a second version of the function `groupUniqArray` with an optional `max_size` parameter that limits the size of the resulting array. This behavior is similar to `groupArray(max_size)(x)` function. -[#5026](https://github.com/yandex/ClickHouse/pull/5026) ([Guillaume +[#5026](https://github.com/ClickHouse/ClickHouse/pull/5026) ([Guillaume Tassery](https://github.com/YiuRULE)) * For TSVWithNames/CSVWithNames input file formats, column order can now be determined from file header. This is controlled by `input_format_with_names_use_header` parameter. -[#5081](https://github.com/yandex/ClickHouse/pull/5081) +[#5081](https://github.com/ClickHouse/ClickHouse/pull/5081) ([Alexander](https://github.com/Akazz)) ### Bug Fixes * Crash with uncompressed_cache + JOIN during merge (#5197) -[#5133](https://github.com/yandex/ClickHouse/pull/5133) ([Danila +[#5133](https://github.com/ClickHouse/ClickHouse/pull/5133) ([Danila Kutenin](https://github.com/danlark1)) * Segmentation fault on a clickhouse-client query to system tables. #5066 -[#5127](https://github.com/yandex/ClickHouse/pull/5127) +[#5127](https://github.com/ClickHouse/ClickHouse/pull/5127) ([Ivan](https://github.com/abyss7)) * Data loss on heavy load via KafkaEngine (#4736) -[#5080](https://github.com/yandex/ClickHouse/pull/5080) +[#5080](https://github.com/ClickHouse/ClickHouse/pull/5080) ([Ivan](https://github.com/abyss7)) -* Fixed very rare data race condition that could happen when executing a query with UNION ALL involving at least two SELECTs from system.columns, system.tables, system.parts, system.parts_tables or tables of Merge family and performing ALTER of columns of the related tables concurrently. [#5189](https://github.com/yandex/ClickHouse/pull/5189) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed very rare data race condition that could happen when executing a query with UNION ALL involving at least two SELECTs from system.columns, system.tables, system.parts, system.parts_tables or tables of Merge family and performing ALTER of columns of the related tables concurrently. [#5189](https://github.com/ClickHouse/ClickHouse/pull/5189) ([alexey-milovidov](https://github.com/alexey-milovidov)) ### Performance Improvements * Use radix sort for sorting by single numeric column in `ORDER BY` without - `LIMIT`. [#5106](https://github.com/yandex/ClickHouse/pull/5106), -[#4439](https://github.com/yandex/ClickHouse/pull/4439) + `LIMIT`. 
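For the size-limited `groupUniqArray` described above, a small sketch follows; it assumes a local server and is illustrative only. `groupUniqArray(3)(x)` keeps at most three distinct values per group, mirroring the existing `groupArray(max_size)(x)` form.

```python
import urllib.request

# Collect at most 3 distinct remainders; without the (3) parameter all 10 would be kept.
query = b"SELECT groupUniqArray(3)(number % 10) FROM numbers(1000) FORMAT TSV"

with urllib.request.urlopen("http://localhost:8123/", data=query) as response:
    print(response.read().decode())
```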
[#5106](https://github.com/ClickHouse/ClickHouse/pull/5106), +[#4439](https://github.com/ClickHouse/ClickHouse/pull/4439) ([Evgenii Pravda](https://github.com/kvinty), [alexey-milovidov](https://github.com/alexey-milovidov)) ### Documentation * Translate documentation for some table engines to Chinese. - [#5107](https://github.com/yandex/ClickHouse/pull/5107), -[#5094](https://github.com/yandex/ClickHouse/pull/5094), -[#5087](https://github.com/yandex/ClickHouse/pull/5087) + [#5107](https://github.com/ClickHouse/ClickHouse/pull/5107), +[#5094](https://github.com/ClickHouse/ClickHouse/pull/5094), +[#5087](https://github.com/ClickHouse/ClickHouse/pull/5087) ([张风啸](https://github.com/AlexZFX)), -[#5068](https://github.com/yandex/ClickHouse/pull/5068) ([never +[#5068](https://github.com/ClickHouse/ClickHouse/pull/5068) ([never lee](https://github.com/neverlee)) ### Build/Testing/Packaging Improvements * Print UTF-8 characters properly in `clickhouse-test`. - [#5084](https://github.com/yandex/ClickHouse/pull/5084) + [#5084](https://github.com/ClickHouse/ClickHouse/pull/5084) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Add command line parameter for clickhouse-client to always load suggestion - data. [#5102](https://github.com/yandex/ClickHouse/pull/5102) + data. [#5102](https://github.com/ClickHouse/ClickHouse/pull/5102) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Resolve some of PVS-Studio warnings. - [#5082](https://github.com/yandex/ClickHouse/pull/5082) + [#5082](https://github.com/ClickHouse/ClickHouse/pull/5082) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Update LZ4 [#5040](https://github.com/yandex/ClickHouse/pull/5040) ([Danila +* Update LZ4 [#5040](https://github.com/ClickHouse/ClickHouse/pull/5040) ([Danila Kutenin](https://github.com/danlark1)) * Add gperf to build requirements for upcoming pull request #5030. - [#5110](https://github.com/yandex/ClickHouse/pull/5110) + [#5110](https://github.com/ClickHouse/ClickHouse/pull/5110) ([proller](https://github.com/proller)) ## ClickHouse release 19.6.3.18, 2019-06-13 ### Bug Fixes -* Fixed IN condition pushdown for queries from table functions `mysql` and `odbc` and corresponding table engines. This fixes #3540 and #2384. [#5313](https://github.com/yandex/ClickHouse/pull/5313) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix deadlock in Zookeeper. [#5297](https://github.com/yandex/ClickHouse/pull/5297) ([github1youlc](https://github.com/github1youlc)) -* Allow quoted decimals in CSV. [#5284](https://github.com/yandex/ClickHouse/pull/5284) ([Artem Zuikov](https://github.com/4ertus2) -* Disallow conversion from float Inf/NaN into Decimals (throw exception). [#5282](https://github.com/yandex/ClickHouse/pull/5282) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix data race in rename query. [#5247](https://github.com/yandex/ClickHouse/pull/5247) ([Winter Zhang](https://github.com/zhang2014)) -* Temporarily disable LFAlloc. Usage of LFAlloc might lead to a lot of MAP_FAILED in allocating UncompressedCache and in a result to crashes of queries at high loaded servers. [cfdba93](https://github.com/yandex/ClickHouse/commit/cfdba938ce22f16efeec504f7f90206a515b1280)([Danila Kutenin](https://github.com/danlark1)) +* Fixed IN condition pushdown for queries from table functions `mysql` and `odbc` and corresponding table engines. This fixes #3540 and #2384. 
[#5313](https://github.com/ClickHouse/ClickHouse/pull/5313) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix deadlock in Zookeeper. [#5297](https://github.com/ClickHouse/ClickHouse/pull/5297) ([github1youlc](https://github.com/github1youlc)) +* Allow quoted decimals in CSV. [#5284](https://github.com/ClickHouse/ClickHouse/pull/5284) ([Artem Zuikov](https://github.com/4ertus2) +* Disallow conversion from float Inf/NaN into Decimals (throw exception). [#5282](https://github.com/ClickHouse/ClickHouse/pull/5282) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix data race in rename query. [#5247](https://github.com/ClickHouse/ClickHouse/pull/5247) ([Winter Zhang](https://github.com/zhang2014)) +* Temporarily disable LFAlloc. Usage of LFAlloc might lead to a lot of MAP_FAILED in allocating UncompressedCache and in a result to crashes of queries at high loaded servers. [cfdba93](https://github.com/ClickHouse/ClickHouse/commit/cfdba938ce22f16efeec504f7f90206a515b1280)([Danila Kutenin](https://github.com/danlark1)) ## ClickHouse release 19.6.2.11, 2019-05-13 ### New Features -* TTL expressions for columns and tables. [#4212](https://github.com/yandex/ClickHouse/pull/4212) ([Anton Popov](https://github.com/CurtizJ)) -* Added support for `brotli` compression for HTTP responses (Accept-Encoding: br) [#4388](https://github.com/yandex/ClickHouse/pull/4388) ([Mikhail](https://github.com/fandyushin)) -* Added new function `isValidUTF8` for checking whether a set of bytes is correctly utf-8 encoded. [#4934](https://github.com/yandex/ClickHouse/pull/4934) ([Danila Kutenin](https://github.com/danlark1)) -* Add new load balancing policy `first_or_random` which sends queries to the first specified host and if it's inaccessible send queries to random hosts of shard. Useful for cross-replication topology setups. [#5012](https://github.com/yandex/ClickHouse/pull/5012) ([nvartolomei](https://github.com/nvartolomei)) +* TTL expressions for columns and tables. [#4212](https://github.com/ClickHouse/ClickHouse/pull/4212) ([Anton Popov](https://github.com/CurtizJ)) +* Added support for `brotli` compression for HTTP responses (Accept-Encoding: br) [#4388](https://github.com/ClickHouse/ClickHouse/pull/4388) ([Mikhail](https://github.com/fandyushin)) +* Added new function `isValidUTF8` for checking whether a set of bytes is correctly utf-8 encoded. [#4934](https://github.com/ClickHouse/ClickHouse/pull/4934) ([Danila Kutenin](https://github.com/danlark1)) +* Add new load balancing policy `first_or_random` which sends queries to the first specified host and if it's inaccessible send queries to random hosts of shard. Useful for cross-replication topology setups. [#5012](https://github.com/ClickHouse/ClickHouse/pull/5012) ([nvartolomei](https://github.com/nvartolomei)) ### Experimental Features -* Add setting `index_granularity_bytes` (adaptive index granularity) for MergeTree* tables family. [#4826](https://github.com/yandex/ClickHouse/pull/4826) ([alesapin](https://github.com/alesapin)) +* Add setting `index_granularity_bytes` (adaptive index granularity) for MergeTree* tables family. [#4826](https://github.com/ClickHouse/ClickHouse/pull/4826) ([alesapin](https://github.com/alesapin)) ### Improvements -* Added support for non-constant and negative size and length arguments for function `substringUTF8`. 
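The TTL expressions added in this release cover both whole tables and individual columns. The sketch below shows one of each: the `payload` column is cleared a week after `event_date`, and whole rows expire after a month. Table, column and server names are invented for illustration.

```python
import urllib.request

# Hypothetical table demonstrating a column-level TTL and a table-level TTL.
ddl = """
CREATE TABLE default.events_with_ttl
(
    event_date Date,
    user_id    UInt64,
    payload    String TTL event_date + INTERVAL 7 DAY
)
ENGINE = MergeTree
ORDER BY (event_date, user_id)
TTL event_date + INTERVAL 1 MONTH
"""

with urllib.request.urlopen("http://localhost:8123/", data=ddl.encode()) as response:
    response.read()
```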
[#4989](https://github.com/yandex/ClickHouse/pull/4989) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Disable push-down to right table in left join, left table in right join, and both tables in full join. This fixes wrong JOIN results in some cases. [#4846](https://github.com/yandex/ClickHouse/pull/4846) ([Ivan](https://github.com/abyss7)) -* `clickhouse-copier`: auto upload task configuration from `--task-file` option [#4876](https://github.com/yandex/ClickHouse/pull/4876) ([proller](https://github.com/proller)) -* Added typos handler for storage factory and table functions factory. [#4891](https://github.com/yandex/ClickHouse/pull/4891) ([Danila Kutenin](https://github.com/danlark1)) -* Support asterisks and qualified asterisks for multiple joins without subqueries [#4898](https://github.com/yandex/ClickHouse/pull/4898) ([Artem Zuikov](https://github.com/4ertus2)) -* Make missing column error message more user friendly. [#4915](https://github.com/yandex/ClickHouse/pull/4915) ([Artem Zuikov](https://github.com/4ertus2)) +* Added support for non-constant and negative size and length arguments for function `substringUTF8`. [#4989](https://github.com/ClickHouse/ClickHouse/pull/4989) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Disable push-down to right table in left join, left table in right join, and both tables in full join. This fixes wrong JOIN results in some cases. [#4846](https://github.com/ClickHouse/ClickHouse/pull/4846) ([Ivan](https://github.com/abyss7)) +* `clickhouse-copier`: auto upload task configuration from `--task-file` option [#4876](https://github.com/ClickHouse/ClickHouse/pull/4876) ([proller](https://github.com/proller)) +* Added typos handler for storage factory and table functions factory. [#4891](https://github.com/ClickHouse/ClickHouse/pull/4891) ([Danila Kutenin](https://github.com/danlark1)) +* Support asterisks and qualified asterisks for multiple joins without subqueries [#4898](https://github.com/ClickHouse/ClickHouse/pull/4898) ([Artem Zuikov](https://github.com/4ertus2)) +* Make missing column error message more user friendly. [#4915](https://github.com/ClickHouse/ClickHouse/pull/4915) ([Artem Zuikov](https://github.com/4ertus2)) ### Performance Improvements -* Significant speedup of ASOF JOIN [#4924](https://github.com/yandex/ClickHouse/pull/4924) ([Martijn Bakker](https://github.com/Gladdy)) +* Significant speedup of ASOF JOIN [#4924](https://github.com/ClickHouse/ClickHouse/pull/4924) ([Martijn Bakker](https://github.com/Gladdy)) ### Backward Incompatible Changes -* HTTP header `Query-Id` was renamed to `X-ClickHouse-Query-Id` for consistency. [#4972](https://github.com/yandex/ClickHouse/pull/4972) ([Mikhail](https://github.com/fandyushin)) +* HTTP header `Query-Id` was renamed to `X-ClickHouse-Query-Id` for consistency. [#4972](https://github.com/ClickHouse/ClickHouse/pull/4972) ([Mikhail](https://github.com/fandyushin)) ### Bug Fixes -* Fixed potential null pointer dereference in `clickhouse-copier`. [#4900](https://github.com/yandex/ClickHouse/pull/4900) ([proller](https://github.com/proller)) -* Fixed error on query with JOIN + ARRAY JOIN [#4938](https://github.com/yandex/ClickHouse/pull/4938) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed hanging on start of the server when a dictionary depends on another dictionary via a database with engine=Dictionary. [#4962](https://github.com/yandex/ClickHouse/pull/4962) ([Vitaly Baranov](https://github.com/vitlibar)) -* Partially fix distributed_product_mode = local. 
It's possible to allow columns of local tables in where/having/order by/... via table aliases. Throw exception if table does not have alias. There's not possible to access to the columns without table aliases yet. [#4986](https://github.com/yandex/ClickHouse/pull/4986) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix potentially wrong result for `SELECT DISTINCT` with `JOIN` [#5001](https://github.com/yandex/ClickHouse/pull/5001) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed very rare data race condition that could happen when executing a query with UNION ALL involving at least two SELECTs from system.columns, system.tables, system.parts, system.parts_tables or tables of Merge family and performing ALTER of columns of the related tables concurrently. [#5189](https://github.com/yandex/ClickHouse/pull/5189) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed potential null pointer dereference in `clickhouse-copier`. [#4900](https://github.com/ClickHouse/ClickHouse/pull/4900) ([proller](https://github.com/proller)) +* Fixed error on query with JOIN + ARRAY JOIN [#4938](https://github.com/ClickHouse/ClickHouse/pull/4938) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed hanging on start of the server when a dictionary depends on another dictionary via a database with engine=Dictionary. [#4962](https://github.com/ClickHouse/ClickHouse/pull/4962) ([Vitaly Baranov](https://github.com/vitlibar)) +* Partially fix distributed_product_mode = local. It's possible to allow columns of local tables in where/having/order by/... via table aliases. Throw exception if table does not have alias. There's not possible to access to the columns without table aliases yet. [#4986](https://github.com/ClickHouse/ClickHouse/pull/4986) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix potentially wrong result for `SELECT DISTINCT` with `JOIN` [#5001](https://github.com/ClickHouse/ClickHouse/pull/5001) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed very rare data race condition that could happen when executing a query with UNION ALL involving at least two SELECTs from system.columns, system.tables, system.parts, system.parts_tables or tables of Merge family and performing ALTER of columns of the related tables concurrently. [#5189](https://github.com/ClickHouse/ClickHouse/pull/5189) ([alexey-milovidov](https://github.com/alexey-milovidov)) ### Build/Testing/Packaging Improvements -* Fixed test failures when running clickhouse-server on different host [#4713](https://github.com/yandex/ClickHouse/pull/4713) ([Vasily Nemkov](https://github.com/Enmk)) -* clickhouse-test: Disable color control sequences in non tty environment. [#4937](https://github.com/yandex/ClickHouse/pull/4937) ([alesapin](https://github.com/alesapin)) -* clickhouse-test: Allow use any test database (remove `test.` qualification where it possible) [#5008](https://github.com/yandex/ClickHouse/pull/5008) ([proller](https://github.com/proller)) -* Fix ubsan errors [#5037](https://github.com/yandex/ClickHouse/pull/5037) ([Vitaly Baranov](https://github.com/vitlibar)) -* Yandex LFAlloc was added to ClickHouse to allocate MarkCache and UncompressedCache data in different ways to catch segfaults more reliable [#4995](https://github.com/yandex/ClickHouse/pull/4995) ([Danila Kutenin](https://github.com/danlark1)) -* Python util to help with backports and changelogs. 
[#4949](https://github.com/yandex/ClickHouse/pull/4949) ([Ivan](https://github.com/abyss7)) +* Fixed test failures when running clickhouse-server on different host [#4713](https://github.com/ClickHouse/ClickHouse/pull/4713) ([Vasily Nemkov](https://github.com/Enmk)) +* clickhouse-test: Disable color control sequences in non tty environment. [#4937](https://github.com/ClickHouse/ClickHouse/pull/4937) ([alesapin](https://github.com/alesapin)) +* clickhouse-test: Allow use any test database (remove `test.` qualification where it possible) [#5008](https://github.com/ClickHouse/ClickHouse/pull/5008) ([proller](https://github.com/proller)) +* Fix ubsan errors [#5037](https://github.com/ClickHouse/ClickHouse/pull/5037) ([Vitaly Baranov](https://github.com/vitlibar)) +* Yandex LFAlloc was added to ClickHouse to allocate MarkCache and UncompressedCache data in different ways to catch segfaults more reliable [#4995](https://github.com/ClickHouse/ClickHouse/pull/4995) ([Danila Kutenin](https://github.com/danlark1)) +* Python util to help with backports and changelogs. [#4949](https://github.com/ClickHouse/ClickHouse/pull/4949) ([Ivan](https://github.com/abyss7)) ## ClickHouse release 19.5.4.22, 2019-05-13 ### Bug fixes -* Fixed possible crash in bitmap* functions [#5220](https://github.com/yandex/ClickHouse/pull/5220) [#5228](https://github.com/yandex/ClickHouse/pull/5228) ([Andy Yang](https://github.com/andyyzh)) -* Fixed very rare data race condition that could happen when executing a query with UNION ALL involving at least two SELECTs from system.columns, system.tables, system.parts, system.parts_tables or tables of Merge family and performing ALTER of columns of the related tables concurrently. [#5189](https://github.com/yandex/ClickHouse/pull/5189) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed error `Set for IN is not created yet in case of using single LowCardinality column in the left part of IN`. This error happened if LowCardinality column was the part of primary key. #5031 [#5154](https://github.com/yandex/ClickHouse/pull/5154) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Modification of retention function: If a row satisfies both the first and NTH condition, only the first satisfied condition is added to the data state. Now all conditions that satisfy in a row of data are added to the data state. [#5119](https://github.com/yandex/ClickHouse/pull/5119) ([小路](https://github.com/nicelulu)) +* Fixed possible crash in bitmap* functions [#5220](https://github.com/ClickHouse/ClickHouse/pull/5220) [#5228](https://github.com/ClickHouse/ClickHouse/pull/5228) ([Andy Yang](https://github.com/andyyzh)) +* Fixed very rare data race condition that could happen when executing a query with UNION ALL involving at least two SELECTs from system.columns, system.tables, system.parts, system.parts_tables or tables of Merge family and performing ALTER of columns of the related tables concurrently. [#5189](https://github.com/ClickHouse/ClickHouse/pull/5189) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed error `Set for IN is not created yet in case of using single LowCardinality column in the left part of IN`. This error happened if LowCardinality column was the part of primary key. #5031 [#5154](https://github.com/ClickHouse/ClickHouse/pull/5154) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Modification of retention function: If a row satisfies both the first and NTH condition, only the first satisfied condition is added to the data state. 
Now all conditions that satisfy in a row of data are added to the data state. [#5119](https://github.com/ClickHouse/ClickHouse/pull/5119) ([小路](https://github.com/nicelulu)) ## ClickHouse release 19.5.3.8, 2019-04-18 ### Bug fixes -* Fixed type of setting `max_partitions_per_insert_block` from boolean to UInt64. [#5028](https://github.com/yandex/ClickHouse/pull/5028) ([Mohammad Hossein Sekhavat](https://github.com/mhsekhavat)) +* Fixed type of setting `max_partitions_per_insert_block` from boolean to UInt64. [#5028](https://github.com/ClickHouse/ClickHouse/pull/5028) ([Mohammad Hossein Sekhavat](https://github.com/mhsekhavat)) ## ClickHouse release 19.5.2.6, 2019-04-15 ### New Features -* [Hyperscan](https://github.com/intel/hyperscan) multiple regular expression matching was added (functions `multiMatchAny`, `multiMatchAnyIndex`, `multiFuzzyMatchAny`, `multiFuzzyMatchAnyIndex`). [#4780](https://github.com/yandex/ClickHouse/pull/4780), [#4841](https://github.com/yandex/ClickHouse/pull/4841) ([Danila Kutenin](https://github.com/danlark1)) -* `multiSearchFirstPosition` function was added. [#4780](https://github.com/yandex/ClickHouse/pull/4780) ([Danila Kutenin](https://github.com/danlark1)) -* Implement the predefined expression filter per row for tables. [#4792](https://github.com/yandex/ClickHouse/pull/4792) ([Ivan](https://github.com/abyss7)) -* A new type of data skipping indices based on bloom filters (can be used for `equal`, `in` and `like` functions). [#4499](https://github.com/yandex/ClickHouse/pull/4499) ([Nikita Vasilev](https://github.com/nikvas0)) -* Added `ASOF JOIN` which allows to run queries that join to the most recent value known. [#4774](https://github.com/yandex/ClickHouse/pull/4774) [#4867](https://github.com/yandex/ClickHouse/pull/4867) [#4863](https://github.com/yandex/ClickHouse/pull/4863) [#4875](https://github.com/yandex/ClickHouse/pull/4875) ([Martijn Bakker](https://github.com/Gladdy), [Artem Zuikov](https://github.com/4ertus2)) -* Rewrite multiple `COMMA JOIN` to `CROSS JOIN`. Then rewrite them to `INNER JOIN` if possible. [#4661](https://github.com/yandex/ClickHouse/pull/4661) ([Artem Zuikov](https://github.com/4ertus2)) +* [Hyperscan](https://github.com/intel/hyperscan) multiple regular expression matching was added (functions `multiMatchAny`, `multiMatchAnyIndex`, `multiFuzzyMatchAny`, `multiFuzzyMatchAnyIndex`). [#4780](https://github.com/ClickHouse/ClickHouse/pull/4780), [#4841](https://github.com/ClickHouse/ClickHouse/pull/4841) ([Danila Kutenin](https://github.com/danlark1)) +* `multiSearchFirstPosition` function was added. [#4780](https://github.com/ClickHouse/ClickHouse/pull/4780) ([Danila Kutenin](https://github.com/danlark1)) +* Implement the predefined expression filter per row for tables. [#4792](https://github.com/ClickHouse/ClickHouse/pull/4792) ([Ivan](https://github.com/abyss7)) +* A new type of data skipping indices based on bloom filters (can be used for `equal`, `in` and `like` functions). [#4499](https://github.com/ClickHouse/ClickHouse/pull/4499) ([Nikita Vasilev](https://github.com/nikvas0)) +* Added `ASOF JOIN` which allows to run queries that join to the most recent value known. 
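To make the Hyperscan-backed functions listed above concrete, a tiny sketch follows (not from the patch, local server assumed): `multiMatchAny` reports whether any of the patterns matches, and `multiMatchAnyIndex` reports which one.

```python
import urllib.request

# multiMatchAny: does any pattern match the haystack?
# multiMatchAnyIndex: index of a matching pattern (here the second pattern, 'change', matches).
query = b"""
SELECT
    multiMatchAny('clickhouse changelog', ['click.*log', 'kafka']) AS any_match,
    multiMatchAnyIndex('clickhouse changelog', ['kafka', 'change']) AS which_pattern
FORMAT TSVWithNames
"""

with urllib.request.urlopen("http://localhost:8123/", data=query) as response:
    print(response.read().decode())
```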
[#4774](https://github.com/ClickHouse/ClickHouse/pull/4774) [#4867](https://github.com/ClickHouse/ClickHouse/pull/4867) [#4863](https://github.com/ClickHouse/ClickHouse/pull/4863) [#4875](https://github.com/ClickHouse/ClickHouse/pull/4875) ([Martijn Bakker](https://github.com/Gladdy), [Artem Zuikov](https://github.com/4ertus2)) +* Rewrite multiple `COMMA JOIN` to `CROSS JOIN`. Then rewrite them to `INNER JOIN` if possible. [#4661](https://github.com/ClickHouse/ClickHouse/pull/4661) ([Artem Zuikov](https://github.com/4ertus2)) ### Improvement -* `topK` and `topKWeighted` now supports custom `loadFactor` (fixes issue [#4252](https://github.com/yandex/ClickHouse/issues/4252)). [#4634](https://github.com/yandex/ClickHouse/pull/4634) ([Kirill Danshin](https://github.com/kirillDanshin)) -* Allow to use `parallel_replicas_count > 1` even for tables without sampling (the setting is simply ignored for them). In previous versions it was lead to exception. [#4637](https://github.com/yandex/ClickHouse/pull/4637) ([Alexey Elymanov](https://github.com/digitalist)) -* Support for `CREATE OR REPLACE VIEW`. Allow to create a view or set a new definition in a single statement. [#4654](https://github.com/yandex/ClickHouse/pull/4654) ([Boris Granveaud](https://github.com/bgranvea)) -* `Buffer` table engine now supports `PREWHERE`. [#4671](https://github.com/yandex/ClickHouse/pull/4671) ([Yangkuan Liu](https://github.com/LiuYangkuan)) -* Add ability to start replicated table without metadata in zookeeper in `readonly` mode. [#4691](https://github.com/yandex/ClickHouse/pull/4691) ([alesapin](https://github.com/alesapin)) -* Fixed flicker of progress bar in clickhouse-client. The issue was most noticeable when using `FORMAT Null` with streaming queries. [#4811](https://github.com/yandex/ClickHouse/pull/4811) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Allow to disable functions with `hyperscan` library on per user basis to limit potentially excessive and uncontrolled resource usage. [#4816](https://github.com/yandex/ClickHouse/pull/4816) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add version number logging in all errors. [#4824](https://github.com/yandex/ClickHouse/pull/4824) ([proller](https://github.com/proller)) -* Added restriction to the `multiMatch` functions which requires string size to fit into `unsigned int`. Also added the number of arguments limit to the `multiSearch` functions. [#4834](https://github.com/yandex/ClickHouse/pull/4834) ([Danila Kutenin](https://github.com/danlark1)) -* Improved usage of scratch space and error handling in Hyperscan. [#4866](https://github.com/yandex/ClickHouse/pull/4866) ([Danila Kutenin](https://github.com/danlark1)) -* Fill `system.graphite_detentions` from a table config of `*GraphiteMergeTree` engine tables. [#4584](https://github.com/yandex/ClickHouse/pull/4584) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) -* Rename `trigramDistance` function to `ngramDistance` and add more functions with `CaseInsensitive` and `UTF`. [#4602](https://github.com/yandex/ClickHouse/pull/4602) ([Danila Kutenin](https://github.com/danlark1)) -* Improved data skipping indices calculation. [#4640](https://github.com/yandex/ClickHouse/pull/4640) ([Nikita Vasilev](https://github.com/nikvas0)) -* Keep ordinary, `DEFAULT`, `MATERIALIZED` and `ALIAS` columns in a single list (fixes issue [#2867](https://github.com/yandex/ClickHouse/issues/2867)). 
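A sketch of `ASOF JOIN` matching each trade with the most recent quote at or before it. The `trades` and `quotes` tables are invented, and the `USING` form with the last listed column acting as the closest-match key is an assumption about the syntax accepted by this version.

```python
import urllib.request

# Hypothetical tables: each trade is joined to the latest quote whose ts <= the trade ts.
query = b"""
SELECT t.symbol, t.ts, t.price, q.bid
FROM trades AS t
ASOF JOIN quotes AS q USING (symbol, ts)
FORMAT TSVWithNames
"""

with urllib.request.urlopen("http://localhost:8123/", data=query) as response:
    print(response.read().decode())
```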
[#4707](https://github.com/yandex/ClickHouse/pull/4707) ([Alex Zatelepin](https://github.com/ztlpn)) +* `topK` and `topKWeighted` now supports custom `loadFactor` (fixes issue [#4252](https://github.com/ClickHouse/ClickHouse/issues/4252)). [#4634](https://github.com/ClickHouse/ClickHouse/pull/4634) ([Kirill Danshin](https://github.com/kirillDanshin)) +* Allow to use `parallel_replicas_count > 1` even for tables without sampling (the setting is simply ignored for them). In previous versions it was lead to exception. [#4637](https://github.com/ClickHouse/ClickHouse/pull/4637) ([Alexey Elymanov](https://github.com/digitalist)) +* Support for `CREATE OR REPLACE VIEW`. Allow to create a view or set a new definition in a single statement. [#4654](https://github.com/ClickHouse/ClickHouse/pull/4654) ([Boris Granveaud](https://github.com/bgranvea)) +* `Buffer` table engine now supports `PREWHERE`. [#4671](https://github.com/ClickHouse/ClickHouse/pull/4671) ([Yangkuan Liu](https://github.com/LiuYangkuan)) +* Add ability to start replicated table without metadata in zookeeper in `readonly` mode. [#4691](https://github.com/ClickHouse/ClickHouse/pull/4691) ([alesapin](https://github.com/alesapin)) +* Fixed flicker of progress bar in clickhouse-client. The issue was most noticeable when using `FORMAT Null` with streaming queries. [#4811](https://github.com/ClickHouse/ClickHouse/pull/4811) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Allow to disable functions with `hyperscan` library on per user basis to limit potentially excessive and uncontrolled resource usage. [#4816](https://github.com/ClickHouse/ClickHouse/pull/4816) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Add version number logging in all errors. [#4824](https://github.com/ClickHouse/ClickHouse/pull/4824) ([proller](https://github.com/proller)) +* Added restriction to the `multiMatch` functions which requires string size to fit into `unsigned int`. Also added the number of arguments limit to the `multiSearch` functions. [#4834](https://github.com/ClickHouse/ClickHouse/pull/4834) ([Danila Kutenin](https://github.com/danlark1)) +* Improved usage of scratch space and error handling in Hyperscan. [#4866](https://github.com/ClickHouse/ClickHouse/pull/4866) ([Danila Kutenin](https://github.com/danlark1)) +* Fill `system.graphite_detentions` from a table config of `*GraphiteMergeTree` engine tables. [#4584](https://github.com/ClickHouse/ClickHouse/pull/4584) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) +* Rename `trigramDistance` function to `ngramDistance` and add more functions with `CaseInsensitive` and `UTF`. [#4602](https://github.com/ClickHouse/ClickHouse/pull/4602) ([Danila Kutenin](https://github.com/danlark1)) +* Improved data skipping indices calculation. [#4640](https://github.com/ClickHouse/ClickHouse/pull/4640) ([Nikita Vasilev](https://github.com/nikvas0)) +* Keep ordinary, `DEFAULT`, `MATERIALIZED` and `ALIAS` columns in a single list (fixes issue [#2867](https://github.com/ClickHouse/ClickHouse/issues/2867)). [#4707](https://github.com/ClickHouse/ClickHouse/pull/4707) ([Alex Zatelepin](https://github.com/ztlpn)) ### Bug Fix -* Avoid `std::terminate` in case of memory allocation failure. Now `std::bad_alloc` exception is thrown as expected. [#4665](https://github.com/yandex/ClickHouse/pull/4665) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixes capnproto reading from buffer. Sometimes files wasn't loaded successfully by HTTP. 
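For the `CREATE OR REPLACE VIEW` support noted in this release, a short sketch: the statement creates the view when it is missing and installs the new definition otherwise, in a single statement. The view name and source table are placeholders.

```python
import urllib.request

# Placeholder view over a placeholder source table.
ddl = b"""
CREATE OR REPLACE VIEW default.daily_totals AS
SELECT event_date, count() AS events
FROM default.events_with_ttl
GROUP BY event_date
"""

with urllib.request.urlopen("http://localhost:8123/", data=ddl) as response:
    response.read()
```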
[#4674](https://github.com/yandex/ClickHouse/pull/4674) ([Vladislav](https://github.com/smirnov-vs)) -* Fix error `Unknown log entry type: 0` after `OPTIMIZE TABLE FINAL` query. [#4683](https://github.com/yandex/ClickHouse/pull/4683) ([Amos Bird](https://github.com/amosbird)) -* Wrong arguments to `hasAny` or `hasAll` functions may lead to segfault. [#4698](https://github.com/yandex/ClickHouse/pull/4698) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Deadlock may happen while executing `DROP DATABASE dictionary` query. [#4701](https://github.com/yandex/ClickHouse/pull/4701) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix undefined behavior in `median` and `quantile` functions. [#4702](https://github.com/yandex/ClickHouse/pull/4702) ([hcz](https://github.com/hczhcz)) -* Fix compression level detection when `network_compression_method` in lowercase. Broken in v19.1. [#4706](https://github.com/yandex/ClickHouse/pull/4706) ([proller](https://github.com/proller)) -* Fixed ignorance of `UTC` setting (fixes issue [#4658](https://github.com/yandex/ClickHouse/issues/4658)). [#4718](https://github.com/yandex/ClickHouse/pull/4718) ([proller](https://github.com/proller)) -* Fix `histogram` function behaviour with `Distributed` tables. [#4741](https://github.com/yandex/ClickHouse/pull/4741) ([olegkv](https://github.com/olegkv)) -* Fixed tsan report `destroy of a locked mutex`. [#4742](https://github.com/yandex/ClickHouse/pull/4742) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed TSan report on shutdown due to race condition in system logs usage. Fixed potential use-after-free on shutdown when part_log is enabled. [#4758](https://github.com/yandex/ClickHouse/pull/4758) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix recheck parts in `ReplicatedMergeTreeAlterThread` in case of error. [#4772](https://github.com/yandex/ClickHouse/pull/4772) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Arithmetic operations on intermediate aggregate function states were not working for constant arguments (such as subquery results). [#4776](https://github.com/yandex/ClickHouse/pull/4776) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Always backquote column names in metadata. Otherwise it's impossible to create a table with column named `index` (server won't restart due to malformed `ATTACH` query in metadata). [#4782](https://github.com/yandex/ClickHouse/pull/4782) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix crash in `ALTER ... MODIFY ORDER BY` on `Distributed` table. [#4790](https://github.com/yandex/ClickHouse/pull/4790) ([TCeason](https://github.com/TCeason)) -* Fix segfault in `JOIN ON` with enabled `enable_optimize_predicate_expression`. [#4794](https://github.com/yandex/ClickHouse/pull/4794) ([Winter Zhang](https://github.com/zhang2014)) -* Fix bug with adding an extraneous row after consuming a protobuf message from Kafka. [#4808](https://github.com/yandex/ClickHouse/pull/4808) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix crash of `JOIN` on not-nullable vs nullable column. Fix `NULLs` in right keys in `ANY JOIN` + `join_use_nulls`. [#4815](https://github.com/yandex/ClickHouse/pull/4815) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix segmentation fault in `clickhouse-copier`. [#4835](https://github.com/yandex/ClickHouse/pull/4835) ([proller](https://github.com/proller)) -* Fixed race condition in `SELECT` from `system.tables` if the table is renamed or altered concurrently. 
[#4836](https://github.com/yandex/ClickHouse/pull/4836) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed data race when fetching data part that is already obsolete. [#4839](https://github.com/yandex/ClickHouse/pull/4839) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed rare data race that can happen during `RENAME` table of MergeTree family. [#4844](https://github.com/yandex/ClickHouse/pull/4844) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed segmentation fault in function `arrayIntersect`. Segmentation fault could happen if function was called with mixed constant and ordinary arguments. [#4847](https://github.com/yandex/ClickHouse/pull/4847) ([Lixiang Qian](https://github.com/fancyqlx)) -* Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. [#4850](https://github.com/yandex/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix crash in `FULL/RIGHT JOIN` when we joining on nullable vs not nullable. [#4855](https://github.com/yandex/ClickHouse/pull/4855) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix `No message received` exception while fetching parts between replicas. [#4856](https://github.com/yandex/ClickHouse/pull/4856) ([alesapin](https://github.com/alesapin)) -* Fixed `arrayIntersect` function wrong result in case of several repeated values in single array. [#4871](https://github.com/yandex/ClickHouse/pull/4871) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix a race condition during concurrent `ALTER COLUMN` queries that could lead to a server crash (fixes issue [#3421](https://github.com/yandex/ClickHouse/issues/3421)). [#4592](https://github.com/yandex/ClickHouse/pull/4592) ([Alex Zatelepin](https://github.com/ztlpn)) -* Fix incorrect result in `FULL/RIGHT JOIN` with const column. [#4723](https://github.com/yandex/ClickHouse/pull/4723) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix duplicates in `GLOBAL JOIN` with asterisk. [#4705](https://github.com/yandex/ClickHouse/pull/4705) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix parameter deduction in `ALTER MODIFY` of column `CODEC` when column type is not specified. [#4883](https://github.com/yandex/ClickHouse/pull/4883) ([alesapin](https://github.com/alesapin)) -* Functions `cutQueryStringAndFragment()` and `queryStringAndFragment()` now works correctly when `URL` contains a fragment and no query. [#4894](https://github.com/yandex/ClickHouse/pull/4894) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix rare bug when setting `min_bytes_to_use_direct_io` is greater than zero, which occures when thread have to seek backward in column file. [#4897](https://github.com/yandex/ClickHouse/pull/4897) ([alesapin](https://github.com/alesapin)) -* Fix wrong argument types for aggregate functions with `LowCardinality` arguments (fixes issue [#4919](https://github.com/yandex/ClickHouse/issues/4919)). [#4922](https://github.com/yandex/ClickHouse/pull/4922) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix wrong name qualification in `GLOBAL JOIN`. [#4969](https://github.com/yandex/ClickHouse/pull/4969) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix function `toISOWeek` result for year 1970. [#4988](https://github.com/yandex/ClickHouse/pull/4988) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix `DROP`, `TRUNCATE` and `OPTIMIZE` queries duplication, when executed on `ON CLUSTER` for `ReplicatedMergeTree*` tables family. 
[#4991](https://github.com/yandex/ClickHouse/pull/4991) ([alesapin](https://github.com/alesapin)) +* Avoid `std::terminate` in case of memory allocation failure. Now `std::bad_alloc` exception is thrown as expected. [#4665](https://github.com/ClickHouse/ClickHouse/pull/4665) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixes capnproto reading from buffer. Sometimes files weren't loaded successfully over HTTP. [#4674](https://github.com/ClickHouse/ClickHouse/pull/4674) ([Vladislav](https://github.com/smirnov-vs)) +* Fix error `Unknown log entry type: 0` after `OPTIMIZE TABLE FINAL` query. [#4683](https://github.com/ClickHouse/ClickHouse/pull/4683) ([Amos Bird](https://github.com/amosbird)) +* Wrong arguments to `hasAny` or `hasAll` functions may lead to segfault. [#4698](https://github.com/ClickHouse/ClickHouse/pull/4698) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Deadlock may happen while executing `DROP DATABASE dictionary` query. [#4701](https://github.com/ClickHouse/ClickHouse/pull/4701) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix undefined behavior in `median` and `quantile` functions. [#4702](https://github.com/ClickHouse/ClickHouse/pull/4702) ([hcz](https://github.com/hczhcz)) +* Fix compression level detection when `network_compression_method` is in lowercase. Broken in v19.1. [#4706](https://github.com/ClickHouse/ClickHouse/pull/4706) ([proller](https://github.com/proller)) +* Fixed the `UTC` setting being ignored (fixes issue [#4658](https://github.com/ClickHouse/ClickHouse/issues/4658)). [#4718](https://github.com/ClickHouse/ClickHouse/pull/4718) ([proller](https://github.com/proller)) +* Fix `histogram` function behaviour with `Distributed` tables. [#4741](https://github.com/ClickHouse/ClickHouse/pull/4741) ([olegkv](https://github.com/olegkv)) +* Fixed TSan report `destroy of a locked mutex`. [#4742](https://github.com/ClickHouse/ClickHouse/pull/4742) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed TSan report on shutdown due to race condition in system logs usage. Fixed potential use-after-free on shutdown when part_log is enabled. [#4758](https://github.com/ClickHouse/ClickHouse/pull/4758) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix rechecking of parts in `ReplicatedMergeTreeAlterThread` in case of error. [#4772](https://github.com/ClickHouse/ClickHouse/pull/4772) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Arithmetic operations on intermediate aggregate function states were not working for constant arguments (such as subquery results). [#4776](https://github.com/ClickHouse/ClickHouse/pull/4776) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Always backquote column names in metadata. Otherwise it's impossible to create a table with a column named `index` (server won't restart due to malformed `ATTACH` query in metadata). [#4782](https://github.com/ClickHouse/ClickHouse/pull/4782) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix crash in `ALTER ... MODIFY ORDER BY` on `Distributed` table. [#4790](https://github.com/ClickHouse/ClickHouse/pull/4790) ([TCeason](https://github.com/TCeason)) +* Fix segfault in `JOIN ON` with enabled `enable_optimize_predicate_expression`. [#4794](https://github.com/ClickHouse/ClickHouse/pull/4794) ([Winter Zhang](https://github.com/zhang2014)) +* Fix bug with adding an extraneous row after consuming a protobuf message from Kafka. 
[#4808](https://github.com/ClickHouse/ClickHouse/pull/4808) ([Vitaly Baranov](https://github.com/vitlibar)) +* Fix crash of `JOIN` on not-nullable vs nullable column. Fix `NULLs` in right keys in `ANY JOIN` + `join_use_nulls`. [#4815](https://github.com/ClickHouse/ClickHouse/pull/4815) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix segmentation fault in `clickhouse-copier`. [#4835](https://github.com/ClickHouse/ClickHouse/pull/4835) ([proller](https://github.com/proller)) +* Fixed race condition in `SELECT` from `system.tables` if the table is renamed or altered concurrently. [#4836](https://github.com/ClickHouse/ClickHouse/pull/4836) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed data race when fetching data part that is already obsolete. [#4839](https://github.com/ClickHouse/ClickHouse/pull/4839) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed rare data race that can happen during `RENAME` table of MergeTree family. [#4844](https://github.com/ClickHouse/ClickHouse/pull/4844) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed segmentation fault in function `arrayIntersect`. Segmentation fault could happen if function was called with mixed constant and ordinary arguments. [#4847](https://github.com/ClickHouse/ClickHouse/pull/4847) ([Lixiang Qian](https://github.com/fancyqlx)) +* Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. [#4850](https://github.com/ClickHouse/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Fix crash in `FULL/RIGHT JOIN` when we joining on nullable vs not nullable. [#4855](https://github.com/ClickHouse/ClickHouse/pull/4855) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix `No message received` exception while fetching parts between replicas. [#4856](https://github.com/ClickHouse/ClickHouse/pull/4856) ([alesapin](https://github.com/alesapin)) +* Fixed `arrayIntersect` function wrong result in case of several repeated values in single array. [#4871](https://github.com/ClickHouse/ClickHouse/pull/4871) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Fix a race condition during concurrent `ALTER COLUMN` queries that could lead to a server crash (fixes issue [#3421](https://github.com/ClickHouse/ClickHouse/issues/3421)). [#4592](https://github.com/ClickHouse/ClickHouse/pull/4592) ([Alex Zatelepin](https://github.com/ztlpn)) +* Fix incorrect result in `FULL/RIGHT JOIN` with const column. [#4723](https://github.com/ClickHouse/ClickHouse/pull/4723) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix duplicates in `GLOBAL JOIN` with asterisk. [#4705](https://github.com/ClickHouse/ClickHouse/pull/4705) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix parameter deduction in `ALTER MODIFY` of column `CODEC` when column type is not specified. [#4883](https://github.com/ClickHouse/ClickHouse/pull/4883) ([alesapin](https://github.com/alesapin)) +* Functions `cutQueryStringAndFragment()` and `queryStringAndFragment()` now works correctly when `URL` contains a fragment and no query. [#4894](https://github.com/ClickHouse/ClickHouse/pull/4894) ([Vitaly Baranov](https://github.com/vitlibar)) +* Fix rare bug when setting `min_bytes_to_use_direct_io` is greater than zero, which occures when thread have to seek backward in column file. 
[#4897](https://github.com/ClickHouse/ClickHouse/pull/4897) ([alesapin](https://github.com/alesapin)) +* Fix wrong argument types for aggregate functions with `LowCardinality` arguments (fixes issue [#4919](https://github.com/ClickHouse/ClickHouse/issues/4919)). [#4922](https://github.com/ClickHouse/ClickHouse/pull/4922) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Fix wrong name qualification in `GLOBAL JOIN`. [#4969](https://github.com/ClickHouse/ClickHouse/pull/4969) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix function `toISOWeek` result for year 1970. [#4988](https://github.com/ClickHouse/ClickHouse/pull/4988) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix `DROP`, `TRUNCATE` and `OPTIMIZE` queries duplication, when executed on `ON CLUSTER` for `ReplicatedMergeTree*` tables family. [#4991](https://github.com/ClickHouse/ClickHouse/pull/4991) ([alesapin](https://github.com/alesapin)) ### Backward Incompatible Change -* Rename setting `insert_sample_with_metadata` to setting `input_format_defaults_for_omitted_fields`. [#4771](https://github.com/yandex/ClickHouse/pull/4771) ([Artem Zuikov](https://github.com/4ertus2)) -* Added setting `max_partitions_per_insert_block` (with value 100 by default). If inserted block contains larger number of partitions, an exception is thrown. Set it to 0 if you want to remove the limit (not recommended). [#4845](https://github.com/yandex/ClickHouse/pull/4845) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Multi-search functions were renamed (`multiPosition` to `multiSearchAllPositions`, `multiSearch` to `multiSearchAny`, `firstMatch` to `multiSearchFirstIndex`). [#4780](https://github.com/yandex/ClickHouse/pull/4780) ([Danila Kutenin](https://github.com/danlark1)) +* Rename setting `insert_sample_with_metadata` to setting `input_format_defaults_for_omitted_fields`. [#4771](https://github.com/ClickHouse/ClickHouse/pull/4771) ([Artem Zuikov](https://github.com/4ertus2)) +* Added setting `max_partitions_per_insert_block` (with value 100 by default). If inserted block contains larger number of partitions, an exception is thrown. Set it to 0 if you want to remove the limit (not recommended). [#4845](https://github.com/ClickHouse/ClickHouse/pull/4845) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Multi-search functions were renamed (`multiPosition` to `multiSearchAllPositions`, `multiSearch` to `multiSearchAny`, `firstMatch` to `multiSearchFirstIndex`). [#4780](https://github.com/ClickHouse/ClickHouse/pull/4780) ([Danila Kutenin](https://github.com/danlark1)) ### Performance Improvement -* Optimize Volnitsky searcher by inlining, giving about 5-10% search improvement for queries with many needles or many similar bigrams. [#4862](https://github.com/yandex/ClickHouse/pull/4862) ([Danila Kutenin](https://github.com/danlark1)) -* Fix performance issue when setting `use_uncompressed_cache` is greater than zero, which appeared when all read data contained in cache. [#4913](https://github.com/yandex/ClickHouse/pull/4913) ([alesapin](https://github.com/alesapin)) +* Optimize Volnitsky searcher by inlining, giving about 5-10% search improvement for queries with many needles or many similar bigrams. [#4862](https://github.com/ClickHouse/ClickHouse/pull/4862) ([Danila Kutenin](https://github.com/danlark1)) +* Fix performance issue when setting `use_uncompressed_cache` is greater than zero, which appeared when all read data contained in cache. 
[#4913](https://github.com/ClickHouse/ClickHouse/pull/4913) ([alesapin](https://github.com/alesapin)) ### Build/Testing/Packaging Improvement -* Hardening debug build: more granular memory mappings and ASLR; add memory protection for mark cache and index. This allows to find more memory stomping bugs in case when ASan and MSan cannot do it. [#4632](https://github.com/yandex/ClickHouse/pull/4632) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add support for cmake variables `ENABLE_PROTOBUF`, `ENABLE_PARQUET` and `ENABLE_BROTLI` which allows to enable/disable the above features (same as we can do for librdkafka, mysql, etc). [#4669](https://github.com/yandex/ClickHouse/pull/4669) ([Silviu Caragea](https://github.com/silviucpp)) -* Add ability to print process list and stacktraces of all threads if some queries are hung after test run. [#4675](https://github.com/yandex/ClickHouse/pull/4675) ([alesapin](https://github.com/alesapin)) -* Add retries on `Connection loss` error in `clickhouse-test`. [#4682](https://github.com/yandex/ClickHouse/pull/4682) ([alesapin](https://github.com/alesapin)) -* Add freebsd build with vagrant and build with thread sanitizer to packager script. [#4712](https://github.com/yandex/ClickHouse/pull/4712) [#4748](https://github.com/yandex/ClickHouse/pull/4748) ([alesapin](https://github.com/alesapin)) -* Now user asked for password for user `'default'` during installation. [#4725](https://github.com/yandex/ClickHouse/pull/4725) ([proller](https://github.com/proller)) -* Suppress warning in `rdkafka` library. [#4740](https://github.com/yandex/ClickHouse/pull/4740) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Allow ability to build without ssl. [#4750](https://github.com/yandex/ClickHouse/pull/4750) ([proller](https://github.com/proller)) -* Add a way to launch clickhouse-server image from a custom user. [#4753](https://github.com/yandex/ClickHouse/pull/4753) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) -* Upgrade contrib boost to 1.69. [#4793](https://github.com/yandex/ClickHouse/pull/4793) ([proller](https://github.com/proller)) -* Disable usage of `mremap` when compiled with Thread Sanitizer. Surprisingly enough, TSan does not intercept `mremap` (though it does intercept `mmap`, `munmap`) that leads to false positives. Fixed TSan report in stateful tests. [#4859](https://github.com/yandex/ClickHouse/pull/4859) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add test checking using format schema via HTTP interface. [#4864](https://github.com/yandex/ClickHouse/pull/4864) ([Vitaly Baranov](https://github.com/vitlibar)) +* Hardening debug build: more granular memory mappings and ASLR; add memory protection for mark cache and index. This allows to find more memory stomping bugs in case when ASan and MSan cannot do it. [#4632](https://github.com/ClickHouse/ClickHouse/pull/4632) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Add support for cmake variables `ENABLE_PROTOBUF`, `ENABLE_PARQUET` and `ENABLE_BROTLI` which allows to enable/disable the above features (same as we can do for librdkafka, mysql, etc). [#4669](https://github.com/ClickHouse/ClickHouse/pull/4669) ([Silviu Caragea](https://github.com/silviucpp)) +* Add ability to print process list and stacktraces of all threads if some queries are hung after test run. [#4675](https://github.com/ClickHouse/ClickHouse/pull/4675) ([alesapin](https://github.com/alesapin)) +* Add retries on `Connection loss` error in `clickhouse-test`. 
[#4682](https://github.com/ClickHouse/ClickHouse/pull/4682) ([alesapin](https://github.com/alesapin)) +* Add freebsd build with vagrant and build with thread sanitizer to packager script. [#4712](https://github.com/ClickHouse/ClickHouse/pull/4712) [#4748](https://github.com/ClickHouse/ClickHouse/pull/4748) ([alesapin](https://github.com/alesapin)) +* Now user asked for password for user `'default'` during installation. [#4725](https://github.com/ClickHouse/ClickHouse/pull/4725) ([proller](https://github.com/proller)) +* Suppress warning in `rdkafka` library. [#4740](https://github.com/ClickHouse/ClickHouse/pull/4740) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Allow ability to build without ssl. [#4750](https://github.com/ClickHouse/ClickHouse/pull/4750) ([proller](https://github.com/proller)) +* Add a way to launch clickhouse-server image from a custom user. [#4753](https://github.com/ClickHouse/ClickHouse/pull/4753) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) +* Upgrade contrib boost to 1.69. [#4793](https://github.com/ClickHouse/ClickHouse/pull/4793) ([proller](https://github.com/proller)) +* Disable usage of `mremap` when compiled with Thread Sanitizer. Surprisingly enough, TSan does not intercept `mremap` (though it does intercept `mmap`, `munmap`) that leads to false positives. Fixed TSan report in stateful tests. [#4859](https://github.com/ClickHouse/ClickHouse/pull/4859) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Add test checking using format schema via HTTP interface. [#4864](https://github.com/ClickHouse/ClickHouse/pull/4864) ([Vitaly Baranov](https://github.com/vitlibar)) ## ClickHouse release 19.4.4.33, 2019-04-17 ### Bug Fixes -* Avoid `std::terminate` in case of memory allocation failure. Now `std::bad_alloc` exception is thrown as expected. [#4665](https://github.com/yandex/ClickHouse/pull/4665) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixes capnproto reading from buffer. Sometimes files wasn't loaded successfully by HTTP. [#4674](https://github.com/yandex/ClickHouse/pull/4674) ([Vladislav](https://github.com/smirnov-vs)) -* Fix error `Unknown log entry type: 0` after `OPTIMIZE TABLE FINAL` query. [#4683](https://github.com/yandex/ClickHouse/pull/4683) ([Amos Bird](https://github.com/amosbird)) -* Wrong arguments to `hasAny` or `hasAll` functions may lead to segfault. [#4698](https://github.com/yandex/ClickHouse/pull/4698) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Deadlock may happen while executing `DROP DATABASE dictionary` query. [#4701](https://github.com/yandex/ClickHouse/pull/4701) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix undefined behavior in `median` and `quantile` functions. [#4702](https://github.com/yandex/ClickHouse/pull/4702) ([hcz](https://github.com/hczhcz)) -* Fix compression level detection when `network_compression_method` in lowercase. Broken in v19.1. [#4706](https://github.com/yandex/ClickHouse/pull/4706) ([proller](https://github.com/proller)) -* Fixed ignorance of `UTC` setting (fixes issue [#4658](https://github.com/yandex/ClickHouse/issues/4658)). [#4718](https://github.com/yandex/ClickHouse/pull/4718) ([proller](https://github.com/proller)) -* Fix `histogram` function behaviour with `Distributed` tables. [#4741](https://github.com/yandex/ClickHouse/pull/4741) ([olegkv](https://github.com/olegkv)) -* Fixed tsan report `destroy of a locked mutex`. 
[#4742](https://github.com/yandex/ClickHouse/pull/4742) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed TSan report on shutdown due to race condition in system logs usage. Fixed potential use-after-free on shutdown when part_log is enabled. [#4758](https://github.com/yandex/ClickHouse/pull/4758) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix recheck parts in `ReplicatedMergeTreeAlterThread` in case of error. [#4772](https://github.com/yandex/ClickHouse/pull/4772) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Arithmetic operations on intermediate aggregate function states were not working for constant arguments (such as subquery results). [#4776](https://github.com/yandex/ClickHouse/pull/4776) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Always backquote column names in metadata. Otherwise it's impossible to create a table with column named `index` (server won't restart due to malformed `ATTACH` query in metadata). [#4782](https://github.com/yandex/ClickHouse/pull/4782) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix crash in `ALTER ... MODIFY ORDER BY` on `Distributed` table. [#4790](https://github.com/yandex/ClickHouse/pull/4790) ([TCeason](https://github.com/TCeason)) -* Fix segfault in `JOIN ON` with enabled `enable_optimize_predicate_expression`. [#4794](https://github.com/yandex/ClickHouse/pull/4794) ([Winter Zhang](https://github.com/zhang2014)) -* Fix bug with adding an extraneous row after consuming a protobuf message from Kafka. [#4808](https://github.com/yandex/ClickHouse/pull/4808) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix segmentation fault in `clickhouse-copier`. [#4835](https://github.com/yandex/ClickHouse/pull/4835) ([proller](https://github.com/proller)) -* Fixed race condition in `SELECT` from `system.tables` if the table is renamed or altered concurrently. [#4836](https://github.com/yandex/ClickHouse/pull/4836) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed data race when fetching data part that is already obsolete. [#4839](https://github.com/yandex/ClickHouse/pull/4839) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed rare data race that can happen during `RENAME` table of MergeTree family. [#4844](https://github.com/yandex/ClickHouse/pull/4844) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed segmentation fault in function `arrayIntersect`. Segmentation fault could happen if function was called with mixed constant and ordinary arguments. [#4847](https://github.com/yandex/ClickHouse/pull/4847) ([Lixiang Qian](https://github.com/fancyqlx)) -* Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. [#4850](https://github.com/yandex/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix `No message received` exception while fetching parts between replicas. [#4856](https://github.com/yandex/ClickHouse/pull/4856) ([alesapin](https://github.com/alesapin)) -* Fixed `arrayIntersect` function wrong result in case of several repeated values in single array. [#4871](https://github.com/yandex/ClickHouse/pull/4871) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix a race condition during concurrent `ALTER COLUMN` queries that could lead to a server crash (fixes issue [#3421](https://github.com/yandex/ClickHouse/issues/3421)). 
[#4592](https://github.com/yandex/ClickHouse/pull/4592) ([Alex Zatelepin](https://github.com/ztlpn)) -* Fix parameter deduction in `ALTER MODIFY` of column `CODEC` when column type is not specified. [#4883](https://github.com/yandex/ClickHouse/pull/4883) ([alesapin](https://github.com/alesapin)) -* Functions `cutQueryStringAndFragment()` and `queryStringAndFragment()` now works correctly when `URL` contains a fragment and no query. [#4894](https://github.com/yandex/ClickHouse/pull/4894) ([Vitaly Baranov](https://github.com/vitlibar)) -* Fix rare bug when setting `min_bytes_to_use_direct_io` is greater than zero, which occures when thread have to seek backward in column file. [#4897](https://github.com/yandex/ClickHouse/pull/4897) ([alesapin](https://github.com/alesapin)) -* Fix wrong argument types for aggregate functions with `LowCardinality` arguments (fixes issue [#4919](https://github.com/yandex/ClickHouse/issues/4919)). [#4922](https://github.com/yandex/ClickHouse/pull/4922) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fix function `toISOWeek` result for year 1970. [#4988](https://github.com/yandex/ClickHouse/pull/4988) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix `DROP`, `TRUNCATE` and `OPTIMIZE` queries duplication, when executed on `ON CLUSTER` for `ReplicatedMergeTree*` tables family. [#4991](https://github.com/yandex/ClickHouse/pull/4991) ([alesapin](https://github.com/alesapin)) +* Avoid `std::terminate` in case of memory allocation failure. Now `std::bad_alloc` exception is thrown as expected. [#4665](https://github.com/ClickHouse/ClickHouse/pull/4665) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixes capnproto reading from buffer. Sometimes files wasn't loaded successfully by HTTP. [#4674](https://github.com/ClickHouse/ClickHouse/pull/4674) ([Vladislav](https://github.com/smirnov-vs)) +* Fix error `Unknown log entry type: 0` after `OPTIMIZE TABLE FINAL` query. [#4683](https://github.com/ClickHouse/ClickHouse/pull/4683) ([Amos Bird](https://github.com/amosbird)) +* Wrong arguments to `hasAny` or `hasAll` functions may lead to segfault. [#4698](https://github.com/ClickHouse/ClickHouse/pull/4698) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Deadlock may happen while executing `DROP DATABASE dictionary` query. [#4701](https://github.com/ClickHouse/ClickHouse/pull/4701) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix undefined behavior in `median` and `quantile` functions. [#4702](https://github.com/ClickHouse/ClickHouse/pull/4702) ([hcz](https://github.com/hczhcz)) +* Fix compression level detection when `network_compression_method` in lowercase. Broken in v19.1. [#4706](https://github.com/ClickHouse/ClickHouse/pull/4706) ([proller](https://github.com/proller)) +* Fixed ignorance of `UTC` setting (fixes issue [#4658](https://github.com/ClickHouse/ClickHouse/issues/4658)). [#4718](https://github.com/ClickHouse/ClickHouse/pull/4718) ([proller](https://github.com/proller)) +* Fix `histogram` function behaviour with `Distributed` tables. [#4741](https://github.com/ClickHouse/ClickHouse/pull/4741) ([olegkv](https://github.com/olegkv)) +* Fixed tsan report `destroy of a locked mutex`. [#4742](https://github.com/ClickHouse/ClickHouse/pull/4742) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed TSan report on shutdown due to race condition in system logs usage. Fixed potential use-after-free on shutdown when part_log is enabled. 
[#4758](https://github.com/ClickHouse/ClickHouse/pull/4758) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix recheck parts in `ReplicatedMergeTreeAlterThread` in case of error. [#4772](https://github.com/ClickHouse/ClickHouse/pull/4772) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Arithmetic operations on intermediate aggregate function states were not working for constant arguments (such as subquery results). [#4776](https://github.com/ClickHouse/ClickHouse/pull/4776) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Always backquote column names in metadata. Otherwise it's impossible to create a table with column named `index` (server won't restart due to malformed `ATTACH` query in metadata). [#4782](https://github.com/ClickHouse/ClickHouse/pull/4782) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix crash in `ALTER ... MODIFY ORDER BY` on `Distributed` table. [#4790](https://github.com/ClickHouse/ClickHouse/pull/4790) ([TCeason](https://github.com/TCeason)) +* Fix segfault in `JOIN ON` with enabled `enable_optimize_predicate_expression`. [#4794](https://github.com/ClickHouse/ClickHouse/pull/4794) ([Winter Zhang](https://github.com/zhang2014)) +* Fix bug with adding an extraneous row after consuming a protobuf message from Kafka. [#4808](https://github.com/ClickHouse/ClickHouse/pull/4808) ([Vitaly Baranov](https://github.com/vitlibar)) +* Fix segmentation fault in `clickhouse-copier`. [#4835](https://github.com/ClickHouse/ClickHouse/pull/4835) ([proller](https://github.com/proller)) +* Fixed race condition in `SELECT` from `system.tables` if the table is renamed or altered concurrently. [#4836](https://github.com/ClickHouse/ClickHouse/pull/4836) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed data race when fetching data part that is already obsolete. [#4839](https://github.com/ClickHouse/ClickHouse/pull/4839) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed rare data race that can happen during `RENAME` table of MergeTree family. [#4844](https://github.com/ClickHouse/ClickHouse/pull/4844) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed segmentation fault in function `arrayIntersect`. Segmentation fault could happen if function was called with mixed constant and ordinary arguments. [#4847](https://github.com/ClickHouse/ClickHouse/pull/4847) ([Lixiang Qian](https://github.com/fancyqlx)) +* Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. [#4850](https://github.com/ClickHouse/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Fix `No message received` exception while fetching parts between replicas. [#4856](https://github.com/ClickHouse/ClickHouse/pull/4856) ([alesapin](https://github.com/alesapin)) +* Fixed `arrayIntersect` function wrong result in case of several repeated values in single array. [#4871](https://github.com/ClickHouse/ClickHouse/pull/4871) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Fix a race condition during concurrent `ALTER COLUMN` queries that could lead to a server crash (fixes issue [#3421](https://github.com/ClickHouse/ClickHouse/issues/3421)). [#4592](https://github.com/ClickHouse/ClickHouse/pull/4592) ([Alex Zatelepin](https://github.com/ztlpn)) +* Fix parameter deduction in `ALTER MODIFY` of column `CODEC` when column type is not specified. 
[#4883](https://github.com/ClickHouse/ClickHouse/pull/4883) ([alesapin](https://github.com/alesapin)) +* Functions `cutQueryStringAndFragment()` and `queryStringAndFragment()` now work correctly when `URL` contains a fragment and no query. [#4894](https://github.com/ClickHouse/ClickHouse/pull/4894) ([Vitaly Baranov](https://github.com/vitlibar)) +* Fix rare bug when setting `min_bytes_to_use_direct_io` is greater than zero, which occurs when a thread has to seek backward in a column file. [#4897](https://github.com/ClickHouse/ClickHouse/pull/4897) ([alesapin](https://github.com/alesapin)) +* Fix wrong argument types for aggregate functions with `LowCardinality` arguments (fixes issue [#4919](https://github.com/ClickHouse/ClickHouse/issues/4919)). [#4922](https://github.com/ClickHouse/ClickHouse/pull/4922) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Fix function `toISOWeek` result for year 1970. [#4988](https://github.com/ClickHouse/ClickHouse/pull/4988) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix `DROP`, `TRUNCATE` and `OPTIMIZE` queries duplication when executed with `ON CLUSTER` for `ReplicatedMergeTree*` tables family. [#4991](https://github.com/ClickHouse/ClickHouse/pull/4991) ([alesapin](https://github.com/alesapin)) ### Improvements -* Keep ordinary, `DEFAULT`, `MATERIALIZED` and `ALIAS` columns in a single list (fixes issue [#2867](https://github.com/yandex/ClickHouse/issues/2867)). [#4707](https://github.com/yandex/ClickHouse/pull/4707) ([Alex Zatelepin](https://github.com/ztlpn)) +* Keep ordinary, `DEFAULT`, `MATERIALIZED` and `ALIAS` columns in a single list (fixes issue [#2867](https://github.com/ClickHouse/ClickHouse/issues/2867)). [#4707](https://github.com/ClickHouse/ClickHouse/pull/4707) ([Alex Zatelepin](https://github.com/ztlpn)) ## ClickHouse release 19.4.3.11, 2019-04-02 ### Bug Fixes -* Fix crash in `FULL/RIGHT JOIN` when we joining on nullable vs not nullable. [#4855](https://github.com/yandex/ClickHouse/pull/4855) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix segmentation fault in `clickhouse-copier`. [#4835](https://github.com/yandex/ClickHouse/pull/4835) ([proller](https://github.com/proller)) +* Fix crash in `FULL/RIGHT JOIN` when joining on nullable vs not nullable. [#4855](https://github.com/ClickHouse/ClickHouse/pull/4855) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix segmentation fault in `clickhouse-copier`. [#4835](https://github.com/ClickHouse/ClickHouse/pull/4835) ([proller](https://github.com/proller)) ### Build/Testing/Packaging Improvement -* Add a way to launch clickhouse-server image from a custom user. [#4753](https://github.com/yandex/ClickHouse/pull/4753) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) +* Add a way to launch clickhouse-server image from a custom user. [#4753](https://github.com/ClickHouse/ClickHouse/pull/4753) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) ## ClickHouse release 19.4.2.7, 2019-03-30 ### Bug Fixes -* Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. 
[#4850](https://github.com/ClickHouse/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) ## ClickHouse release 19.4.1.3, 2019-03-19 ### Bug Fixes -* Fixed remote queries which contain both `LIMIT BY` and `LIMIT`. Previously, if `LIMIT BY` and `LIMIT` were used for remote query, `LIMIT` could happen before `LIMIT BY`, which led to too filtered result. [#4708](https://github.com/yandex/ClickHouse/pull/4708) ([Constantin S. Pan](https://github.com/kvap)) +* Fixed remote queries which contain both `LIMIT BY` and `LIMIT`. Previously, if `LIMIT BY` and `LIMIT` were used for remote query, `LIMIT` could happen before `LIMIT BY`, which led to too filtered result. [#4708](https://github.com/ClickHouse/ClickHouse/pull/4708) ([Constantin S. Pan](https://github.com/kvap)) ## ClickHouse release 19.4.0.49, 2019-03-09 ### New Features -* Added full support for `Protobuf` format (input and output, nested data structures). [#4174](https://github.com/yandex/ClickHouse/pull/4174) [#4493](https://github.com/yandex/ClickHouse/pull/4493) ([Vitaly Baranov](https://github.com/vitlibar)) -* Added bitmap functions with Roaring Bitmaps. [#4207](https://github.com/yandex/ClickHouse/pull/4207) ([Andy Yang](https://github.com/andyyzh)) [#4568](https://github.com/yandex/ClickHouse/pull/4568) ([Vitaly Baranov](https://github.com/vitlibar)) -* Parquet format support. [#4448](https://github.com/yandex/ClickHouse/pull/4448) ([proller](https://github.com/proller)) -* N-gram distance was added for fuzzy string comparison. It is similar to q-gram metrics in R language. [#4466](https://github.com/yandex/ClickHouse/pull/4466) ([Danila Kutenin](https://github.com/danlark1)) -* Combine rules for graphite rollup from dedicated aggregation and retention patterns. [#4426](https://github.com/yandex/ClickHouse/pull/4426) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) -* Added `max_execution_speed` and `max_execution_speed_bytes` to limit resource usage. Added `min_execution_speed_bytes` setting to complement the `min_execution_speed`. [#4430](https://github.com/yandex/ClickHouse/pull/4430) ([Winter Zhang](https://github.com/zhang2014)) -* Implemented function `flatten`. [#4555](https://github.com/yandex/ClickHouse/pull/4555) [#4409](https://github.com/yandex/ClickHouse/pull/4409) ([alexey-milovidov](https://github.com/alexey-milovidov), [kzon](https://github.com/kzon)) -* Added functions `arrayEnumerateDenseRanked` and `arrayEnumerateUniqRanked` (it's like `arrayEnumerateUniq` but allows to fine tune array depth to look inside multidimensional arrays). [#4475](https://github.com/yandex/ClickHouse/pull/4475) ([proller](https://github.com/proller)) [#4601](https://github.com/yandex/ClickHouse/pull/4601) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Multiple JOINS with some restrictions: no asterisks, no complex aliases in ON/WHERE/GROUP BY/... [#4462](https://github.com/yandex/ClickHouse/pull/4462) ([Artem Zuikov](https://github.com/4ertus2)) +* Added full support for `Protobuf` format (input and output, nested data structures). [#4174](https://github.com/ClickHouse/ClickHouse/pull/4174) [#4493](https://github.com/ClickHouse/ClickHouse/pull/4493) ([Vitaly Baranov](https://github.com/vitlibar)) +* Added bitmap functions with Roaring Bitmaps. [#4207](https://github.com/ClickHouse/ClickHouse/pull/4207) ([Andy Yang](https://github.com/andyyzh)) [#4568](https://github.com/ClickHouse/ClickHouse/pull/4568) ([Vitaly Baranov](https://github.com/vitlibar)) +* Parquet format support. 
[#4448](https://github.com/ClickHouse/ClickHouse/pull/4448) ([proller](https://github.com/proller)) +* N-gram distance was added for fuzzy string comparison. It is similar to q-gram metrics in R language. [#4466](https://github.com/ClickHouse/ClickHouse/pull/4466) ([Danila Kutenin](https://github.com/danlark1)) +* Combine rules for graphite rollup from dedicated aggregation and retention patterns. [#4426](https://github.com/ClickHouse/ClickHouse/pull/4426) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) +* Added `max_execution_speed` and `max_execution_speed_bytes` to limit resource usage. Added `min_execution_speed_bytes` setting to complement the `min_execution_speed`. [#4430](https://github.com/ClickHouse/ClickHouse/pull/4430) ([Winter Zhang](https://github.com/zhang2014)) +* Implemented function `flatten`. [#4555](https://github.com/ClickHouse/ClickHouse/pull/4555) [#4409](https://github.com/ClickHouse/ClickHouse/pull/4409) ([alexey-milovidov](https://github.com/alexey-milovidov), [kzon](https://github.com/kzon)) +* Added functions `arrayEnumerateDenseRanked` and `arrayEnumerateUniqRanked` (it's like `arrayEnumerateUniq` but allows to fine tune array depth to look inside multidimensional arrays). [#4475](https://github.com/ClickHouse/ClickHouse/pull/4475) ([proller](https://github.com/proller)) [#4601](https://github.com/ClickHouse/ClickHouse/pull/4601) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Multiple JOINS with some restrictions: no asterisks, no complex aliases in ON/WHERE/GROUP BY/... [#4462](https://github.com/ClickHouse/ClickHouse/pull/4462) ([Artem Zuikov](https://github.com/4ertus2)) ### Bug Fixes * This release also contains all bug fixes from 19.3 and 19.1. -* Fixed bug in data skipping indices: order of granules after INSERT was incorrect. [#4407](https://github.com/yandex/ClickHouse/pull/4407) ([Nikita Vasilev](https://github.com/nikvas0)) -* Fixed `set` index for `Nullable` and `LowCardinality` columns. Before it, `set` index with `Nullable` or `LowCardinality` column led to error `Data type must be deserialized with multiple streams` while selecting. [#4594](https://github.com/yandex/ClickHouse/pull/4594) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Correctly set update_time on full `executable` dictionary update. [#4551](https://github.com/yandex/ClickHouse/pull/4551) ([Tema Novikov](https://github.com/temoon)) -* Fix broken progress bar in 19.3. [#4627](https://github.com/yandex/ClickHouse/pull/4627) ([filimonov](https://github.com/filimonov)) -* Fixed inconsistent values of MemoryTracker when memory region was shrinked, in certain cases. [#4619](https://github.com/yandex/ClickHouse/pull/4619) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed undefined behaviour in ThreadPool. [#4612](https://github.com/yandex/ClickHouse/pull/4612) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed a very rare crash with the message `mutex lock failed: Invalid argument` that could happen when a MergeTree table was dropped concurrently with a SELECT. [#4608](https://github.com/yandex/ClickHouse/pull/4608) ([Alex Zatelepin](https://github.com/ztlpn)) -* ODBC driver compatibility with `LowCardinality` data type. [#4381](https://github.com/yandex/ClickHouse/pull/4381) ([proller](https://github.com/proller)) -* FreeBSD: Fixup for `AIOcontextPool: Found io_event with unknown id 0` error. 
[#4438](https://github.com/yandex/ClickHouse/pull/4438) ([urgordeadbeef](https://github.com/urgordeadbeef)) -* `system.part_log` table was created regardless to configuration. [#4483](https://github.com/yandex/ClickHouse/pull/4483) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix undefined behaviour in `dictIsIn` function for cache dictionaries. [#4515](https://github.com/yandex/ClickHouse/pull/4515) ([alesapin](https://github.com/alesapin)) -* Fixed a deadlock when a SELECT query locks the same table multiple times (e.g. from different threads or when executing multiple subqueries) and there is a concurrent DDL query. [#4535](https://github.com/yandex/ClickHouse/pull/4535) ([Alex Zatelepin](https://github.com/ztlpn)) -* Disable compile_expressions by default until we get own `llvm` contrib and can test it with `clang` and `asan`. [#4579](https://github.com/yandex/ClickHouse/pull/4579) ([alesapin](https://github.com/alesapin)) -* Prevent `std::terminate` when `invalidate_query` for `clickhouse` external dictionary source has returned wrong resultset (empty or more than one row or more than one column). Fixed issue when the `invalidate_query` was performed every five seconds regardless to the `lifetime`. [#4583](https://github.com/yandex/ClickHouse/pull/4583) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Avoid deadlock when the `invalidate_query` for a dictionary with `clickhouse` source was involving `system.dictionaries` table or `Dictionaries` database (rare case). [#4599](https://github.com/yandex/ClickHouse/pull/4599) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixes for CROSS JOIN with empty WHERE. [#4598](https://github.com/yandex/ClickHouse/pull/4598) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed segfault in function "replicate" when constant argument is passed. [#4603](https://github.com/yandex/ClickHouse/pull/4603) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix lambda function with predicate optimizer. [#4408](https://github.com/yandex/ClickHouse/pull/4408) ([Winter Zhang](https://github.com/zhang2014)) -* Multiple JOINs multiple fixes. [#4595](https://github.com/yandex/ClickHouse/pull/4595) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed bug in data skipping indices: order of granules after INSERT was incorrect. [#4407](https://github.com/ClickHouse/ClickHouse/pull/4407) ([Nikita Vasilev](https://github.com/nikvas0)) +* Fixed `set` index for `Nullable` and `LowCardinality` columns. Before it, `set` index with `Nullable` or `LowCardinality` column led to error `Data type must be deserialized with multiple streams` while selecting. [#4594](https://github.com/ClickHouse/ClickHouse/pull/4594) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Correctly set update_time on full `executable` dictionary update. [#4551](https://github.com/ClickHouse/ClickHouse/pull/4551) ([Tema Novikov](https://github.com/temoon)) +* Fix broken progress bar in 19.3. [#4627](https://github.com/ClickHouse/ClickHouse/pull/4627) ([filimonov](https://github.com/filimonov)) +* Fixed inconsistent values of MemoryTracker when memory region was shrinked, in certain cases. [#4619](https://github.com/ClickHouse/ClickHouse/pull/4619) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed undefined behaviour in ThreadPool. 
[#4612](https://github.com/ClickHouse/ClickHouse/pull/4612) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed a very rare crash with the message `mutex lock failed: Invalid argument` that could happen when a MergeTree table was dropped concurrently with a SELECT. [#4608](https://github.com/ClickHouse/ClickHouse/pull/4608) ([Alex Zatelepin](https://github.com/ztlpn)) +* ODBC driver compatibility with `LowCardinality` data type. [#4381](https://github.com/ClickHouse/ClickHouse/pull/4381) ([proller](https://github.com/proller)) +* FreeBSD: Fixup for `AIOcontextPool: Found io_event with unknown id 0` error. [#4438](https://github.com/ClickHouse/ClickHouse/pull/4438) ([urgordeadbeef](https://github.com/urgordeadbeef)) +* `system.part_log` table was created regardless to configuration. [#4483](https://github.com/ClickHouse/ClickHouse/pull/4483) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix undefined behaviour in `dictIsIn` function for cache dictionaries. [#4515](https://github.com/ClickHouse/ClickHouse/pull/4515) ([alesapin](https://github.com/alesapin)) +* Fixed a deadlock when a SELECT query locks the same table multiple times (e.g. from different threads or when executing multiple subqueries) and there is a concurrent DDL query. [#4535](https://github.com/ClickHouse/ClickHouse/pull/4535) ([Alex Zatelepin](https://github.com/ztlpn)) +* Disable compile_expressions by default until we get own `llvm` contrib and can test it with `clang` and `asan`. [#4579](https://github.com/ClickHouse/ClickHouse/pull/4579) ([alesapin](https://github.com/alesapin)) +* Prevent `std::terminate` when `invalidate_query` for `clickhouse` external dictionary source has returned wrong resultset (empty or more than one row or more than one column). Fixed issue when the `invalidate_query` was performed every five seconds regardless to the `lifetime`. [#4583](https://github.com/ClickHouse/ClickHouse/pull/4583) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Avoid deadlock when the `invalidate_query` for a dictionary with `clickhouse` source was involving `system.dictionaries` table or `Dictionaries` database (rare case). [#4599](https://github.com/ClickHouse/ClickHouse/pull/4599) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixes for CROSS JOIN with empty WHERE. [#4598](https://github.com/ClickHouse/ClickHouse/pull/4598) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed segfault in function "replicate" when constant argument is passed. [#4603](https://github.com/ClickHouse/ClickHouse/pull/4603) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix lambda function with predicate optimizer. [#4408](https://github.com/ClickHouse/ClickHouse/pull/4408) ([Winter Zhang](https://github.com/zhang2014)) +* Multiple JOINs multiple fixes. [#4595](https://github.com/ClickHouse/ClickHouse/pull/4595) ([Artem Zuikov](https://github.com/4ertus2)) ### Improvements -* Support aliases in JOIN ON section for right table columns. [#4412](https://github.com/yandex/ClickHouse/pull/4412) ([Artem Zuikov](https://github.com/4ertus2)) -* Result of multiple JOINs need correct result names to be used in subselects. Replace flat aliases with source names in result. [#4474](https://github.com/yandex/ClickHouse/pull/4474) ([Artem Zuikov](https://github.com/4ertus2)) -* Improve push-down logic for joined statements. [#4387](https://github.com/yandex/ClickHouse/pull/4387) ([Ivan](https://github.com/abyss7)) +* Support aliases in JOIN ON section for right table columns. 
[#4412](https://github.com/ClickHouse/ClickHouse/pull/4412) ([Artem Zuikov](https://github.com/4ertus2)) +* Result of multiple JOINs need correct result names to be used in subselects. Replace flat aliases with source names in result. [#4474](https://github.com/ClickHouse/ClickHouse/pull/4474) ([Artem Zuikov](https://github.com/4ertus2)) +* Improve push-down logic for joined statements. [#4387](https://github.com/ClickHouse/ClickHouse/pull/4387) ([Ivan](https://github.com/abyss7)) ### Performance Improvements -* Improved heuristics of "move to PREWHERE" optimization. [#4405](https://github.com/yandex/ClickHouse/pull/4405) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Use proper lookup tables that uses HashTable's API for 8-bit and 16-bit keys. [#4536](https://github.com/yandex/ClickHouse/pull/4536) ([Amos Bird](https://github.com/amosbird)) -* Improved performance of string comparison. [#4564](https://github.com/yandex/ClickHouse/pull/4564) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Cleanup distributed DDL queue in a separate thread so that it doesn't slow down the main loop that processes distributed DDL tasks. [#4502](https://github.com/yandex/ClickHouse/pull/4502) ([Alex Zatelepin](https://github.com/ztlpn)) -* When `min_bytes_to_use_direct_io` is set to 1, not every file was opened with O_DIRECT mode because the data size to read was sometimes underestimated by the size of one compressed block. [#4526](https://github.com/yandex/ClickHouse/pull/4526) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Improved heuristics of "move to PREWHERE" optimization. [#4405](https://github.com/ClickHouse/ClickHouse/pull/4405) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Use proper lookup tables that uses HashTable's API for 8-bit and 16-bit keys. [#4536](https://github.com/ClickHouse/ClickHouse/pull/4536) ([Amos Bird](https://github.com/amosbird)) +* Improved performance of string comparison. [#4564](https://github.com/ClickHouse/ClickHouse/pull/4564) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Cleanup distributed DDL queue in a separate thread so that it doesn't slow down the main loop that processes distributed DDL tasks. [#4502](https://github.com/ClickHouse/ClickHouse/pull/4502) ([Alex Zatelepin](https://github.com/ztlpn)) +* When `min_bytes_to_use_direct_io` is set to 1, not every file was opened with O_DIRECT mode because the data size to read was sometimes underestimated by the size of one compressed block. [#4526](https://github.com/ClickHouse/ClickHouse/pull/4526) ([alexey-milovidov](https://github.com/alexey-milovidov)) ### Build/Testing/Packaging Improvement -* Added support for clang-9 [#4604](https://github.com/yandex/ClickHouse/pull/4604) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix wrong `__asm__` instructions (again) [#4621](https://github.com/yandex/ClickHouse/pull/4621) ([Konstantin Podshumok](https://github.com/podshumok)) -* Add ability to specify settings for `clickhouse-performance-test` from command line. [#4437](https://github.com/yandex/ClickHouse/pull/4437) ([alesapin](https://github.com/alesapin)) -* Add dictionaries tests to integration tests. [#4477](https://github.com/yandex/ClickHouse/pull/4477) ([alesapin](https://github.com/alesapin)) -* Added queries from the benchmark on the website to automated performance tests. 
[#4496](https://github.com/yandex/ClickHouse/pull/4496) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `xxhash.h` does not exist in external lz4 because it is an implementation detail and its symbols are namespaced with `XXH_NAMESPACE` macro. When lz4 is external, xxHash has to be external too, and the dependents have to link to it. [#4495](https://github.com/yandex/ClickHouse/pull/4495) ([Orivej Desh](https://github.com/orivej)) -* Fixed a case when `quantileTiming` aggregate function can be called with negative or floating point argument (this fixes fuzz test with undefined behaviour sanitizer). [#4506](https://github.com/yandex/ClickHouse/pull/4506) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Spelling error correction. [#4531](https://github.com/yandex/ClickHouse/pull/4531) ([sdk2](https://github.com/sdk2)) -* Fix compilation on Mac. [#4371](https://github.com/yandex/ClickHouse/pull/4371) ([Vitaly Baranov](https://github.com/vitlibar)) -* Build fixes for FreeBSD and various unusual build configurations. [#4444](https://github.com/yandex/ClickHouse/pull/4444) ([proller](https://github.com/proller)) +* Added support for clang-9 [#4604](https://github.com/ClickHouse/ClickHouse/pull/4604) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix wrong `__asm__` instructions (again) [#4621](https://github.com/ClickHouse/ClickHouse/pull/4621) ([Konstantin Podshumok](https://github.com/podshumok)) +* Add ability to specify settings for `clickhouse-performance-test` from command line. [#4437](https://github.com/ClickHouse/ClickHouse/pull/4437) ([alesapin](https://github.com/alesapin)) +* Add dictionaries tests to integration tests. [#4477](https://github.com/ClickHouse/ClickHouse/pull/4477) ([alesapin](https://github.com/alesapin)) +* Added queries from the benchmark on the website to automated performance tests. [#4496](https://github.com/ClickHouse/ClickHouse/pull/4496) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* `xxhash.h` does not exist in external lz4 because it is an implementation detail and its symbols are namespaced with `XXH_NAMESPACE` macro. When lz4 is external, xxHash has to be external too, and the dependents have to link to it. [#4495](https://github.com/ClickHouse/ClickHouse/pull/4495) ([Orivej Desh](https://github.com/orivej)) +* Fixed a case when `quantileTiming` aggregate function can be called with negative or floating point argument (this fixes fuzz test with undefined behaviour sanitizer). [#4506](https://github.com/ClickHouse/ClickHouse/pull/4506) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Spelling error correction. [#4531](https://github.com/ClickHouse/ClickHouse/pull/4531) ([sdk2](https://github.com/sdk2)) +* Fix compilation on Mac. [#4371](https://github.com/ClickHouse/ClickHouse/pull/4371) ([Vitaly Baranov](https://github.com/vitlibar)) +* Build fixes for FreeBSD and various unusual build configurations. [#4444](https://github.com/ClickHouse/ClickHouse/pull/4444) ([proller](https://github.com/proller)) ## ClickHouse release 19.3.9.1, 2019-04-02 ### Bug Fixes -* Fix crash in `FULL/RIGHT JOIN` when we joining on nullable vs not nullable. [#4855](https://github.com/yandex/ClickHouse/pull/4855) ([Artem Zuikov](https://github.com/4ertus2)) -* Fix segmentation fault in `clickhouse-copier`. [#4835](https://github.com/yandex/ClickHouse/pull/4835) ([proller](https://github.com/proller)) -* Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. 
[#4850](https://github.com/yandex/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Fix crash in `FULL/RIGHT JOIN` when joining on nullable vs not nullable. [#4855](https://github.com/ClickHouse/ClickHouse/pull/4855) ([Artem Zuikov](https://github.com/4ertus2)) +* Fix segmentation fault in `clickhouse-copier`. [#4835](https://github.com/ClickHouse/ClickHouse/pull/4835) ([proller](https://github.com/proller)) +* Fixed reading from `Array(LowCardinality)` column in rare case when column contained a long sequence of empty arrays. [#4850](https://github.com/ClickHouse/ClickHouse/pull/4850) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) ### Build/Testing/Packaging Improvement -* Add a way to launch clickhouse-server image from a custom user [#4753](https://github.com/yandex/ClickHouse/pull/4753) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) +* Add a way to launch clickhouse-server image from a custom user [#4753](https://github.com/ClickHouse/ClickHouse/pull/4753) ([Mikhail f. Shiryaev](https://github.com/Felixoid)) ## ClickHouse release 19.3.7, 2019-03-12 ### Bug fixes -* Fixed error in #3920. This error manifestate itself as random cache corruption (messages `Unknown codec family code`, `Cannot seek through file`) and segfaults. This bug first appeared in version 19.1 and is present in versions up to 19.1.10 and 19.3.6. [#4623](https://github.com/yandex/ClickHouse/pull/4623) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed error in #3920. This error manifests itself as random cache corruption (messages `Unknown codec family code`, `Cannot seek through file`) and segfaults. This bug first appeared in version 19.1 and is present in versions up to 19.1.10 and 19.3.6. [#4623](https://github.com/ClickHouse/ClickHouse/pull/4623) ([alexey-milovidov](https://github.com/alexey-milovidov)) ## ClickHouse release 19.3.6, 2019-03-02 ### Bug fixes -* When there are more than 1000 threads in a thread pool, `std::terminate` may happen on thread exit. 
[Azat Khuzhin](https://github.com/azat) [#4485](https://github.com/ClickHouse/ClickHouse/pull/4485) [#4505](https://github.com/ClickHouse/ClickHouse/pull/4505) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Now it's possible to create `ReplicatedMergeTree*` tables with comments on columns without defaults and tables with columns codecs without comments and defaults. Also fix comparison of codecs. [#4523](https://github.com/ClickHouse/ClickHouse/pull/4523) ([alesapin](https://github.com/alesapin)) +* Fixed crash on JOIN with array or tuple. [#4552](https://github.com/ClickHouse/ClickHouse/pull/4552) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed crash in clickhouse-copier with the message `ThreadStatus not created`. [#4540](https://github.com/ClickHouse/ClickHouse/pull/4540) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed hangup on server shutdown if distributed DDLs were used. [#4472](https://github.com/ClickHouse/ClickHouse/pull/4472) ([Alex Zatelepin](https://github.com/ztlpn)) +* Incorrect column numbers were printed in error message about text format parsing for columns with number greater than 10. [#4484](https://github.com/ClickHouse/ClickHouse/pull/4484) ([alexey-milovidov](https://github.com/alexey-milovidov)) ### Build/Testing/Packaging Improvements -* Fixed build with AVX enabled. [#4527](https://github.com/yandex/ClickHouse/pull/4527) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Enable extended accounting and IO accounting based on good known version instead of kernel under which it is compiled. [#4541](https://github.com/yandex/ClickHouse/pull/4541) ([nvartolomei](https://github.com/nvartolomei)) -* Allow to skip setting of core_dump.size_limit, warning instead of throw if limit set fail. [#4473](https://github.com/yandex/ClickHouse/pull/4473) ([proller](https://github.com/proller)) -* Removed the `inline` tags of `void readBinary(...)` in `Field.cpp`. Also merged redundant `namespace DB` blocks. [#4530](https://github.com/yandex/ClickHouse/pull/4530) ([hcz](https://github.com/hczhcz)) +* Fixed build with AVX enabled. [#4527](https://github.com/ClickHouse/ClickHouse/pull/4527) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Enable extended accounting and IO accounting based on good known version instead of kernel under which it is compiled. [#4541](https://github.com/ClickHouse/ClickHouse/pull/4541) ([nvartolomei](https://github.com/nvartolomei)) +* Allow to skip setting of core_dump.size_limit, warning instead of throw if limit set fail. [#4473](https://github.com/ClickHouse/ClickHouse/pull/4473) ([proller](https://github.com/proller)) +* Removed the `inline` tags of `void readBinary(...)` in `Field.cpp`. Also merged redundant `namespace DB` blocks. [#4530](https://github.com/ClickHouse/ClickHouse/pull/4530) ([hcz](https://github.com/hczhcz)) ## ClickHouse release 19.3.5, 2019-02-21 ### Bug fixes -* Fixed bug with large http insert queries processing. [#4454](https://github.com/yandex/ClickHouse/pull/4454) ([alesapin](https://github.com/alesapin)) -* Fixed backward incompatibility with old versions due to wrong implementation of `send_logs_level` setting. [#4445](https://github.com/yandex/ClickHouse/pull/4445) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed backward incompatibility of table function `remote` introduced with column comments. 
[#4446](https://github.com/yandex/ClickHouse/pull/4446) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed bug with large http insert queries processing. [#4454](https://github.com/ClickHouse/ClickHouse/pull/4454) ([alesapin](https://github.com/alesapin)) +* Fixed backward incompatibility with old versions due to wrong implementation of `send_logs_level` setting. [#4445](https://github.com/ClickHouse/ClickHouse/pull/4445) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed backward incompatibility of table function `remote` introduced with column comments. [#4446](https://github.com/ClickHouse/ClickHouse/pull/4446) ([alexey-milovidov](https://github.com/alexey-milovidov)) ## ClickHouse release 19.3.4, 2019-02-16 ### Improvements -* Table index size is not accounted for memory limits when doing `ATTACH TABLE` query. Avoided the possibility that a table cannot be attached after being detached. [#4396](https://github.com/yandex/ClickHouse/pull/4396) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Slightly raised up the limit on max string and array size received from ZooKeeper. It allows to continue to work with increased size of `CLIENT_JVMFLAGS=-Djute.maxbuffer=...` on ZooKeeper. [#4398](https://github.com/yandex/ClickHouse/pull/4398) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Allow to repair abandoned replica even if it already has huge number of nodes in its queue. [#4399](https://github.com/yandex/ClickHouse/pull/4399) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Add one required argument to `SET` index (max stored rows number). [#4386](https://github.com/yandex/ClickHouse/pull/4386) ([Nikita Vasilev](https://github.com/nikvas0)) +* Table index size is not accounted for memory limits when doing `ATTACH TABLE` query. Avoided the possibility that a table cannot be attached after being detached. [#4396](https://github.com/ClickHouse/ClickHouse/pull/4396) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Slightly raised up the limit on max string and array size received from ZooKeeper. It allows to continue to work with increased size of `CLIENT_JVMFLAGS=-Djute.maxbuffer=...` on ZooKeeper. [#4398](https://github.com/ClickHouse/ClickHouse/pull/4398) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Allow to repair abandoned replica even if it already has huge number of nodes in its queue. [#4399](https://github.com/ClickHouse/ClickHouse/pull/4399) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Add one required argument to `SET` index (max stored rows number). [#4386](https://github.com/ClickHouse/ClickHouse/pull/4386) ([Nikita Vasilev](https://github.com/nikvas0)) ### Bug Fixes -* Fixed `WITH ROLLUP` result for group by single `LowCardinality` key. [#4384](https://github.com/yandex/ClickHouse/pull/4384) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Fixed bug in the set index (dropping a granule if it contains more than `max_rows` rows). [#4386](https://github.com/yandex/ClickHouse/pull/4386) ([Nikita Vasilev](https://github.com/nikvas0)) -* A lot of FreeBSD build fixes. [#4397](https://github.com/yandex/ClickHouse/pull/4397) ([proller](https://github.com/proller)) -* Fixed aliases substitution in queries with subquery containing same alias (issue [#4110](https://github.com/yandex/ClickHouse/issues/4110)). 
[#4351](https://github.com/yandex/ClickHouse/pull/4351) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed `WITH ROLLUP` result for group by single `LowCardinality` key. [#4384](https://github.com/ClickHouse/ClickHouse/pull/4384) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Fixed bug in the set index (dropping a granule if it contains more than `max_rows` rows). [#4386](https://github.com/ClickHouse/ClickHouse/pull/4386) ([Nikita Vasilev](https://github.com/nikvas0)) +* A lot of FreeBSD build fixes. [#4397](https://github.com/ClickHouse/ClickHouse/pull/4397) ([proller](https://github.com/proller)) +* Fixed aliases substitution in queries with subquery containing same alias (issue [#4110](https://github.com/ClickHouse/ClickHouse/issues/4110)). [#4351](https://github.com/ClickHouse/ClickHouse/pull/4351) ([Artem Zuikov](https://github.com/4ertus2)) ### Build/Testing/Packaging Improvements -* Add ability to run `clickhouse-server` for stateless tests in docker image. [#4347](https://github.com/yandex/ClickHouse/pull/4347) ([Vasily Nemkov](https://github.com/Enmk)) +* Add ability to run `clickhouse-server` for stateless tests in docker image. [#4347](https://github.com/ClickHouse/ClickHouse/pull/4347) ([Vasily Nemkov](https://github.com/Enmk)) ## ClickHouse release 19.3.3, 2019-02-13 ### New Features -* Added the `KILL MUTATION` statement that allows removing mutations that are for some reasons stuck. Added `latest_failed_part`, `latest_fail_time`, `latest_fail_reason` fields to the `system.mutations` table for easier troubleshooting. [#4287](https://github.com/yandex/ClickHouse/pull/4287) ([Alex Zatelepin](https://github.com/ztlpn)) -* Added aggregate function `entropy` which computes Shannon entropy. [#4238](https://github.com/yandex/ClickHouse/pull/4238) ([Quid37](https://github.com/Quid37)) -* Added ability to send queries `INSERT INTO tbl VALUES (....` to server without splitting on `query` and `data` parts. [#4301](https://github.com/yandex/ClickHouse/pull/4301) ([alesapin](https://github.com/alesapin)) -* Generic implementation of `arrayWithConstant` function was added. [#4322](https://github.com/yandex/ClickHouse/pull/4322) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Implemented `NOT BETWEEN` comparison operator. [#4228](https://github.com/yandex/ClickHouse/pull/4228) ([Dmitry Naumov](https://github.com/nezed)) -* Implement `sumMapFiltered` in order to be able to limit the number of keys for which values will be summed by `sumMap`. [#4129](https://github.com/yandex/ClickHouse/pull/4129) ([Léo Ercolanelli](https://github.com/ercolanelli-leo)) -* Added support of `Nullable` types in `mysql` table function. [#4198](https://github.com/yandex/ClickHouse/pull/4198) ([Emmanuel Donin de Rosière](https://github.com/edonin)) -* Support for arbitrary constant expressions in `LIMIT` clause. [#4246](https://github.com/yandex/ClickHouse/pull/4246) ([k3box](https://github.com/k3box)) -* Added `topKWeighted` aggregate function that takes additional argument with (unsigned integer) weight. [#4245](https://github.com/yandex/ClickHouse/pull/4245) ([Andrew Golman](https://github.com/andrewgolman)) -* `StorageJoin` now supports `join_any_take_last_row` setting that allows overwriting existing values of the same key. [#3973](https://github.com/yandex/ClickHouse/pull/3973) ([Amos Bird](https://github.com/amosbird) -* Added function `toStartOfInterval`. 
[#4304](https://github.com/yandex/ClickHouse/pull/4304) ([Vitaly Baranov](https://github.com/vitlibar)) -* Added `RowBinaryWithNamesAndTypes` format. [#4200](https://github.com/yandex/ClickHouse/pull/4200) ([Oleg V. Kozlyuk](https://github.com/DarkWanderer)) -* Added `IPv4` and `IPv6` data types. More effective implementations of `IPv*` functions. [#3669](https://github.com/yandex/ClickHouse/pull/3669) ([Vasily Nemkov](https://github.com/Enmk)) -* Added function `toStartOfTenMinutes()`. [#4298](https://github.com/yandex/ClickHouse/pull/4298) ([Vitaly Baranov](https://github.com/vitlibar)) -* Added `Protobuf` output format. [#4005](https://github.com/yandex/ClickHouse/pull/4005) [#4158](https://github.com/yandex/ClickHouse/pull/4158) ([Vitaly Baranov](https://github.com/vitlibar)) -* Added brotli support for HTTP interface for data import (INSERTs). [#4235](https://github.com/yandex/ClickHouse/pull/4235) ([Mikhail ](https://github.com/fandyushin)) -* Added hints while user make typo in function name or type in command line client. [#4239](https://github.com/yandex/ClickHouse/pull/4239) ([Danila Kutenin](https://github.com/danlark1)) -* Added `Query-Id` to Server's HTTP Response header. [#4231](https://github.com/yandex/ClickHouse/pull/4231) ([Mikhail ](https://github.com/fandyushin)) +* Added the `KILL MUTATION` statement that allows removing mutations that are for some reasons stuck. Added `latest_failed_part`, `latest_fail_time`, `latest_fail_reason` fields to the `system.mutations` table for easier troubleshooting. [#4287](https://github.com/ClickHouse/ClickHouse/pull/4287) ([Alex Zatelepin](https://github.com/ztlpn)) +* Added aggregate function `entropy` which computes Shannon entropy. [#4238](https://github.com/ClickHouse/ClickHouse/pull/4238) ([Quid37](https://github.com/Quid37)) +* Added ability to send queries `INSERT INTO tbl VALUES (....` to server without splitting on `query` and `data` parts. [#4301](https://github.com/ClickHouse/ClickHouse/pull/4301) ([alesapin](https://github.com/alesapin)) +* Generic implementation of `arrayWithConstant` function was added. [#4322](https://github.com/ClickHouse/ClickHouse/pull/4322) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Implemented `NOT BETWEEN` comparison operator. [#4228](https://github.com/ClickHouse/ClickHouse/pull/4228) ([Dmitry Naumov](https://github.com/nezed)) +* Implement `sumMapFiltered` in order to be able to limit the number of keys for which values will be summed by `sumMap`. [#4129](https://github.com/ClickHouse/ClickHouse/pull/4129) ([Léo Ercolanelli](https://github.com/ercolanelli-leo)) +* Added support of `Nullable` types in `mysql` table function. [#4198](https://github.com/ClickHouse/ClickHouse/pull/4198) ([Emmanuel Donin de Rosière](https://github.com/edonin)) +* Support for arbitrary constant expressions in `LIMIT` clause. [#4246](https://github.com/ClickHouse/ClickHouse/pull/4246) ([k3box](https://github.com/k3box)) +* Added `topKWeighted` aggregate function that takes additional argument with (unsigned integer) weight. [#4245](https://github.com/ClickHouse/ClickHouse/pull/4245) ([Andrew Golman](https://github.com/andrewgolman)) +* `StorageJoin` now supports `join_any_take_last_row` setting that allows overwriting existing values of the same key. [#3973](https://github.com/ClickHouse/ClickHouse/pull/3973) ([Amos Bird](https://github.com/amosbird) +* Added function `toStartOfInterval`. 
[#4304](https://github.com/ClickHouse/ClickHouse/pull/4304) ([Vitaly Baranov](https://github.com/vitlibar)) +* Added `RowBinaryWithNamesAndTypes` format. [#4200](https://github.com/ClickHouse/ClickHouse/pull/4200) ([Oleg V. Kozlyuk](https://github.com/DarkWanderer)) +* Added `IPv4` and `IPv6` data types. More effective implementations of `IPv*` functions. [#3669](https://github.com/ClickHouse/ClickHouse/pull/3669) ([Vasily Nemkov](https://github.com/Enmk)) +* Added function `toStartOfTenMinutes()`. [#4298](https://github.com/ClickHouse/ClickHouse/pull/4298) ([Vitaly Baranov](https://github.com/vitlibar)) +* Added `Protobuf` output format. [#4005](https://github.com/ClickHouse/ClickHouse/pull/4005) [#4158](https://github.com/ClickHouse/ClickHouse/pull/4158) ([Vitaly Baranov](https://github.com/vitlibar)) +* Added brotli support for HTTP interface for data import (INSERTs). [#4235](https://github.com/ClickHouse/ClickHouse/pull/4235) ([Mikhail ](https://github.com/fandyushin)) +* Added hints while user make typo in function name or type in command line client. [#4239](https://github.com/ClickHouse/ClickHouse/pull/4239) ([Danila Kutenin](https://github.com/danlark1)) +* Added `Query-Id` to Server's HTTP Response header. [#4231](https://github.com/ClickHouse/ClickHouse/pull/4231) ([Mikhail ](https://github.com/fandyushin)) ### Experimental features -* Added `minmax` and `set` data skipping indices for MergeTree table engines family. [#4143](https://github.com/yandex/ClickHouse/pull/4143) ([Nikita Vasilev](https://github.com/nikvas0)) -* Added conversion of `CROSS JOIN` to `INNER JOIN` if possible. [#4221](https://github.com/yandex/ClickHouse/pull/4221) [#4266](https://github.com/yandex/ClickHouse/pull/4266) ([Artem Zuikov](https://github.com/4ertus2)) +* Added `minmax` and `set` data skipping indices for MergeTree table engines family. [#4143](https://github.com/ClickHouse/ClickHouse/pull/4143) ([Nikita Vasilev](https://github.com/nikvas0)) +* Added conversion of `CROSS JOIN` to `INNER JOIN` if possible. [#4221](https://github.com/ClickHouse/ClickHouse/pull/4221) [#4266](https://github.com/ClickHouse/ClickHouse/pull/4266) ([Artem Zuikov](https://github.com/4ertus2)) ### Bug Fixes -* Fixed `Not found column` for duplicate columns in `JOIN ON` section. [#4279](https://github.com/yandex/ClickHouse/pull/4279) ([Artem Zuikov](https://github.com/4ertus2)) -* Make `START REPLICATED SENDS` command start replicated sends. [#4229](https://github.com/yandex/ClickHouse/pull/4229) ([nvartolomei](https://github.com/nvartolomei)) -* Fixed aggregate functions execution with `Array(LowCardinality)` arguments. [#4055](https://github.com/yandex/ClickHouse/pull/4055) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Fixed wrong behaviour when doing `INSERT ... SELECT ... FROM file(...)` query and file has `CSVWithNames` or `TSVWIthNames` format and the first data row is missing. [#4297](https://github.com/yandex/ClickHouse/pull/4297) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed crash on dictionary reload if dictionary not available. This bug was appeared in 19.1.6. [#4188](https://github.com/yandex/ClickHouse/pull/4188) ([proller](https://github.com/proller)) -* Fixed `ALL JOIN` with duplicates in right table. [#4184](https://github.com/yandex/ClickHouse/pull/4184) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed segmentation fault with `use_uncompressed_cache=1` and exception with wrong uncompressed size. This bug was appeared in 19.1.6. 
[#4186](https://github.com/yandex/ClickHouse/pull/4186) ([alesapin](https://github.com/alesapin)) -* Fixed `compile_expressions` bug with comparison of big (more than int16) dates. [#4341](https://github.com/yandex/ClickHouse/pull/4341) ([alesapin](https://github.com/alesapin)) -* Fixed infinite loop when selecting from table function `numbers(0)`. [#4280](https://github.com/yandex/ClickHouse/pull/4280) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Temporarily disable predicate optimization for `ORDER BY`. [#3890](https://github.com/yandex/ClickHouse/pull/3890) ([Winter Zhang](https://github.com/zhang2014)) -* Fixed `Illegal instruction` error when using base64 functions on old CPUs. This error has been reproduced only when ClickHouse was compiled with gcc-8. [#4275](https://github.com/yandex/ClickHouse/pull/4275) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed `No message received` error when interacting with PostgreSQL ODBC Driver through TLS connection. Also fixes segfault when using MySQL ODBC Driver. [#4170](https://github.com/yandex/ClickHouse/pull/4170) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed incorrect result when `Date` and `DateTime` arguments are used in branches of conditional operator (function `if`). Added generic case for function `if`. [#4243](https://github.com/yandex/ClickHouse/pull/4243) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* ClickHouse dictionaries now load within `clickhouse` process. [#4166](https://github.com/yandex/ClickHouse/pull/4166) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed deadlock when `SELECT` from a table with `File` engine was retried after `No such file or directory` error. [#4161](https://github.com/yandex/ClickHouse/pull/4161) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed race condition when selecting from `system.tables` may give `table doesn't exist` error. [#4313](https://github.com/yandex/ClickHouse/pull/4313) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `clickhouse-client` can segfault on exit while loading data for command line suggestions if it was run in interactive mode. [#4317](https://github.com/yandex/ClickHouse/pull/4317) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed a bug when the execution of mutations containing `IN` operators was producing incorrect results. [#4099](https://github.com/yandex/ClickHouse/pull/4099) ([Alex Zatelepin](https://github.com/ztlpn)) -* Fixed error: if there is a database with `Dictionary` engine, all dictionaries forced to load at server startup, and if there is a dictionary with ClickHouse source from localhost, the dictionary cannot load. [#4255](https://github.com/yandex/ClickHouse/pull/4255) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed error when system logs are tried to create again at server shutdown. [#4254](https://github.com/yandex/ClickHouse/pull/4254) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Correctly return the right type and properly handle locks in `joinGet` function. [#4153](https://github.com/yandex/ClickHouse/pull/4153) ([Amos Bird](https://github.com/amosbird)) -* Added `sumMapWithOverflow` function. [#4151](https://github.com/yandex/ClickHouse/pull/4151) ([Léo Ercolanelli](https://github.com/ercolanelli-leo)) -* Fixed segfault with `allow_experimental_multiple_joins_emulation`. 
[52de2c](https://github.com/yandex/ClickHouse/commit/52de2cd927f7b5257dd67e175f0a5560a48840d0) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed bug with incorrect `Date` and `DateTime` comparison. [#4237](https://github.com/yandex/ClickHouse/pull/4237) ([valexey](https://github.com/valexey)) -* Fixed fuzz test under undefined behavior sanitizer: added parameter type check for `quantile*Weighted` family of functions. [#4145](https://github.com/yandex/ClickHouse/pull/4145) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed rare race condition when removing of old data parts can fail with `File not found` error. [#4378](https://github.com/yandex/ClickHouse/pull/4378) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix install package with missing /etc/clickhouse-server/config.xml. [#4343](https://github.com/yandex/ClickHouse/pull/4343) ([proller](https://github.com/proller)) +* Fixed `Not found column` for duplicate columns in `JOIN ON` section. [#4279](https://github.com/ClickHouse/ClickHouse/pull/4279) ([Artem Zuikov](https://github.com/4ertus2)) +* Make `START REPLICATED SENDS` command start replicated sends. [#4229](https://github.com/ClickHouse/ClickHouse/pull/4229) ([nvartolomei](https://github.com/nvartolomei)) +* Fixed aggregate functions execution with `Array(LowCardinality)` arguments. [#4055](https://github.com/ClickHouse/ClickHouse/pull/4055) ([KochetovNicolai](https://github.com/KochetovNicolai)) +* Fixed wrong behaviour when doing `INSERT ... SELECT ... FROM file(...)` query and file has `CSVWithNames` or `TSVWIthNames` format and the first data row is missing. [#4297](https://github.com/ClickHouse/ClickHouse/pull/4297) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed crash on dictionary reload if dictionary not available. This bug was appeared in 19.1.6. [#4188](https://github.com/ClickHouse/ClickHouse/pull/4188) ([proller](https://github.com/proller)) +* Fixed `ALL JOIN` with duplicates in right table. [#4184](https://github.com/ClickHouse/ClickHouse/pull/4184) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed segmentation fault with `use_uncompressed_cache=1` and exception with wrong uncompressed size. This bug was appeared in 19.1.6. [#4186](https://github.com/ClickHouse/ClickHouse/pull/4186) ([alesapin](https://github.com/alesapin)) +* Fixed `compile_expressions` bug with comparison of big (more than int16) dates. [#4341](https://github.com/ClickHouse/ClickHouse/pull/4341) ([alesapin](https://github.com/alesapin)) +* Fixed infinite loop when selecting from table function `numbers(0)`. [#4280](https://github.com/ClickHouse/ClickHouse/pull/4280) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Temporarily disable predicate optimization for `ORDER BY`. [#3890](https://github.com/ClickHouse/ClickHouse/pull/3890) ([Winter Zhang](https://github.com/zhang2014)) +* Fixed `Illegal instruction` error when using base64 functions on old CPUs. This error has been reproduced only when ClickHouse was compiled with gcc-8. [#4275](https://github.com/ClickHouse/ClickHouse/pull/4275) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed `No message received` error when interacting with PostgreSQL ODBC Driver through TLS connection. Also fixes segfault when using MySQL ODBC Driver. 
[#4170](https://github.com/ClickHouse/ClickHouse/pull/4170) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed incorrect result when `Date` and `DateTime` arguments are used in branches of conditional operator (function `if`). Added generic case for function `if`. [#4243](https://github.com/ClickHouse/ClickHouse/pull/4243) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* ClickHouse dictionaries now load within `clickhouse` process. [#4166](https://github.com/ClickHouse/ClickHouse/pull/4166) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed deadlock when `SELECT` from a table with `File` engine was retried after `No such file or directory` error. [#4161](https://github.com/ClickHouse/ClickHouse/pull/4161) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed race condition when selecting from `system.tables` may give `table doesn't exist` error. [#4313](https://github.com/ClickHouse/ClickHouse/pull/4313) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* `clickhouse-client` can segfault on exit while loading data for command line suggestions if it was run in interactive mode. [#4317](https://github.com/ClickHouse/ClickHouse/pull/4317) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed a bug when the execution of mutations containing `IN` operators was producing incorrect results. [#4099](https://github.com/ClickHouse/ClickHouse/pull/4099) ([Alex Zatelepin](https://github.com/ztlpn)) +* Fixed error: if there is a database with `Dictionary` engine, all dictionaries forced to load at server startup, and if there is a dictionary with ClickHouse source from localhost, the dictionary cannot load. [#4255](https://github.com/ClickHouse/ClickHouse/pull/4255) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed error when system logs are tried to create again at server shutdown. [#4254](https://github.com/ClickHouse/ClickHouse/pull/4254) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Correctly return the right type and properly handle locks in `joinGet` function. [#4153](https://github.com/ClickHouse/ClickHouse/pull/4153) ([Amos Bird](https://github.com/amosbird)) +* Added `sumMapWithOverflow` function. [#4151](https://github.com/ClickHouse/ClickHouse/pull/4151) ([Léo Ercolanelli](https://github.com/ercolanelli-leo)) +* Fixed segfault with `allow_experimental_multiple_joins_emulation`. [52de2c](https://github.com/ClickHouse/ClickHouse/commit/52de2cd927f7b5257dd67e175f0a5560a48840d0) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed bug with incorrect `Date` and `DateTime` comparison. [#4237](https://github.com/ClickHouse/ClickHouse/pull/4237) ([valexey](https://github.com/valexey)) +* Fixed fuzz test under undefined behavior sanitizer: added parameter type check for `quantile*Weighted` family of functions. [#4145](https://github.com/ClickHouse/ClickHouse/pull/4145) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed rare race condition when removing of old data parts can fail with `File not found` error. [#4378](https://github.com/ClickHouse/ClickHouse/pull/4378) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix install package with missing /etc/clickhouse-server/config.xml. [#4343](https://github.com/ClickHouse/ClickHouse/pull/4343) ([proller](https://github.com/proller)) ### Build/Testing/Packaging Improvements -* Debian package: correct /etc/clickhouse-server/preprocessed link according to config. 
[#4205](https://github.com/yandex/ClickHouse/pull/4205) ([proller](https://github.com/proller)) -* Various build fixes for FreeBSD. [#4225](https://github.com/yandex/ClickHouse/pull/4225) ([proller](https://github.com/proller)) -* Added ability to create, fill and drop tables in perftest. [#4220](https://github.com/yandex/ClickHouse/pull/4220) ([alesapin](https://github.com/alesapin)) -* Added a script to check for duplicate includes. [#4326](https://github.com/yandex/ClickHouse/pull/4326) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added ability to run queries by index in performance test. [#4264](https://github.com/yandex/ClickHouse/pull/4264) ([alesapin](https://github.com/alesapin)) -* Package with debug symbols is suggested to be installed. [#4274](https://github.com/yandex/ClickHouse/pull/4274) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Refactoring of performance-test. Better logging and signals handling. [#4171](https://github.com/yandex/ClickHouse/pull/4171) ([alesapin](https://github.com/alesapin)) -* Added docs to anonymized Yandex.Metrika datasets. [#4164](https://github.com/yandex/ClickHouse/pull/4164) ([alesapin](https://github.com/alesapin)) -* Аdded tool for converting an old month-partitioned part to the custom-partitioned format. [#4195](https://github.com/yandex/ClickHouse/pull/4195) ([Alex Zatelepin](https://github.com/ztlpn)) -* Added docs about two datasets in s3. [#4144](https://github.com/yandex/ClickHouse/pull/4144) ([alesapin](https://github.com/alesapin)) -* Added script which creates changelog from pull requests description. [#4169](https://github.com/yandex/ClickHouse/pull/4169) [#4173](https://github.com/yandex/ClickHouse/pull/4173) ([KochetovNicolai](https://github.com/KochetovNicolai)) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Added puppet module for Clickhouse. [#4182](https://github.com/yandex/ClickHouse/pull/4182) ([Maxim Fedotov](https://github.com/MaxFedotov)) -* Added docs for a group of undocumented functions. [#4168](https://github.com/yandex/ClickHouse/pull/4168) ([Winter Zhang](https://github.com/zhang2014)) -* ARM build fixes. [#4210](https://github.com/yandex/ClickHouse/pull/4210)[#4306](https://github.com/yandex/ClickHouse/pull/4306) [#4291](https://github.com/yandex/ClickHouse/pull/4291) ([proller](https://github.com/proller)) ([proller](https://github.com/proller)) -* Dictionary tests now able to run from `ctest`. [#4189](https://github.com/yandex/ClickHouse/pull/4189) ([proller](https://github.com/proller)) -* Now `/etc/ssl` is used as default directory with SSL certificates. [#4167](https://github.com/yandex/ClickHouse/pull/4167) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added checking SSE and AVX instruction at start. [#4234](https://github.com/yandex/ClickHouse/pull/4234) ([Igr](https://github.com/igron99)) -* Init script will wait server until start. [#4281](https://github.com/yandex/ClickHouse/pull/4281) ([proller](https://github.com/proller)) +* Debian package: correct /etc/clickhouse-server/preprocessed link according to config. [#4205](https://github.com/ClickHouse/ClickHouse/pull/4205) ([proller](https://github.com/proller)) +* Various build fixes for FreeBSD. [#4225](https://github.com/ClickHouse/ClickHouse/pull/4225) ([proller](https://github.com/proller)) +* Added ability to create, fill and drop tables in perftest. 
[#4220](https://github.com/ClickHouse/ClickHouse/pull/4220) ([alesapin](https://github.com/alesapin)) +* Added a script to check for duplicate includes. [#4326](https://github.com/ClickHouse/ClickHouse/pull/4326) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added ability to run queries by index in performance test. [#4264](https://github.com/ClickHouse/ClickHouse/pull/4264) ([alesapin](https://github.com/alesapin)) +* Package with debug symbols is suggested to be installed. [#4274](https://github.com/ClickHouse/ClickHouse/pull/4274) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Refactoring of performance-test. Better logging and signals handling. [#4171](https://github.com/ClickHouse/ClickHouse/pull/4171) ([alesapin](https://github.com/alesapin)) +* Added docs for anonymized Yandex.Metrika datasets. [#4164](https://github.com/ClickHouse/ClickHouse/pull/4164) ([alesapin](https://github.com/alesapin)) +* Added tool for converting an old month-partitioned part to the custom-partitioned format. [#4195](https://github.com/ClickHouse/ClickHouse/pull/4195) ([Alex Zatelepin](https://github.com/ztlpn)) +* Added docs about two datasets in s3. [#4144](https://github.com/ClickHouse/ClickHouse/pull/4144) ([alesapin](https://github.com/alesapin)) +* Added script which creates changelog from pull requests description. [#4169](https://github.com/ClickHouse/ClickHouse/pull/4169) [#4173](https://github.com/ClickHouse/ClickHouse/pull/4173) ([KochetovNicolai](https://github.com/KochetovNicolai)) ([KochetovNicolai](https://github.com/KochetovNicolai)) +* Added puppet module for ClickHouse. [#4182](https://github.com/ClickHouse/ClickHouse/pull/4182) ([Maxim Fedotov](https://github.com/MaxFedotov)) +* Added docs for a group of undocumented functions. [#4168](https://github.com/ClickHouse/ClickHouse/pull/4168) ([Winter Zhang](https://github.com/zhang2014)) +* ARM build fixes. [#4210](https://github.com/ClickHouse/ClickHouse/pull/4210)[#4306](https://github.com/ClickHouse/ClickHouse/pull/4306) [#4291](https://github.com/ClickHouse/ClickHouse/pull/4291) ([proller](https://github.com/proller)) ([proller](https://github.com/proller)) +* Dictionary tests are now able to run from `ctest`. [#4189](https://github.com/ClickHouse/ClickHouse/pull/4189) ([proller](https://github.com/proller)) +* Now `/etc/ssl` is used as the default directory with SSL certificates. [#4167](https://github.com/ClickHouse/ClickHouse/pull/4167) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added checking of SSE and AVX instructions at start. [#4234](https://github.com/ClickHouse/ClickHouse/pull/4234) ([Igr](https://github.com/igron99)) +* Init script will wait for the server to start. [#4281](https://github.com/ClickHouse/ClickHouse/pull/4281) ([proller](https://github.com/proller)) ### Backward Incompatible Changes -* Removed `allow_experimental_low_cardinality_type` setting. `LowCardinality` data types are production ready. [#4323](https://github.com/yandex/ClickHouse/pull/4323) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Reduce mark cache size and uncompressed cache size accordingly to available memory amount. [#4240](https://github.com/yandex/ClickHouse/pull/4240) ([Lopatin Konstantin](https://github.com/k-lopatin) -* Added keyword `INDEX` in `CREATE TABLE` query. A column with name `index` must be quoted with backticks or double quotes: `` `index` ``. 
[#4143](https://github.com/yandex/ClickHouse/pull/4143) ([Nikita Vasilev](https://github.com/nikvas0)) -* `sumMap` now promote result type instead of overflow. The old `sumMap` behavior can be obtained by using `sumMapWithOverflow` function. [#4151](https://github.com/yandex/ClickHouse/pull/4151) ([Léo Ercolanelli](https://github.com/ercolanelli-leo)) +* Removed `allow_experimental_low_cardinality_type` setting. `LowCardinality` data types are production ready. [#4323](https://github.com/ClickHouse/ClickHouse/pull/4323) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Reduce mark cache size and uncompressed cache size according to the available memory amount. [#4240](https://github.com/ClickHouse/ClickHouse/pull/4240) ([Lopatin Konstantin](https://github.com/k-lopatin)) +* Added keyword `INDEX` in `CREATE TABLE` query. A column with name `index` must be quoted with backticks or double quotes: `` `index` ``. [#4143](https://github.com/ClickHouse/ClickHouse/pull/4143) ([Nikita Vasilev](https://github.com/nikvas0)) +* `sumMap` now promotes the result type instead of overflowing. The old `sumMap` behavior can be obtained by using the `sumMapWithOverflow` function. [#4151](https://github.com/ClickHouse/ClickHouse/pull/4151) ([Léo Ercolanelli](https://github.com/ercolanelli-leo)) ### Performance Improvements -* `std::sort` replaced by `pdqsort` for queries without `LIMIT`. [#4236](https://github.com/yandex/ClickHouse/pull/4236) ([Evgenii Pravda](https://github.com/kvinty)) -* Now server reuse threads from global thread pool. This affects performance in some corner cases. [#4150](https://github.com/yandex/ClickHouse/pull/4150) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* `std::sort` replaced by `pdqsort` for queries without `LIMIT`. [#4236](https://github.com/ClickHouse/ClickHouse/pull/4236) ([Evgenii Pravda](https://github.com/kvinty)) +* Now the server reuses threads from the global thread pool. This affects performance in some corner cases. [#4150](https://github.com/ClickHouse/ClickHouse/pull/4150) ([alexey-milovidov](https://github.com/alexey-milovidov)) ### Improvements -* Implemented AIO support for FreeBSD. [#4305](https://github.com/yandex/ClickHouse/pull/4305) ([urgordeadbeef](https://github.com/urgordeadbeef)) -* `SELECT * FROM a JOIN b USING a, b` now return `a` and `b` columns only from the left table. [#4141](https://github.com/yandex/ClickHouse/pull/4141) ([Artem Zuikov](https://github.com/4ertus2)) -* Allow `-C` option of client to work as `-c` option. [#4232](https://github.com/yandex/ClickHouse/pull/4232) ([syominsergey](https://github.com/syominsergey)) -* Now option `--password` used without value requires password from stdin. [#4230](https://github.com/yandex/ClickHouse/pull/4230) ([BSD_Conqueror](https://github.com/bsd-conqueror)) -* Added highlighting of unescaped metacharacters in string literals that contain `LIKE` expressions or regexps. [#4327](https://github.com/yandex/ClickHouse/pull/4327) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added cancelling of HTTP read only queries if client socket goes away. [#4213](https://github.com/yandex/ClickHouse/pull/4213) ([nvartolomei](https://github.com/nvartolomei)) -* Now server reports progress to keep client connections alive. [#4215](https://github.com/yandex/ClickHouse/pull/4215) ([Ivan](https://github.com/abyss7)) -* Slightly better message with reason for OPTIMIZE query with `optimize_throw_if_noop` setting enabled. 
[#4294](https://github.com/yandex/ClickHouse/pull/4294) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added support of `--version` option for clickhouse server. [#4251](https://github.com/yandex/ClickHouse/pull/4251) ([Lopatin Konstantin](https://github.com/k-lopatin)) -* Added `--help/-h` option to `clickhouse-server`. [#4233](https://github.com/yandex/ClickHouse/pull/4233) ([Yuriy Baranov](https://github.com/yurriy)) -* Added support for scalar subqueries with aggregate function state result. [#4348](https://github.com/yandex/ClickHouse/pull/4348) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) -* Improved server shutdown time and ALTERs waiting time. [#4372](https://github.com/yandex/ClickHouse/pull/4372) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added info about the replicated_can_become_leader setting to system.replicas and add logging if the replica won't try to become leader. [#4379](https://github.com/yandex/ClickHouse/pull/4379) ([Alex Zatelepin](https://github.com/ztlpn)) +* Implemented AIO support for FreeBSD. [#4305](https://github.com/ClickHouse/ClickHouse/pull/4305) ([urgordeadbeef](https://github.com/urgordeadbeef)) +* `SELECT * FROM a JOIN b USING a, b` now return `a` and `b` columns only from the left table. [#4141](https://github.com/ClickHouse/ClickHouse/pull/4141) ([Artem Zuikov](https://github.com/4ertus2)) +* Allow `-C` option of client to work as `-c` option. [#4232](https://github.com/ClickHouse/ClickHouse/pull/4232) ([syominsergey](https://github.com/syominsergey)) +* Now option `--password` used without value requires password from stdin. [#4230](https://github.com/ClickHouse/ClickHouse/pull/4230) ([BSD_Conqueror](https://github.com/bsd-conqueror)) +* Added highlighting of unescaped metacharacters in string literals that contain `LIKE` expressions or regexps. [#4327](https://github.com/ClickHouse/ClickHouse/pull/4327) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added cancelling of HTTP read only queries if client socket goes away. [#4213](https://github.com/ClickHouse/ClickHouse/pull/4213) ([nvartolomei](https://github.com/nvartolomei)) +* Now server reports progress to keep client connections alive. [#4215](https://github.com/ClickHouse/ClickHouse/pull/4215) ([Ivan](https://github.com/abyss7)) +* Slightly better message with reason for OPTIMIZE query with `optimize_throw_if_noop` setting enabled. [#4294](https://github.com/ClickHouse/ClickHouse/pull/4294) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added support of `--version` option for clickhouse server. [#4251](https://github.com/ClickHouse/ClickHouse/pull/4251) ([Lopatin Konstantin](https://github.com/k-lopatin)) +* Added `--help/-h` option to `clickhouse-server`. [#4233](https://github.com/ClickHouse/ClickHouse/pull/4233) ([Yuriy Baranov](https://github.com/yurriy)) +* Added support for scalar subqueries with aggregate function state result. [#4348](https://github.com/ClickHouse/ClickHouse/pull/4348) ([Nikolai Kochetov](https://github.com/KochetovNicolai)) +* Improved server shutdown time and ALTERs waiting time. [#4372](https://github.com/ClickHouse/ClickHouse/pull/4372) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added info about the replicated_can_become_leader setting to system.replicas and add logging if the replica won't try to become leader. 
[#4379](https://github.com/ClickHouse/ClickHouse/pull/4379) ([Alex Zatelepin](https://github.com/ztlpn)) ## ClickHouse release 19.1.14, 2019-03-14 -* Fixed error `Column ... queried more than once` that may happen if the setting `asterisk_left_columns_only` is set to 1 in case of using `GLOBAL JOIN` with `SELECT *` (rare case). The issue does not exist in 19.3 and newer. [6bac7d8d](https://github.com/yandex/ClickHouse/pull/4692/commits/6bac7d8d11a9b0d6de0b32b53c47eb2f6f8e7062) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed error `Column ... queried more than once` that may happen if the setting `asterisk_left_columns_only` is set to 1 in case of using `GLOBAL JOIN` with `SELECT *` (rare case). The issue does not exist in 19.3 and newer. [6bac7d8d](https://github.com/ClickHouse/ClickHouse/pull/4692/commits/6bac7d8d11a9b0d6de0b32b53c47eb2f6f8e7062) ([Artem Zuikov](https://github.com/4ertus2)) ## ClickHouse release 19.1.13, 2019-03-12 @@ -1278,164 +1278,164 @@ This release contains exactly the same set of patches as 19.3.6. ## ClickHouse release 19.1.9, 2019-02-21 ### Bug fixes -* Fixed backward incompatibility with old versions due to wrong implementation of `send_logs_level` setting. [#4445](https://github.com/yandex/ClickHouse/pull/4445) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed backward incompatibility of table function `remote` introduced with column comments. [#4446](https://github.com/yandex/ClickHouse/pull/4446) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed backward incompatibility with old versions due to wrong implementation of `send_logs_level` setting. [#4445](https://github.com/ClickHouse/ClickHouse/pull/4445) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed backward incompatibility of table function `remote` introduced with column comments. [#4446](https://github.com/ClickHouse/ClickHouse/pull/4446) ([alexey-milovidov](https://github.com/alexey-milovidov)) ## ClickHouse release 19.1.8, 2019-02-16 ### Bug Fixes -* Fix install package with missing /etc/clickhouse-server/config.xml. [#4343](https://github.com/yandex/ClickHouse/pull/4343) ([proller](https://github.com/proller)) +* Fix install package with missing /etc/clickhouse-server/config.xml. [#4343](https://github.com/ClickHouse/ClickHouse/pull/4343) ([proller](https://github.com/proller)) ## ClickHouse release 19.1.7, 2019-02-15 ### Bug Fixes -* Correctly return the right type and properly handle locks in `joinGet` function. [#4153](https://github.com/yandex/ClickHouse/pull/4153) ([Amos Bird](https://github.com/amosbird)) -* Fixed error when system logs are tried to create again at server shutdown. [#4254](https://github.com/yandex/ClickHouse/pull/4254) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed error: if there is a database with `Dictionary` engine, all dictionaries forced to load at server startup, and if there is a dictionary with ClickHouse source from localhost, the dictionary cannot load. [#4255](https://github.com/yandex/ClickHouse/pull/4255) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed a bug when the execution of mutations containing `IN` operators was producing incorrect results. [#4099](https://github.com/yandex/ClickHouse/pull/4099) ([Alex Zatelepin](https://github.com/ztlpn)) -* `clickhouse-client` can segfault on exit while loading data for command line suggestions if it was run in interactive mode. 
[#4317](https://github.com/yandex/ClickHouse/pull/4317) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed race condition when selecting from `system.tables` may give `table doesn't exist` error. [#4313](https://github.com/yandex/ClickHouse/pull/4313) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed deadlock when `SELECT` from a table with `File` engine was retried after `No such file or directory` error. [#4161](https://github.com/yandex/ClickHouse/pull/4161) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed an issue: local ClickHouse dictionaries are loaded via TCP, but should load within process. [#4166](https://github.com/yandex/ClickHouse/pull/4166) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed `No message received` error when interacting with PostgreSQL ODBC Driver through TLS connection. Also fixes segfault when using MySQL ODBC Driver. [#4170](https://github.com/yandex/ClickHouse/pull/4170) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Temporarily disable predicate optimization for `ORDER BY`. [#3890](https://github.com/yandex/ClickHouse/pull/3890) ([Winter Zhang](https://github.com/zhang2014)) -* Fixed infinite loop when selecting from table function `numbers(0)`. [#4280](https://github.com/yandex/ClickHouse/pull/4280) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed `compile_expressions` bug with comparison of big (more than int16) dates. [#4341](https://github.com/yandex/ClickHouse/pull/4341) ([alesapin](https://github.com/alesapin)) -* Fixed segmentation fault with `uncompressed_cache=1` and exception with wrong uncompressed size. [#4186](https://github.com/yandex/ClickHouse/pull/4186) ([alesapin](https://github.com/alesapin)) -* Fixed `ALL JOIN` with duplicates in right table. [#4184](https://github.com/yandex/ClickHouse/pull/4184) ([Artem Zuikov](https://github.com/4ertus2)) -* Fixed wrong behaviour when doing `INSERT ... SELECT ... FROM file(...)` query and file has `CSVWithNames` or `TSVWIthNames` format and the first data row is missing. [#4297](https://github.com/yandex/ClickHouse/pull/4297) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed aggregate functions execution with `Array(LowCardinality)` arguments. [#4055](https://github.com/yandex/ClickHouse/pull/4055) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Debian package: correct /etc/clickhouse-server/preprocessed link according to config. [#4205](https://github.com/yandex/ClickHouse/pull/4205) ([proller](https://github.com/proller)) -* Fixed fuzz test under undefined behavior sanitizer: added parameter type check for `quantile*Weighted` family of functions. [#4145](https://github.com/yandex/ClickHouse/pull/4145) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Make `START REPLICATED SENDS` command start replicated sends. [#4229](https://github.com/yandex/ClickHouse/pull/4229) ([nvartolomei](https://github.com/nvartolomei)) -* Fixed `Not found column` for duplicate columns in JOIN ON section. [#4279](https://github.com/yandex/ClickHouse/pull/4279) ([Artem Zuikov](https://github.com/4ertus2)) -* Now `/etc/ssl` is used as default directory with SSL certificates. [#4167](https://github.com/yandex/ClickHouse/pull/4167) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed crash on dictionary reload if dictionary not available. 
[#4188](https://github.com/yandex/ClickHouse/pull/4188) ([proller](https://github.com/proller)) -* Fixed bug with incorrect `Date` and `DateTime` comparison. [#4237](https://github.com/yandex/ClickHouse/pull/4237) ([valexey](https://github.com/valexey)) -* Fixed incorrect result when `Date` and `DateTime` arguments are used in branches of conditional operator (function `if`). Added generic case for function `if`. [#4243](https://github.com/yandex/ClickHouse/pull/4243) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Correctly return the right type and properly handle locks in `joinGet` function. [#4153](https://github.com/ClickHouse/ClickHouse/pull/4153) ([Amos Bird](https://github.com/amosbird)) +* Fixed error when system logs are tried to create again at server shutdown. [#4254](https://github.com/ClickHouse/ClickHouse/pull/4254) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed error: if there is a database with `Dictionary` engine, all dictionaries forced to load at server startup, and if there is a dictionary with ClickHouse source from localhost, the dictionary cannot load. [#4255](https://github.com/ClickHouse/ClickHouse/pull/4255) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed a bug when the execution of mutations containing `IN` operators was producing incorrect results. [#4099](https://github.com/ClickHouse/ClickHouse/pull/4099) ([Alex Zatelepin](https://github.com/ztlpn)) +* `clickhouse-client` can segfault on exit while loading data for command line suggestions if it was run in interactive mode. [#4317](https://github.com/ClickHouse/ClickHouse/pull/4317) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed race condition when selecting from `system.tables` may give `table doesn't exist` error. [#4313](https://github.com/ClickHouse/ClickHouse/pull/4313) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed deadlock when `SELECT` from a table with `File` engine was retried after `No such file or directory` error. [#4161](https://github.com/ClickHouse/ClickHouse/pull/4161) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed an issue: local ClickHouse dictionaries are loaded via TCP, but should load within process. [#4166](https://github.com/ClickHouse/ClickHouse/pull/4166) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed `No message received` error when interacting with PostgreSQL ODBC Driver through TLS connection. Also fixes segfault when using MySQL ODBC Driver. [#4170](https://github.com/ClickHouse/ClickHouse/pull/4170) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Temporarily disable predicate optimization for `ORDER BY`. [#3890](https://github.com/ClickHouse/ClickHouse/pull/3890) ([Winter Zhang](https://github.com/zhang2014)) +* Fixed infinite loop when selecting from table function `numbers(0)`. [#4280](https://github.com/ClickHouse/ClickHouse/pull/4280) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed `compile_expressions` bug with comparison of big (more than int16) dates. [#4341](https://github.com/ClickHouse/ClickHouse/pull/4341) ([alesapin](https://github.com/alesapin)) +* Fixed segmentation fault with `uncompressed_cache=1` and exception with wrong uncompressed size. [#4186](https://github.com/ClickHouse/ClickHouse/pull/4186) ([alesapin](https://github.com/alesapin)) +* Fixed `ALL JOIN` with duplicates in right table. 
[#4184](https://github.com/ClickHouse/ClickHouse/pull/4184) ([Artem Zuikov](https://github.com/4ertus2)) +* Fixed wrong behaviour when doing `INSERT ... SELECT ... FROM file(...)` query and file has `CSVWithNames` or `TSVWIthNames` format and the first data row is missing. [#4297](https://github.com/ClickHouse/ClickHouse/pull/4297) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed aggregate functions execution with `Array(LowCardinality)` arguments. [#4055](https://github.com/ClickHouse/ClickHouse/pull/4055) ([KochetovNicolai](https://github.com/KochetovNicolai)) +* Debian package: correct /etc/clickhouse-server/preprocessed link according to config. [#4205](https://github.com/ClickHouse/ClickHouse/pull/4205) ([proller](https://github.com/proller)) +* Fixed fuzz test under undefined behavior sanitizer: added parameter type check for `quantile*Weighted` family of functions. [#4145](https://github.com/ClickHouse/ClickHouse/pull/4145) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Make `START REPLICATED SENDS` command start replicated sends. [#4229](https://github.com/ClickHouse/ClickHouse/pull/4229) ([nvartolomei](https://github.com/nvartolomei)) +* Fixed `Not found column` for duplicate columns in JOIN ON section. [#4279](https://github.com/ClickHouse/ClickHouse/pull/4279) ([Artem Zuikov](https://github.com/4ertus2)) +* Now `/etc/ssl` is used as default directory with SSL certificates. [#4167](https://github.com/ClickHouse/ClickHouse/pull/4167) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed crash on dictionary reload if dictionary not available. [#4188](https://github.com/ClickHouse/ClickHouse/pull/4188) ([proller](https://github.com/proller)) +* Fixed bug with incorrect `Date` and `DateTime` comparison. [#4237](https://github.com/ClickHouse/ClickHouse/pull/4237) ([valexey](https://github.com/valexey)) +* Fixed incorrect result when `Date` and `DateTime` arguments are used in branches of conditional operator (function `if`). Added generic case for function `if`. [#4243](https://github.com/ClickHouse/ClickHouse/pull/4243) ([alexey-milovidov](https://github.com/alexey-milovidov)) ## ClickHouse release 19.1.6, 2019-01-24 ### New Features -* Custom per column compression codecs for tables. [#3899](https://github.com/yandex/ClickHouse/pull/3899) [#4111](https://github.com/yandex/ClickHouse/pull/4111) ([alesapin](https://github.com/alesapin), [Winter Zhang](https://github.com/zhang2014), [Anatoly](https://github.com/Sindbag)) -* Added compression codec `Delta`. [#4052](https://github.com/yandex/ClickHouse/pull/4052) ([alesapin](https://github.com/alesapin)) -* Allow to `ALTER` compression codecs. [#4054](https://github.com/yandex/ClickHouse/pull/4054) ([alesapin](https://github.com/alesapin)) -* Added functions `left`, `right`, `trim`, `ltrim`, `rtrim`, `timestampadd`, `timestampsub` for SQL standard compatibility. [#3826](https://github.com/yandex/ClickHouse/pull/3826) ([Ivan Blinkov](https://github.com/blinkov)) -* Support for write in `HDFS` tables and `hdfs` table function. [#4084](https://github.com/yandex/ClickHouse/pull/4084) ([alesapin](https://github.com/alesapin)) -* Added functions to search for multiple constant strings from big haystack: `multiPosition`, `multiSearch` ,`firstMatch` also with `-UTF8`, `-CaseInsensitive`, and `-CaseInsensitiveUTF8` variants. 
[#4053](https://github.com/yandex/ClickHouse/pull/4053) ([Danila Kutenin](https://github.com/danlark1)) -* Pruning of unused shards if `SELECT` query filters by sharding key (setting `optimize_skip_unused_shards`). [#3851](https://github.com/yandex/ClickHouse/pull/3851) ([Gleb Kanterov](https://github.com/kanterov), [Ivan](https://github.com/abyss7)) -* Allow `Kafka` engine to ignore some number of parsing errors per block. [#4094](https://github.com/yandex/ClickHouse/pull/4094) ([Ivan](https://github.com/abyss7)) -* Added support for `CatBoost` multiclass models evaluation. Function `modelEvaluate` returns tuple with per-class raw predictions for multiclass models. `libcatboostmodel.so` should be built with [#607](https://github.com/catboost/catboost/pull/607). [#3959](https://github.com/yandex/ClickHouse/pull/3959) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Added functions `filesystemAvailable`, `filesystemFree`, `filesystemCapacity`. [#4097](https://github.com/yandex/ClickHouse/pull/4097) ([Boris Granveaud](https://github.com/bgranvea)) -* Added hashing functions `xxHash64` and `xxHash32`. [#3905](https://github.com/yandex/ClickHouse/pull/3905) ([filimonov](https://github.com/filimonov)) -* Added `gccMurmurHash` hashing function (GCC flavoured Murmur hash) which uses the same hash seed as [gcc](https://github.com/gcc-mirror/gcc/blob/41d6b10e96a1de98e90a7c0378437c3255814b16/libstdc%2B%2B-v3/include/bits/functional_hash.h#L191) [#4000](https://github.com/yandex/ClickHouse/pull/4000) ([sundyli](https://github.com/sundy-li)) -* Added hashing functions `javaHash`, `hiveHash`. [#3811](https://github.com/yandex/ClickHouse/pull/3811) ([shangshujie365](https://github.com/shangshujie365)) -* Added table function `remoteSecure`. Function works as `remote`, but uses secure connection. [#4088](https://github.com/yandex/ClickHouse/pull/4088) ([proller](https://github.com/proller)) +* Custom per column compression codecs for tables. [#3899](https://github.com/ClickHouse/ClickHouse/pull/3899) [#4111](https://github.com/ClickHouse/ClickHouse/pull/4111) ([alesapin](https://github.com/alesapin), [Winter Zhang](https://github.com/zhang2014), [Anatoly](https://github.com/Sindbag)) +* Added compression codec `Delta`. [#4052](https://github.com/ClickHouse/ClickHouse/pull/4052) ([alesapin](https://github.com/alesapin)) +* Allow to `ALTER` compression codecs. [#4054](https://github.com/ClickHouse/ClickHouse/pull/4054) ([alesapin](https://github.com/alesapin)) +* Added functions `left`, `right`, `trim`, `ltrim`, `rtrim`, `timestampadd`, `timestampsub` for SQL standard compatibility. [#3826](https://github.com/ClickHouse/ClickHouse/pull/3826) ([Ivan Blinkov](https://github.com/blinkov)) +* Support for write in `HDFS` tables and `hdfs` table function. [#4084](https://github.com/ClickHouse/ClickHouse/pull/4084) ([alesapin](https://github.com/alesapin)) +* Added functions to search for multiple constant strings from big haystack: `multiPosition`, `multiSearch` ,`firstMatch` also with `-UTF8`, `-CaseInsensitive`, and `-CaseInsensitiveUTF8` variants. [#4053](https://github.com/ClickHouse/ClickHouse/pull/4053) ([Danila Kutenin](https://github.com/danlark1)) +* Pruning of unused shards if `SELECT` query filters by sharding key (setting `optimize_skip_unused_shards`). [#3851](https://github.com/ClickHouse/ClickHouse/pull/3851) ([Gleb Kanterov](https://github.com/kanterov), [Ivan](https://github.com/abyss7)) +* Allow `Kafka` engine to ignore some number of parsing errors per block. 
[#4094](https://github.com/ClickHouse/ClickHouse/pull/4094) ([Ivan](https://github.com/abyss7)) +* Added support for `CatBoost` multiclass models evaluation. Function `modelEvaluate` returns tuple with per-class raw predictions for multiclass models. `libcatboostmodel.so` should be built with [#607](https://github.com/catboost/catboost/pull/607). [#3959](https://github.com/ClickHouse/ClickHouse/pull/3959) ([KochetovNicolai](https://github.com/KochetovNicolai)) +* Added functions `filesystemAvailable`, `filesystemFree`, `filesystemCapacity`. [#4097](https://github.com/ClickHouse/ClickHouse/pull/4097) ([Boris Granveaud](https://github.com/bgranvea)) +* Added hashing functions `xxHash64` and `xxHash32`. [#3905](https://github.com/ClickHouse/ClickHouse/pull/3905) ([filimonov](https://github.com/filimonov)) +* Added `gccMurmurHash` hashing function (GCC flavoured Murmur hash) which uses the same hash seed as [gcc](https://github.com/gcc-mirror/gcc/blob/41d6b10e96a1de98e90a7c0378437c3255814b16/libstdc%2B%2B-v3/include/bits/functional_hash.h#L191) [#4000](https://github.com/ClickHouse/ClickHouse/pull/4000) ([sundyli](https://github.com/sundy-li)) +* Added hashing functions `javaHash`, `hiveHash`. [#3811](https://github.com/ClickHouse/ClickHouse/pull/3811) ([shangshujie365](https://github.com/shangshujie365)) +* Added table function `remoteSecure`. Function works as `remote`, but uses secure connection. [#4088](https://github.com/ClickHouse/ClickHouse/pull/4088) ([proller](https://github.com/proller)) ### Experimental features -* Added multiple JOINs emulation (`allow_experimental_multiple_joins_emulation` setting). [#3946](https://github.com/yandex/ClickHouse/pull/3946) ([Artem Zuikov](https://github.com/4ertus2)) +* Added multiple JOINs emulation (`allow_experimental_multiple_joins_emulation` setting). [#3946](https://github.com/ClickHouse/ClickHouse/pull/3946) ([Artem Zuikov](https://github.com/4ertus2)) ### Bug Fixes -* Make `compiled_expression_cache_size` setting limited by default to lower memory consumption. [#4041](https://github.com/yandex/ClickHouse/pull/4041) ([alesapin](https://github.com/alesapin)) -* Fix a bug that led to hangups in threads that perform ALTERs of Replicated tables and in the thread that updates configuration from ZooKeeper. [#2947](https://github.com/yandex/ClickHouse/issues/2947) [#3891](https://github.com/yandex/ClickHouse/issues/3891) [#3934](https://github.com/yandex/ClickHouse/pull/3934) ([Alex Zatelepin](https://github.com/ztlpn)) -* Fixed a race condition when executing a distributed ALTER task. The race condition led to more than one replica trying to execute the task and all replicas except one failing with a ZooKeeper error. [#3904](https://github.com/yandex/ClickHouse/pull/3904) ([Alex Zatelepin](https://github.com/ztlpn)) -* Fix a bug when `from_zk` config elements weren't refreshed after a request to ZooKeeper timed out. [#2947](https://github.com/yandex/ClickHouse/issues/2947) [#3947](https://github.com/yandex/ClickHouse/pull/3947) ([Alex Zatelepin](https://github.com/ztlpn)) -* Fix bug with wrong prefix for IPv4 subnet masks. [#3945](https://github.com/yandex/ClickHouse/pull/3945) ([alesapin](https://github.com/alesapin)) -* Fixed crash (`std::terminate`) in rare cases when a new thread cannot be created due to exhausted resources. 
[#3956](https://github.com/yandex/ClickHouse/pull/3956) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix bug when in `remote` table function execution when wrong restrictions were used for in `getStructureOfRemoteTable`. [#4009](https://github.com/yandex/ClickHouse/pull/4009) ([alesapin](https://github.com/alesapin)) -* Fix a leak of netlink sockets. They were placed in a pool where they were never deleted and new sockets were created at the start of a new thread when all current sockets were in use. [#4017](https://github.com/yandex/ClickHouse/pull/4017) ([Alex Zatelepin](https://github.com/ztlpn)) -* Fix bug with closing `/proc/self/fd` directory earlier than all fds were read from `/proc` after forking `odbc-bridge` subprocess. [#4120](https://github.com/yandex/ClickHouse/pull/4120) ([alesapin](https://github.com/alesapin)) -* Fixed String to UInt monotonic conversion in case of usage String in primary key. [#3870](https://github.com/yandex/ClickHouse/pull/3870) ([Winter Zhang](https://github.com/zhang2014)) -* Fixed error in calculation of integer conversion function monotonicity. [#3921](https://github.com/yandex/ClickHouse/pull/3921) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed segfault in `arrayEnumerateUniq`, `arrayEnumerateDense` functions in case of some invalid arguments. [#3909](https://github.com/yandex/ClickHouse/pull/3909) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fix UB in StorageMerge. [#3910](https://github.com/yandex/ClickHouse/pull/3910) ([Amos Bird](https://github.com/amosbird)) -* Fixed segfault in functions `addDays`, `subtractDays`. [#3913](https://github.com/yandex/ClickHouse/pull/3913) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed error: functions `round`, `floor`, `trunc`, `ceil` may return bogus result when executed on integer argument and large negative scale. [#3914](https://github.com/yandex/ClickHouse/pull/3914) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed a bug induced by 'kill query sync' which leads to a core dump. [#3916](https://github.com/yandex/ClickHouse/pull/3916) ([muVulDeePecker](https://github.com/fancyqlx)) -* Fix bug with long delay after empty replication queue. [#3928](https://github.com/yandex/ClickHouse/pull/3928) [#3932](https://github.com/yandex/ClickHouse/pull/3932) ([alesapin](https://github.com/alesapin)) -* Fixed excessive memory usage in case of inserting into table with `LowCardinality` primary key. [#3955](https://github.com/yandex/ClickHouse/pull/3955) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Fixed `LowCardinality` serialization for `Native` format in case of empty arrays. [#3907](https://github.com/yandex/ClickHouse/issues/3907) [#4011](https://github.com/yandex/ClickHouse/pull/4011) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Fixed incorrect result while using distinct by single LowCardinality numeric column. [#3895](https://github.com/yandex/ClickHouse/issues/3895) [#4012](https://github.com/yandex/ClickHouse/pull/4012) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Fixed specialized aggregation with LowCardinality key (in case when `compile` setting is enabled). [#3886](https://github.com/yandex/ClickHouse/pull/3886) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Fix user and password forwarding for replicated tables queries. 
[#3957](https://github.com/yandex/ClickHouse/pull/3957) ([alesapin](https://github.com/alesapin)) ([小路](https://github.com/nicelulu)) -* Fixed very rare race condition that can happen when listing tables in Dictionary database while reloading dictionaries. [#3970](https://github.com/yandex/ClickHouse/pull/3970) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed incorrect result when HAVING was used with ROLLUP or CUBE. [#3756](https://github.com/yandex/ClickHouse/issues/3756) [#3837](https://github.com/yandex/ClickHouse/pull/3837) ([Sam Chou](https://github.com/reflection)) -* Fixed column aliases for query with `JOIN ON` syntax and distributed tables. [#3980](https://github.com/yandex/ClickHouse/pull/3980) ([Winter Zhang](https://github.com/zhang2014)) -* Fixed error in internal implementation of `quantileTDigest` (found by Artem Vakhrushev). This error never happens in ClickHouse and was relevant only for those who use ClickHouse codebase as a library directly. [#3935](https://github.com/yandex/ClickHouse/pull/3935) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Make `compiled_expression_cache_size` setting limited by default to lower memory consumption. [#4041](https://github.com/ClickHouse/ClickHouse/pull/4041) ([alesapin](https://github.com/alesapin)) +* Fix a bug that led to hangups in threads that perform ALTERs of Replicated tables and in the thread that updates configuration from ZooKeeper. [#2947](https://github.com/ClickHouse/ClickHouse/issues/2947) [#3891](https://github.com/ClickHouse/ClickHouse/issues/3891) [#3934](https://github.com/ClickHouse/ClickHouse/pull/3934) ([Alex Zatelepin](https://github.com/ztlpn)) +* Fixed a race condition when executing a distributed ALTER task. The race condition led to more than one replica trying to execute the task and all replicas except one failing with a ZooKeeper error. [#3904](https://github.com/ClickHouse/ClickHouse/pull/3904) ([Alex Zatelepin](https://github.com/ztlpn)) +* Fix a bug when `from_zk` config elements weren't refreshed after a request to ZooKeeper timed out. [#2947](https://github.com/ClickHouse/ClickHouse/issues/2947) [#3947](https://github.com/ClickHouse/ClickHouse/pull/3947) ([Alex Zatelepin](https://github.com/ztlpn)) +* Fix bug with wrong prefix for IPv4 subnet masks. [#3945](https://github.com/ClickHouse/ClickHouse/pull/3945) ([alesapin](https://github.com/alesapin)) +* Fixed crash (`std::terminate`) in rare cases when a new thread cannot be created due to exhausted resources. [#3956](https://github.com/ClickHouse/ClickHouse/pull/3956) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix bug when in `remote` table function execution when wrong restrictions were used for in `getStructureOfRemoteTable`. [#4009](https://github.com/ClickHouse/ClickHouse/pull/4009) ([alesapin](https://github.com/alesapin)) +* Fix a leak of netlink sockets. They were placed in a pool where they were never deleted and new sockets were created at the start of a new thread when all current sockets were in use. [#4017](https://github.com/ClickHouse/ClickHouse/pull/4017) ([Alex Zatelepin](https://github.com/ztlpn)) +* Fix bug with closing `/proc/self/fd` directory earlier than all fds were read from `/proc` after forking `odbc-bridge` subprocess. [#4120](https://github.com/ClickHouse/ClickHouse/pull/4120) ([alesapin](https://github.com/alesapin)) +* Fixed String to UInt monotonic conversion in case of usage String in primary key. 
[#3870](https://github.com/ClickHouse/ClickHouse/pull/3870) ([Winter Zhang](https://github.com/zhang2014)) +* Fixed error in calculation of integer conversion function monotonicity. [#3921](https://github.com/ClickHouse/ClickHouse/pull/3921) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed segfault in `arrayEnumerateUniq`, `arrayEnumerateDense` functions in case of some invalid arguments. [#3909](https://github.com/ClickHouse/ClickHouse/pull/3909) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fix UB in StorageMerge. [#3910](https://github.com/ClickHouse/ClickHouse/pull/3910) ([Amos Bird](https://github.com/amosbird)) +* Fixed segfault in functions `addDays`, `subtractDays`. [#3913](https://github.com/ClickHouse/ClickHouse/pull/3913) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed error: functions `round`, `floor`, `trunc`, `ceil` may return bogus result when executed on integer argument and large negative scale. [#3914](https://github.com/ClickHouse/ClickHouse/pull/3914) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed a bug induced by 'kill query sync' which leads to a core dump. [#3916](https://github.com/ClickHouse/ClickHouse/pull/3916) ([muVulDeePecker](https://github.com/fancyqlx)) +* Fix bug with long delay after empty replication queue. [#3928](https://github.com/ClickHouse/ClickHouse/pull/3928) [#3932](https://github.com/ClickHouse/ClickHouse/pull/3932) ([alesapin](https://github.com/alesapin)) +* Fixed excessive memory usage in case of inserting into table with `LowCardinality` primary key. [#3955](https://github.com/ClickHouse/ClickHouse/pull/3955) ([KochetovNicolai](https://github.com/KochetovNicolai)) +* Fixed `LowCardinality` serialization for `Native` format in case of empty arrays. [#3907](https://github.com/ClickHouse/ClickHouse/issues/3907) [#4011](https://github.com/ClickHouse/ClickHouse/pull/4011) ([KochetovNicolai](https://github.com/KochetovNicolai)) +* Fixed incorrect result while using distinct by single LowCardinality numeric column. [#3895](https://github.com/ClickHouse/ClickHouse/issues/3895) [#4012](https://github.com/ClickHouse/ClickHouse/pull/4012) ([KochetovNicolai](https://github.com/KochetovNicolai)) +* Fixed specialized aggregation with LowCardinality key (in case when `compile` setting is enabled). [#3886](https://github.com/ClickHouse/ClickHouse/pull/3886) ([KochetovNicolai](https://github.com/KochetovNicolai)) +* Fix user and password forwarding for replicated tables queries. [#3957](https://github.com/ClickHouse/ClickHouse/pull/3957) ([alesapin](https://github.com/alesapin)) ([小路](https://github.com/nicelulu)) +* Fixed very rare race condition that can happen when listing tables in Dictionary database while reloading dictionaries. [#3970](https://github.com/ClickHouse/ClickHouse/pull/3970) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed incorrect result when HAVING was used with ROLLUP or CUBE. [#3756](https://github.com/ClickHouse/ClickHouse/issues/3756) [#3837](https://github.com/ClickHouse/ClickHouse/pull/3837) ([Sam Chou](https://github.com/reflection)) +* Fixed column aliases for query with `JOIN ON` syntax and distributed tables. [#3980](https://github.com/ClickHouse/ClickHouse/pull/3980) ([Winter Zhang](https://github.com/zhang2014)) +* Fixed error in internal implementation of `quantileTDigest` (found by Artem Vakhrushev). This error never happens in ClickHouse and was relevant only for those who use ClickHouse codebase as a library directly. 
[#3935](https://github.com/ClickHouse/ClickHouse/pull/3935) ([alexey-milovidov](https://github.com/alexey-milovidov)) ### Improvements -* Support for `IF NOT EXISTS` in `ALTER TABLE ADD COLUMN` statements along with `IF EXISTS` in `DROP/MODIFY/CLEAR/COMMENT COLUMN`. [#3900](https://github.com/yandex/ClickHouse/pull/3900) ([Boris Granveaud](https://github.com/bgranvea)) -* Function `parseDateTimeBestEffort`: support for formats `DD.MM.YYYY`, `DD.MM.YY`, `DD-MM-YYYY`, `DD-Mon-YYYY`, `DD/Month/YYYY` and similar. [#3922](https://github.com/yandex/ClickHouse/pull/3922) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* `CapnProtoInputStream` now support jagged structures. [#4063](https://github.com/yandex/ClickHouse/pull/4063) ([Odin Hultgren Van Der Horst](https://github.com/Miniwoffer)) -* Usability improvement: added a check that server process is started from the data directory's owner. Do not allow to start server from root if the data belongs to non-root user. [#3785](https://github.com/yandex/ClickHouse/pull/3785) ([sergey-v-galtsev](https://github.com/sergey-v-galtsev)) -* Better logic of checking required columns during analysis of queries with JOINs. [#3930](https://github.com/yandex/ClickHouse/pull/3930) ([Artem Zuikov](https://github.com/4ertus2)) -* Decreased the number of connections in case of large number of Distributed tables in a single server. [#3726](https://github.com/yandex/ClickHouse/pull/3726) ([Winter Zhang](https://github.com/zhang2014)) -* Supported totals row for `WITH TOTALS` query for ODBC driver. [#3836](https://github.com/yandex/ClickHouse/pull/3836) ([Maksim Koritckiy](https://github.com/nightweb)) -* Allowed to use `Enum`s as integers inside if function. [#3875](https://github.com/yandex/ClickHouse/pull/3875) ([Ivan](https://github.com/abyss7)) -* Added `low_cardinality_allow_in_native_format` setting. If disabled, do not use `LowCadrinality` type in `Native` format. [#3879](https://github.com/yandex/ClickHouse/pull/3879) ([KochetovNicolai](https://github.com/KochetovNicolai)) -* Removed some redundant objects from compiled expressions cache to lower memory usage. [#4042](https://github.com/yandex/ClickHouse/pull/4042) ([alesapin](https://github.com/alesapin)) -* Add check that `SET send_logs_level = 'value'` query accept appropriate value. [#3873](https://github.com/yandex/ClickHouse/pull/3873) ([Sabyanin Maxim](https://github.com/s-mx)) -* Fixed data type check in type conversion functions. [#3896](https://github.com/yandex/ClickHouse/pull/3896) ([Winter Zhang](https://github.com/zhang2014)) +* Support for `IF NOT EXISTS` in `ALTER TABLE ADD COLUMN` statements along with `IF EXISTS` in `DROP/MODIFY/CLEAR/COMMENT COLUMN`. [#3900](https://github.com/ClickHouse/ClickHouse/pull/3900) ([Boris Granveaud](https://github.com/bgranvea)) +* Function `parseDateTimeBestEffort`: support for formats `DD.MM.YYYY`, `DD.MM.YY`, `DD-MM-YYYY`, `DD-Mon-YYYY`, `DD/Month/YYYY` and similar. [#3922](https://github.com/ClickHouse/ClickHouse/pull/3922) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* `CapnProtoInputStream` now support jagged structures. [#4063](https://github.com/ClickHouse/ClickHouse/pull/4063) ([Odin Hultgren Van Der Horst](https://github.com/Miniwoffer)) +* Usability improvement: added a check that server process is started from the data directory's owner. Do not allow to start server from root if the data belongs to non-root user. 
[#3785](https://github.com/ClickHouse/ClickHouse/pull/3785) ([sergey-v-galtsev](https://github.com/sergey-v-galtsev)) +* Better logic of checking required columns during analysis of queries with JOINs. [#3930](https://github.com/ClickHouse/ClickHouse/pull/3930) ([Artem Zuikov](https://github.com/4ertus2)) +* Decreased the number of connections in case of large number of Distributed tables in a single server. [#3726](https://github.com/ClickHouse/ClickHouse/pull/3726) ([Winter Zhang](https://github.com/zhang2014)) +* Supported totals row for `WITH TOTALS` query for ODBC driver. [#3836](https://github.com/ClickHouse/ClickHouse/pull/3836) ([Maksim Koritckiy](https://github.com/nightweb)) +* Allowed to use `Enum`s as integers inside if function. [#3875](https://github.com/ClickHouse/ClickHouse/pull/3875) ([Ivan](https://github.com/abyss7)) +* Added `low_cardinality_allow_in_native_format` setting. If disabled, do not use `LowCadrinality` type in `Native` format. [#3879](https://github.com/ClickHouse/ClickHouse/pull/3879) ([KochetovNicolai](https://github.com/KochetovNicolai)) +* Removed some redundant objects from compiled expressions cache to lower memory usage. [#4042](https://github.com/ClickHouse/ClickHouse/pull/4042) ([alesapin](https://github.com/alesapin)) +* Add check that `SET send_logs_level = 'value'` query accept appropriate value. [#3873](https://github.com/ClickHouse/ClickHouse/pull/3873) ([Sabyanin Maxim](https://github.com/s-mx)) +* Fixed data type check in type conversion functions. [#3896](https://github.com/ClickHouse/ClickHouse/pull/3896) ([Winter Zhang](https://github.com/zhang2014)) ### Performance Improvements -* Add a MergeTree setting `use_minimalistic_part_header_in_zookeeper`. If enabled, Replicated tables will store compact part metadata in a single part znode. This can dramatically reduce ZooKeeper snapshot size (especially if the tables have a lot of columns). Note that after enabling this setting you will not be able to downgrade to a version that doesn't support it. [#3960](https://github.com/yandex/ClickHouse/pull/3960) ([Alex Zatelepin](https://github.com/ztlpn)) -* Add an DFA-based implementation for functions `sequenceMatch` and `sequenceCount` in case pattern doesn't contain time. [#4004](https://github.com/yandex/ClickHouse/pull/4004) ([Léo Ercolanelli](https://github.com/ercolanelli-leo)) -* Performance improvement for integer numbers serialization. [#3968](https://github.com/yandex/ClickHouse/pull/3968) ([Amos Bird](https://github.com/amosbird)) -* Zero left padding PODArray so that -1 element is always valid and zeroed. It's used for branchless calculation of offsets. [#3920](https://github.com/yandex/ClickHouse/pull/3920) ([Amos Bird](https://github.com/amosbird)) -* Reverted `jemalloc` version which lead to performance degradation. [#4018](https://github.com/yandex/ClickHouse/pull/4018) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Add a MergeTree setting `use_minimalistic_part_header_in_zookeeper`. If enabled, Replicated tables will store compact part metadata in a single part znode. This can dramatically reduce ZooKeeper snapshot size (especially if the tables have a lot of columns). Note that after enabling this setting you will not be able to downgrade to a version that doesn't support it. [#3960](https://github.com/ClickHouse/ClickHouse/pull/3960) ([Alex Zatelepin](https://github.com/ztlpn)) +* Add an DFA-based implementation for functions `sequenceMatch` and `sequenceCount` in case pattern doesn't contain time. 
[#4004](https://github.com/ClickHouse/ClickHouse/pull/4004) ([Léo Ercolanelli](https://github.com/ercolanelli-leo)) +* Performance improvement for integer numbers serialization. [#3968](https://github.com/ClickHouse/ClickHouse/pull/3968) ([Amos Bird](https://github.com/amosbird)) +* Zero left padding PODArray so that -1 element is always valid and zeroed. It's used for branchless calculation of offsets. [#3920](https://github.com/ClickHouse/ClickHouse/pull/3920) ([Amos Bird](https://github.com/amosbird)) +* Reverted `jemalloc` version which lead to performance degradation. [#4018](https://github.com/ClickHouse/ClickHouse/pull/4018) ([alexey-milovidov](https://github.com/alexey-milovidov)) ### Backward Incompatible Changes -* Removed undocumented feature `ALTER MODIFY PRIMARY KEY` because it was superseded by the `ALTER MODIFY ORDER BY` command. [#3887](https://github.com/yandex/ClickHouse/pull/3887) ([Alex Zatelepin](https://github.com/ztlpn)) -* Removed function `shardByHash`. [#3833](https://github.com/yandex/ClickHouse/pull/3833) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Forbid using scalar subqueries with result of type `AggregateFunction`. [#3865](https://github.com/yandex/ClickHouse/pull/3865) ([Ivan](https://github.com/abyss7)) +* Removed undocumented feature `ALTER MODIFY PRIMARY KEY` because it was superseded by the `ALTER MODIFY ORDER BY` command. [#3887](https://github.com/ClickHouse/ClickHouse/pull/3887) ([Alex Zatelepin](https://github.com/ztlpn)) +* Removed function `shardByHash`. [#3833](https://github.com/ClickHouse/ClickHouse/pull/3833) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Forbid using scalar subqueries with result of type `AggregateFunction`. [#3865](https://github.com/ClickHouse/ClickHouse/pull/3865) ([Ivan](https://github.com/abyss7)) ### Build/Testing/Packaging Improvements -* Added support for PowerPC (`ppc64le`) build. [#4132](https://github.com/yandex/ClickHouse/pull/4132) ([Danila Kutenin](https://github.com/danlark1)) -* Stateful functional tests are run on public available dataset. [#3969](https://github.com/yandex/ClickHouse/pull/3969) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed error when the server cannot start with the `bash: /usr/bin/clickhouse-extract-from-config: Operation not permitted` message within Docker or systemd-nspawn. [#4136](https://github.com/yandex/ClickHouse/pull/4136) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Updated `rdkafka` library to v1.0.0-RC5. Used cppkafka instead of raw C interface. [#4025](https://github.com/yandex/ClickHouse/pull/4025) ([Ivan](https://github.com/abyss7)) -* Updated `mariadb-client` library. Fixed one of issues found by UBSan. [#3924](https://github.com/yandex/ClickHouse/pull/3924) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Some fixes for UBSan builds. [#3926](https://github.com/yandex/ClickHouse/pull/3926) [#3021](https://github.com/yandex/ClickHouse/pull/3021) [#3948](https://github.com/yandex/ClickHouse/pull/3948) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added support for PowerPC (`ppc64le`) build. [#4132](https://github.com/ClickHouse/ClickHouse/pull/4132) ([Danila Kutenin](https://github.com/danlark1)) +* Stateful functional tests are run on public available dataset. 
[#3969](https://github.com/ClickHouse/ClickHouse/pull/3969) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed error when the server cannot start with the `bash: /usr/bin/clickhouse-extract-from-config: Operation not permitted` message within Docker or systemd-nspawn. [#4136](https://github.com/ClickHouse/ClickHouse/pull/4136) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Updated `rdkafka` library to v1.0.0-RC5. Used cppkafka instead of raw C interface. [#4025](https://github.com/ClickHouse/ClickHouse/pull/4025) ([Ivan](https://github.com/abyss7)) +* Updated `mariadb-client` library. Fixed one of issues found by UBSan. [#3924](https://github.com/ClickHouse/ClickHouse/pull/3924) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Some fixes for UBSan builds. [#3926](https://github.com/ClickHouse/ClickHouse/pull/3926) [#3021](https://github.com/ClickHouse/ClickHouse/pull/3021) [#3948](https://github.com/ClickHouse/ClickHouse/pull/3948) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Added per-commit runs of tests with UBSan build. * Added per-commit runs of PVS-Studio static analyzer. -* Fixed bugs found by PVS-Studio. [#4013](https://github.com/yandex/ClickHouse/pull/4013) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed glibc compatibility issues. [#4100](https://github.com/yandex/ClickHouse/pull/4100) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Move Docker images to 18.10 and add compatibility file for glibc >= 2.28 [#3965](https://github.com/yandex/ClickHouse/pull/3965) ([alesapin](https://github.com/alesapin)) -* Add env variable if user don't want to chown directories in server Docker image. [#3967](https://github.com/yandex/ClickHouse/pull/3967) ([alesapin](https://github.com/alesapin)) -* Enabled most of the warnings from `-Weverything` in clang. Enabled `-Wpedantic`. [#3986](https://github.com/yandex/ClickHouse/pull/3986) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Added a few more warnings that are available only in clang 8. [#3993](https://github.com/yandex/ClickHouse/pull/3993) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Link to `libLLVM` rather than to individual LLVM libs when using shared linking. [#3989](https://github.com/yandex/ClickHouse/pull/3989) ([Orivej Desh](https://github.com/orivej)) -* Added sanitizer variables for test images. [#4072](https://github.com/yandex/ClickHouse/pull/4072) ([alesapin](https://github.com/alesapin)) -* `clickhouse-server` debian package will recommend `libcap2-bin` package to use `setcap` tool for setting capabilities. This is optional. [#4093](https://github.com/yandex/ClickHouse/pull/4093) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Improved compilation time, fixed includes. [#3898](https://github.com/yandex/ClickHouse/pull/3898) ([proller](https://github.com/proller)) -* Added performance tests for hash functions. [#3918](https://github.com/yandex/ClickHouse/pull/3918) ([filimonov](https://github.com/filimonov)) -* Fixed cyclic library dependences. [#3958](https://github.com/yandex/ClickHouse/pull/3958) ([proller](https://github.com/proller)) -* Improved compilation with low available memory. [#4030](https://github.com/yandex/ClickHouse/pull/4030) ([proller](https://github.com/proller)) -* Added test script to reproduce performance degradation in `jemalloc`. 
[#4036](https://github.com/yandex/ClickHouse/pull/4036) ([alexey-milovidov](https://github.com/alexey-milovidov)) -* Fixed misspells in comments and string literals under `dbms`. [#4122](https://github.com/yandex/ClickHouse/pull/4122) ([maiha](https://github.com/maiha)) -* Fixed typos in comments. [#4089](https://github.com/yandex/ClickHouse/pull/4089) ([Evgenii Pravda](https://github.com/kvinty)) +* Fixed bugs found by PVS-Studio. [#4013](https://github.com/ClickHouse/ClickHouse/pull/4013) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed glibc compatibility issues. [#4100](https://github.com/ClickHouse/ClickHouse/pull/4100) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Move Docker images to 18.10 and add compatibility file for glibc >= 2.28 [#3965](https://github.com/ClickHouse/ClickHouse/pull/3965) ([alesapin](https://github.com/alesapin)) +* Add env variable if user don't want to chown directories in server Docker image. [#3967](https://github.com/ClickHouse/ClickHouse/pull/3967) ([alesapin](https://github.com/alesapin)) +* Enabled most of the warnings from `-Weverything` in clang. Enabled `-Wpedantic`. [#3986](https://github.com/ClickHouse/ClickHouse/pull/3986) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Added a few more warnings that are available only in clang 8. [#3993](https://github.com/ClickHouse/ClickHouse/pull/3993) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Link to `libLLVM` rather than to individual LLVM libs when using shared linking. [#3989](https://github.com/ClickHouse/ClickHouse/pull/3989) ([Orivej Desh](https://github.com/orivej)) +* Added sanitizer variables for test images. [#4072](https://github.com/ClickHouse/ClickHouse/pull/4072) ([alesapin](https://github.com/alesapin)) +* `clickhouse-server` debian package will recommend `libcap2-bin` package to use `setcap` tool for setting capabilities. This is optional. [#4093](https://github.com/ClickHouse/ClickHouse/pull/4093) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Improved compilation time, fixed includes. [#3898](https://github.com/ClickHouse/ClickHouse/pull/3898) ([proller](https://github.com/proller)) +* Added performance tests for hash functions. [#3918](https://github.com/ClickHouse/ClickHouse/pull/3918) ([filimonov](https://github.com/filimonov)) +* Fixed cyclic library dependences. [#3958](https://github.com/ClickHouse/ClickHouse/pull/3958) ([proller](https://github.com/proller)) +* Improved compilation with low available memory. [#4030](https://github.com/ClickHouse/ClickHouse/pull/4030) ([proller](https://github.com/proller)) +* Added test script to reproduce performance degradation in `jemalloc`. [#4036](https://github.com/ClickHouse/ClickHouse/pull/4036) ([alexey-milovidov](https://github.com/alexey-milovidov)) +* Fixed misspells in comments and string literals under `dbms`. [#4122](https://github.com/ClickHouse/ClickHouse/pull/4122) ([maiha](https://github.com/maiha)) +* Fixed typos in comments. [#4089](https://github.com/ClickHouse/ClickHouse/pull/4089) ([Evgenii Pravda](https://github.com/kvinty)) ## ClickHouse release 18.16.1, 2018-12-21 ### Bug fixes: -* Fixed an error that led to problems with updating dictionaries with the ODBC source. [#3825](https://github.com/yandex/ClickHouse/issues/3825), [#3829](https://github.com/yandex/ClickHouse/issues/3829) -* JIT compilation of aggregate functions now works with LowCardinality columns. 
[#3838](https://github.com/yandex/ClickHouse/issues/3838) +* Fixed an error that led to problems with updating dictionaries with the ODBC source. [#3825](https://github.com/ClickHouse/ClickHouse/issues/3825), [#3829](https://github.com/ClickHouse/ClickHouse/issues/3829) +* JIT compilation of aggregate functions now works with LowCardinality columns. [#3838](https://github.com/ClickHouse/ClickHouse/issues/3838) ### Improvements: -* Added the `low_cardinality_allow_in_native_format` setting (enabled by default). When disabled, LowCardinality columns will be converted to ordinary columns for SELECT queries and ordinary columns will be expected for INSERT queries. [#3879](https://github.com/yandex/ClickHouse/pull/3879) +* Added the `low_cardinality_allow_in_native_format` setting (enabled by default). When disabled, LowCardinality columns will be converted to ordinary columns for SELECT queries and ordinary columns will be expected for INSERT queries. [#3879](https://github.com/ClickHouse/ClickHouse/pull/3879) ### Build improvements: @@ -1445,104 +1445,104 @@ This release contains exactly the same set of patches as 19.3.6. ### New features: -* `DEFAULT` expressions are evaluated for missing fields when loading data in semi-structured input formats (`JSONEachRow`, `TSKV`). The feature is enabled with the `insert_sample_with_metadata` setting. [#3555](https://github.com/yandex/ClickHouse/pull/3555) -* The `ALTER TABLE` query now has the `MODIFY ORDER BY` action for changing the sorting key when adding or removing a table column. This is useful for tables in the `MergeTree` family that perform additional tasks when merging based on this sorting key, such as `SummingMergeTree`, `AggregatingMergeTree`, and so on. [#3581](https://github.com/yandex/ClickHouse/pull/3581) [#3755](https://github.com/yandex/ClickHouse/pull/3755) -* For tables in the `MergeTree` family, now you can specify a different sorting key (`ORDER BY`) and index (`PRIMARY KEY`). The sorting key can be longer than the index. [#3581](https://github.com/yandex/ClickHouse/pull/3581) -* Added the `hdfs` table function and the `HDFS` table engine for importing and exporting data to HDFS. [chenxing-xc](https://github.com/yandex/ClickHouse/pull/3617) -* Added functions for working with base64: `base64Encode`, `base64Decode`, `tryBase64Decode`. [Alexander Krasheninnikov](https://github.com/yandex/ClickHouse/pull/3350) -* Now you can use a parameter to configure the precision of the `uniqCombined` aggregate function (select the number of HyperLogLog cells). [#3406](https://github.com/yandex/ClickHouse/pull/3406) -* Added the `system.contributors` table that contains the names of everyone who made commits in ClickHouse. [#3452](https://github.com/yandex/ClickHouse/pull/3452) -* Added the ability to omit the partition for the `ALTER TABLE ... FREEZE` query in order to back up all partitions at once. [#3514](https://github.com/yandex/ClickHouse/pull/3514) -* Added `dictGet` and `dictGetOrDefault` functions that don't require specifying the type of return value. The type is determined automatically from the dictionary description. [Amos Bird](https://github.com/yandex/ClickHouse/pull/3564) -* Now you can specify comments for a column in the table description and change it using `ALTER`. [#3377](https://github.com/yandex/ClickHouse/pull/3377) -* Reading is supported for `Join` type tables with simple keys. 
[Amos Bird](https://github.com/yandex/ClickHouse/pull/3728) -* Now you can specify the options `join_use_nulls`, `max_rows_in_join`, `max_bytes_in_join`, and `join_overflow_mode` when creating a `Join` type table. [Amos Bird](https://github.com/yandex/ClickHouse/pull/3728) -* Added the `joinGet` function that allows you to use a `Join` type table like a dictionary. [Amos Bird](https://github.com/yandex/ClickHouse/pull/3728) -* Added the `partition_key`, `sorting_key`, `primary_key`, and `sampling_key` columns to the `system.tables` table in order to provide information about table keys. [#3609](https://github.com/yandex/ClickHouse/pull/3609) -* Added the `is_in_partition_key`, `is_in_sorting_key`, `is_in_primary_key`, and `is_in_sampling_key` columns to the `system.columns` table. [#3609](https://github.com/yandex/ClickHouse/pull/3609) -* Added the `min_time` and `max_time` columns to the `system.parts` table. These columns are populated when the partitioning key is an expression consisting of `DateTime` columns. [Emmanuel Donin de Rosière](https://github.com/yandex/ClickHouse/pull/3800) +* `DEFAULT` expressions are evaluated for missing fields when loading data in semi-structured input formats (`JSONEachRow`, `TSKV`). The feature is enabled with the `insert_sample_with_metadata` setting. [#3555](https://github.com/ClickHouse/ClickHouse/pull/3555) +* The `ALTER TABLE` query now has the `MODIFY ORDER BY` action for changing the sorting key when adding or removing a table column. This is useful for tables in the `MergeTree` family that perform additional tasks when merging based on this sorting key, such as `SummingMergeTree`, `AggregatingMergeTree`, and so on. [#3581](https://github.com/ClickHouse/ClickHouse/pull/3581) [#3755](https://github.com/ClickHouse/ClickHouse/pull/3755) +* For tables in the `MergeTree` family, now you can specify a different sorting key (`ORDER BY`) and index (`PRIMARY KEY`). The sorting key can be longer than the index. [#3581](https://github.com/ClickHouse/ClickHouse/pull/3581) +* Added the `hdfs` table function and the `HDFS` table engine for importing and exporting data to HDFS. [chenxing-xc](https://github.com/ClickHouse/ClickHouse/pull/3617) +* Added functions for working with base64: `base64Encode`, `base64Decode`, `tryBase64Decode`. [Alexander Krasheninnikov](https://github.com/ClickHouse/ClickHouse/pull/3350) +* Now you can use a parameter to configure the precision of the `uniqCombined` aggregate function (select the number of HyperLogLog cells). [#3406](https://github.com/ClickHouse/ClickHouse/pull/3406) +* Added the `system.contributors` table that contains the names of everyone who made commits in ClickHouse. [#3452](https://github.com/ClickHouse/ClickHouse/pull/3452) +* Added the ability to omit the partition for the `ALTER TABLE ... FREEZE` query in order to back up all partitions at once. [#3514](https://github.com/ClickHouse/ClickHouse/pull/3514) +* Added `dictGet` and `dictGetOrDefault` functions that don't require specifying the type of return value. The type is determined automatically from the dictionary description. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3564) +* Now you can specify comments for a column in the table description and change it using `ALTER`. [#3377](https://github.com/ClickHouse/ClickHouse/pull/3377) +* Reading is supported for `Join` type tables with simple keys. 
[Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3728) +* Now you can specify the options `join_use_nulls`, `max_rows_in_join`, `max_bytes_in_join`, and `join_overflow_mode` when creating a `Join` type table. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3728) +* Added the `joinGet` function that allows you to use a `Join` type table like a dictionary. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3728) +* Added the `partition_key`, `sorting_key`, `primary_key`, and `sampling_key` columns to the `system.tables` table in order to provide information about table keys. [#3609](https://github.com/ClickHouse/ClickHouse/pull/3609) +* Added the `is_in_partition_key`, `is_in_sorting_key`, `is_in_primary_key`, and `is_in_sampling_key` columns to the `system.columns` table. [#3609](https://github.com/ClickHouse/ClickHouse/pull/3609) +* Added the `min_time` and `max_time` columns to the `system.parts` table. These columns are populated when the partitioning key is an expression consisting of `DateTime` columns. [Emmanuel Donin de Rosière](https://github.com/ClickHouse/ClickHouse/pull/3800) ### Bug fixes: -* Fixes and performance improvements for the `LowCardinality` data type. `GROUP BY` using `LowCardinality(Nullable(...))`. Getting the values of `extremes`. Processing high-order functions. `LEFT ARRAY JOIN`. Distributed `GROUP BY`. Functions that return `Array`. Execution of `ORDER BY`. Writing to `Distributed` tables (nicelulu). Backward compatibility for `INSERT` queries from old clients that implement the `Native` protocol. Support for `LowCardinality` for `JOIN`. Improved performance when working in a single stream. [#3823](https://github.com/yandex/ClickHouse/pull/3823) [#3803](https://github.com/yandex/ClickHouse/pull/3803) [#3799](https://github.com/yandex/ClickHouse/pull/3799) [#3769](https://github.com/yandex/ClickHouse/pull/3769) [#3744](https://github.com/yandex/ClickHouse/pull/3744) [#3681](https://github.com/yandex/ClickHouse/pull/3681) [#3651](https://github.com/yandex/ClickHouse/pull/3651) [#3649](https://github.com/yandex/ClickHouse/pull/3649) [#3641](https://github.com/yandex/ClickHouse/pull/3641) [#3632](https://github.com/yandex/ClickHouse/pull/3632) [#3568](https://github.com/yandex/ClickHouse/pull/3568) [#3523](https://github.com/yandex/ClickHouse/pull/3523) [#3518](https://github.com/yandex/ClickHouse/pull/3518) -* Fixed how the `select_sequential_consistency` option works. Previously, when this setting was enabled, an incomplete result was sometimes returned after beginning to write to a new partition. [#2863](https://github.com/yandex/ClickHouse/pull/2863) -* Databases are correctly specified when executing DDL `ON CLUSTER` queries and `ALTER UPDATE/DELETE`. [#3772](https://github.com/yandex/ClickHouse/pull/3772) [#3460](https://github.com/yandex/ClickHouse/pull/3460) -* Databases are correctly specified for subqueries inside a VIEW. [#3521](https://github.com/yandex/ClickHouse/pull/3521) -* Fixed a bug in `PREWHERE` with `FINAL` for `VersionedCollapsingMergeTree`. [7167bfd7](https://github.com/yandex/ClickHouse/commit/7167bfd7b365538f7a91c4307ad77e552ab4e8c1) -* Now you can use `KILL QUERY` to cancel queries that have not started yet because they are waiting for the table to be locked. [#3517](https://github.com/yandex/ClickHouse/pull/3517) -* Corrected date and time calculations if the clocks were moved back at midnight (this happens in Iran, and happened in Moscow from 1981 to 1983). 
Previously, this led to the time being reset a day earlier than necessary, and also caused incorrect formatting of the date and time in text format. [#3819](https://github.com/yandex/ClickHouse/pull/3819) -* Fixed bugs in some cases of `VIEW` and subqueries that omit the database. [Winter Zhang](https://github.com/yandex/ClickHouse/pull/3521) -* Fixed a race condition when simultaneously reading from a `MATERIALIZED VIEW` and deleting a `MATERIALIZED VIEW` due to not locking the internal `MATERIALIZED VIEW`. [#3404](https://github.com/yandex/ClickHouse/pull/3404) [#3694](https://github.com/yandex/ClickHouse/pull/3694) -* Fixed the error `Lock handler cannot be nullptr.` [#3689](https://github.com/yandex/ClickHouse/pull/3689) -* Fixed query processing when the `compile_expressions` option is enabled (it's enabled by default). Nondeterministic constant expressions like the `now` function are no longer unfolded. [#3457](https://github.com/yandex/ClickHouse/pull/3457) +* Fixes and performance improvements for the `LowCardinality` data type. `GROUP BY` using `LowCardinality(Nullable(...))`. Getting the values of `extremes`. Processing high-order functions. `LEFT ARRAY JOIN`. Distributed `GROUP BY`. Functions that return `Array`. Execution of `ORDER BY`. Writing to `Distributed` tables (nicelulu). Backward compatibility for `INSERT` queries from old clients that implement the `Native` protocol. Support for `LowCardinality` for `JOIN`. Improved performance when working in a single stream. [#3823](https://github.com/ClickHouse/ClickHouse/pull/3823) [#3803](https://github.com/ClickHouse/ClickHouse/pull/3803) [#3799](https://github.com/ClickHouse/ClickHouse/pull/3799) [#3769](https://github.com/ClickHouse/ClickHouse/pull/3769) [#3744](https://github.com/ClickHouse/ClickHouse/pull/3744) [#3681](https://github.com/ClickHouse/ClickHouse/pull/3681) [#3651](https://github.com/ClickHouse/ClickHouse/pull/3651) [#3649](https://github.com/ClickHouse/ClickHouse/pull/3649) [#3641](https://github.com/ClickHouse/ClickHouse/pull/3641) [#3632](https://github.com/ClickHouse/ClickHouse/pull/3632) [#3568](https://github.com/ClickHouse/ClickHouse/pull/3568) [#3523](https://github.com/ClickHouse/ClickHouse/pull/3523) [#3518](https://github.com/ClickHouse/ClickHouse/pull/3518) +* Fixed how the `select_sequential_consistency` option works. Previously, when this setting was enabled, an incomplete result was sometimes returned after beginning to write to a new partition. [#2863](https://github.com/ClickHouse/ClickHouse/pull/2863) +* Databases are correctly specified when executing DDL `ON CLUSTER` queries and `ALTER UPDATE/DELETE`. [#3772](https://github.com/ClickHouse/ClickHouse/pull/3772) [#3460](https://github.com/ClickHouse/ClickHouse/pull/3460) +* Databases are correctly specified for subqueries inside a VIEW. [#3521](https://github.com/ClickHouse/ClickHouse/pull/3521) +* Fixed a bug in `PREWHERE` with `FINAL` for `VersionedCollapsingMergeTree`. [7167bfd7](https://github.com/ClickHouse/ClickHouse/commit/7167bfd7b365538f7a91c4307ad77e552ab4e8c1) +* Now you can use `KILL QUERY` to cancel queries that have not started yet because they are waiting for the table to be locked. [#3517](https://github.com/ClickHouse/ClickHouse/pull/3517) +* Corrected date and time calculations if the clocks were moved back at midnight (this happens in Iran, and happened in Moscow from 1981 to 1983). 
Previously, this led to the time being reset a day earlier than necessary, and also caused incorrect formatting of the date and time in text format. [#3819](https://github.com/ClickHouse/ClickHouse/pull/3819) +* Fixed bugs in some cases of `VIEW` and subqueries that omit the database. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3521) +* Fixed a race condition when simultaneously reading from a `MATERIALIZED VIEW` and deleting a `MATERIALIZED VIEW` due to not locking the internal `MATERIALIZED VIEW`. [#3404](https://github.com/ClickHouse/ClickHouse/pull/3404) [#3694](https://github.com/ClickHouse/ClickHouse/pull/3694) +* Fixed the error `Lock handler cannot be nullptr.` [#3689](https://github.com/ClickHouse/ClickHouse/pull/3689) +* Fixed query processing when the `compile_expressions` option is enabled (it's enabled by default). Nondeterministic constant expressions like the `now` function are no longer unfolded. [#3457](https://github.com/ClickHouse/ClickHouse/pull/3457) * Fixed a crash when specifying a non-constant scale argument in `toDecimal32/64/128` functions. -* Fixed an error when trying to insert an array with `NULL` elements in the `Values` format into a column of type `Array` without `Nullable` (if `input_format_values_interpret_expressions` = 1). [#3487](https://github.com/yandex/ClickHouse/pull/3487) [#3503](https://github.com/yandex/ClickHouse/pull/3503) -* Fixed continuous error logging in `DDLWorker` if ZooKeeper is not available. [8f50c620](https://github.com/yandex/ClickHouse/commit/8f50c620334988b28018213ec0092fe6423847e2) -* Fixed the return type for `quantile*` functions from `Date` and `DateTime` types of arguments. [#3580](https://github.com/yandex/ClickHouse/pull/3580) -* Fixed the `WITH` clause if it specifies a simple alias without expressions. [#3570](https://github.com/yandex/ClickHouse/pull/3570) -* Fixed processing of queries with named sub-queries and qualified column names when `enable_optimize_predicate_expression` is enabled. [Winter Zhang](https://github.com/yandex/ClickHouse/pull/3588) -* Fixed the error `Attempt to attach to nullptr thread group` when working with materialized views. [Marek Vavruša](https://github.com/yandex/ClickHouse/pull/3623) -* Fixed a crash when passing certain incorrect arguments to the `arrayReverse` function. [73e3a7b6](https://github.com/yandex/ClickHouse/commit/73e3a7b662161d6005e7727d8a711b930386b871) -* Fixed the buffer overflow in the `extractURLParameter` function. Improved performance. Added correct processing of strings containing zero bytes. [141e9799](https://github.com/yandex/ClickHouse/commit/141e9799e49201d84ea8e951d1bed4fb6d3dacb5) -* Fixed buffer overflow in the `lowerUTF8` and `upperUTF8` functions. Removed the ability to execute these functions over `FixedString` type arguments. [#3662](https://github.com/yandex/ClickHouse/pull/3662) -* Fixed a rare race condition when deleting `MergeTree` tables. [#3680](https://github.com/yandex/ClickHouse/pull/3680) -* Fixed a race condition when reading from `Buffer` tables and simultaneously performing `ALTER` or `DROP` on the target tables. [#3719](https://github.com/yandex/ClickHouse/pull/3719) -* Fixed a segfault if the `max_temporary_non_const_columns` limit was exceeded. [#3788](https://github.com/yandex/ClickHouse/pull/3788) +* Fixed an error when trying to insert an array with `NULL` elements in the `Values` format into a column of type `Array` without `Nullable` (if `input_format_values_interpret_expressions` = 1). 
[#3487](https://github.com/ClickHouse/ClickHouse/pull/3487) [#3503](https://github.com/ClickHouse/ClickHouse/pull/3503) +* Fixed continuous error logging in `DDLWorker` if ZooKeeper is not available. [8f50c620](https://github.com/ClickHouse/ClickHouse/commit/8f50c620334988b28018213ec0092fe6423847e2) +* Fixed the return type for `quantile*` functions from `Date` and `DateTime` types of arguments. [#3580](https://github.com/ClickHouse/ClickHouse/pull/3580) +* Fixed the `WITH` clause if it specifies a simple alias without expressions. [#3570](https://github.com/ClickHouse/ClickHouse/pull/3570) +* Fixed processing of queries with named sub-queries and qualified column names when `enable_optimize_predicate_expression` is enabled. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3588) +* Fixed the error `Attempt to attach to nullptr thread group` when working with materialized views. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3623) +* Fixed a crash when passing certain incorrect arguments to the `arrayReverse` function. [73e3a7b6](https://github.com/ClickHouse/ClickHouse/commit/73e3a7b662161d6005e7727d8a711b930386b871) +* Fixed the buffer overflow in the `extractURLParameter` function. Improved performance. Added correct processing of strings containing zero bytes. [141e9799](https://github.com/ClickHouse/ClickHouse/commit/141e9799e49201d84ea8e951d1bed4fb6d3dacb5) +* Fixed buffer overflow in the `lowerUTF8` and `upperUTF8` functions. Removed the ability to execute these functions over `FixedString` type arguments. [#3662](https://github.com/ClickHouse/ClickHouse/pull/3662) +* Fixed a rare race condition when deleting `MergeTree` tables. [#3680](https://github.com/ClickHouse/ClickHouse/pull/3680) +* Fixed a race condition when reading from `Buffer` tables and simultaneously performing `ALTER` or `DROP` on the target tables. [#3719](https://github.com/ClickHouse/ClickHouse/pull/3719) +* Fixed a segfault if the `max_temporary_non_const_columns` limit was exceeded. [#3788](https://github.com/ClickHouse/ClickHouse/pull/3788) ### Improvements: -* The server does not write the processed configuration files to the `/etc/clickhouse-server/` directory. Instead, it saves them in the `preprocessed_configs` directory inside `path`. This means that the `/etc/clickhouse-server/` directory doesn't have write access for the `clickhouse` user, which improves security. [#2443](https://github.com/yandex/ClickHouse/pull/2443) -* The `min_merge_bytes_to_use_direct_io` option is set to 10 GiB by default. A merge that forms large parts of tables from the MergeTree family will be performed in `O_DIRECT` mode, which prevents excessive page cache eviction. [#3504](https://github.com/yandex/ClickHouse/pull/3504) -* Accelerated server start when there is a very large number of tables. [#3398](https://github.com/yandex/ClickHouse/pull/3398) -* Added a connection pool and HTTP `Keep-Alive` for connections between replicas. [#3594](https://github.com/yandex/ClickHouse/pull/3594) -* If the query syntax is invalid, the `400 Bad Request` code is returned in the `HTTP` interface (500 was returned previously). [31bc680a](https://github.com/yandex/ClickHouse/commit/31bc680ac5f4bb1d0360a8ba4696fa84bb47d6ab) -* The `join_default_strictness` option is set to `ALL` by default for compatibility. [120e2cbe](https://github.com/yandex/ClickHouse/commit/120e2cbe2ff4fbad626c28042d9b28781c805afe) -* Removed logging to `stderr` from the `re2` library for invalid or complex regular expressions. 
[#3723](https://github.com/yandex/ClickHouse/pull/3723) -* Added for the `Kafka` table engine: checks for subscriptions before beginning to read from Kafka; the kafka_max_block_size setting for the table. [Marek Vavruša](https://github.com/yandex/ClickHouse/pull/3396) -* The `cityHash64`, `farmHash64`, `metroHash64`, `sipHash64`, `halfMD5`, `murmurHash2_32`, `murmurHash2_64`, `murmurHash3_32`, and `murmurHash3_64` functions now work for any number of arguments and for arguments in the form of tuples. [#3451](https://github.com/yandex/ClickHouse/pull/3451) [#3519](https://github.com/yandex/ClickHouse/pull/3519) -* The `arrayReverse` function now works with any types of arrays. [73e3a7b6](https://github.com/yandex/ClickHouse/commit/73e3a7b662161d6005e7727d8a711b930386b871) -* Added an optional parameter: the slot size for the `timeSlots` function. [Kirill Shvakov](https://github.com/yandex/ClickHouse/pull/3724) -* For `FULL` and `RIGHT JOIN`, the `max_block_size` setting is used for a stream of non-joined data from the right table. [Amos Bird](https://github.com/yandex/ClickHouse/pull/3699) -* Added the `--secure` command line parameter in `clickhouse-benchmark` and `clickhouse-performance-test` to enable TLS. [#3688](https://github.com/yandex/ClickHouse/pull/3688) [#3690](https://github.com/yandex/ClickHouse/pull/3690) -* Type conversion when the structure of a `Buffer` type table does not match the structure of the destination table. [Vitaly Baranov](https://github.com/yandex/ClickHouse/pull/3603) -* Added the `tcp_keep_alive_timeout` option to enable keep-alive packets after inactivity for the specified time interval. [#3441](https://github.com/yandex/ClickHouse/pull/3441) -* Removed unnecessary quoting of values for the partition key in the `system.parts` table if it consists of a single column. [#3652](https://github.com/yandex/ClickHouse/pull/3652) -* The modulo function works for `Date` and `DateTime` data types. [#3385](https://github.com/yandex/ClickHouse/pull/3385) -* Added synonyms for the `POWER`, `LN`, `LCASE`, `UCASE`, `REPLACE`, `LOCATE`, `SUBSTR`, and `MID` functions. [#3774](https://github.com/yandex/ClickHouse/pull/3774) [#3763](https://github.com/yandex/ClickHouse/pull/3763) Some function names are case-insensitive for compatibility with the SQL standard. Added syntactic sugar `SUBSTRING(expr FROM start FOR length)` for compatibility with SQL. [#3804](https://github.com/yandex/ClickHouse/pull/3804) -* Added the ability to `mlock` memory pages corresponding to `clickhouse-server` executable code to prevent it from being forced out of memory. This feature is disabled by default. [#3553](https://github.com/yandex/ClickHouse/pull/3553) -* Improved performance when reading from `O_DIRECT` (with the `min_bytes_to_use_direct_io` option enabled). [#3405](https://github.com/yandex/ClickHouse/pull/3405) -* Improved performance of the `dictGet...OrDefault` function for a constant key argument and a non-constant default argument. [Amos Bird](https://github.com/yandex/ClickHouse/pull/3563) -* The `firstSignificantSubdomain` function now processes the domains `gov`, `mil`, and `edu`. [Igor Hatarist](https://github.com/yandex/ClickHouse/pull/3601) Improved performance. [#3628](https://github.com/yandex/ClickHouse/pull/3628) +* The server does not write the processed configuration files to the `/etc/clickhouse-server/` directory. Instead, it saves them in the `preprocessed_configs` directory inside `path`. 
This means that the `/etc/clickhouse-server/` directory doesn't have write access for the `clickhouse` user, which improves security. [#2443](https://github.com/ClickHouse/ClickHouse/pull/2443) +* The `min_merge_bytes_to_use_direct_io` option is set to 10 GiB by default. A merge that forms large parts of tables from the MergeTree family will be performed in `O_DIRECT` mode, which prevents excessive page cache eviction. [#3504](https://github.com/ClickHouse/ClickHouse/pull/3504) +* Accelerated server start when there is a very large number of tables. [#3398](https://github.com/ClickHouse/ClickHouse/pull/3398) +* Added a connection pool and HTTP `Keep-Alive` for connections between replicas. [#3594](https://github.com/ClickHouse/ClickHouse/pull/3594) +* If the query syntax is invalid, the `400 Bad Request` code is returned in the `HTTP` interface (500 was returned previously). [31bc680a](https://github.com/ClickHouse/ClickHouse/commit/31bc680ac5f4bb1d0360a8ba4696fa84bb47d6ab) +* The `join_default_strictness` option is set to `ALL` by default for compatibility. [120e2cbe](https://github.com/ClickHouse/ClickHouse/commit/120e2cbe2ff4fbad626c28042d9b28781c805afe) +* Removed logging to `stderr` from the `re2` library for invalid or complex regular expressions. [#3723](https://github.com/ClickHouse/ClickHouse/pull/3723) +* Added for the `Kafka` table engine: checks for subscriptions before beginning to read from Kafka; the kafka_max_block_size setting for the table. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3396) +* The `cityHash64`, `farmHash64`, `metroHash64`, `sipHash64`, `halfMD5`, `murmurHash2_32`, `murmurHash2_64`, `murmurHash3_32`, and `murmurHash3_64` functions now work for any number of arguments and for arguments in the form of tuples. [#3451](https://github.com/ClickHouse/ClickHouse/pull/3451) [#3519](https://github.com/ClickHouse/ClickHouse/pull/3519) +* The `arrayReverse` function now works with any types of arrays. [73e3a7b6](https://github.com/ClickHouse/ClickHouse/commit/73e3a7b662161d6005e7727d8a711b930386b871) +* Added an optional parameter: the slot size for the `timeSlots` function. [Kirill Shvakov](https://github.com/ClickHouse/ClickHouse/pull/3724) +* For `FULL` and `RIGHT JOIN`, the `max_block_size` setting is used for a stream of non-joined data from the right table. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3699) +* Added the `--secure` command line parameter in `clickhouse-benchmark` and `clickhouse-performance-test` to enable TLS. [#3688](https://github.com/ClickHouse/ClickHouse/pull/3688) [#3690](https://github.com/ClickHouse/ClickHouse/pull/3690) +* Type conversion when the structure of a `Buffer` type table does not match the structure of the destination table. [Vitaly Baranov](https://github.com/ClickHouse/ClickHouse/pull/3603) +* Added the `tcp_keep_alive_timeout` option to enable keep-alive packets after inactivity for the specified time interval. [#3441](https://github.com/ClickHouse/ClickHouse/pull/3441) +* Removed unnecessary quoting of values for the partition key in the `system.parts` table if it consists of a single column. [#3652](https://github.com/ClickHouse/ClickHouse/pull/3652) +* The modulo function works for `Date` and `DateTime` data types. [#3385](https://github.com/ClickHouse/ClickHouse/pull/3385) +* Added synonyms for the `POWER`, `LN`, `LCASE`, `UCASE`, `REPLACE`, `LOCATE`, `SUBSTR`, and `MID` functions. 
[#3774](https://github.com/ClickHouse/ClickHouse/pull/3774) [#3763](https://github.com/ClickHouse/ClickHouse/pull/3763) Some function names are case-insensitive for compatibility with the SQL standard. Added syntactic sugar `SUBSTRING(expr FROM start FOR length)` for compatibility with SQL. [#3804](https://github.com/ClickHouse/ClickHouse/pull/3804) +* Added the ability to `mlock` memory pages corresponding to `clickhouse-server` executable code to prevent it from being forced out of memory. This feature is disabled by default. [#3553](https://github.com/ClickHouse/ClickHouse/pull/3553) +* Improved performance when reading from `O_DIRECT` (with the `min_bytes_to_use_direct_io` option enabled). [#3405](https://github.com/ClickHouse/ClickHouse/pull/3405) +* Improved performance of the `dictGet...OrDefault` function for a constant key argument and a non-constant default argument. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3563) +* The `firstSignificantSubdomain` function now processes the domains `gov`, `mil`, and `edu`. [Igor Hatarist](https://github.com/ClickHouse/ClickHouse/pull/3601) Improved performance. [#3628](https://github.com/ClickHouse/ClickHouse/pull/3628) * Ability to specify custom environment variables for starting `clickhouse-server` using the `SYS-V init.d` script by defining `CLICKHOUSE_PROGRAM_ENV` in `/etc/default/clickhouse`. -[Pavlo Bashynskyi](https://github.com/yandex/ClickHouse/pull/3612) -* Correct return code for the clickhouse-server init script. [#3516](https://github.com/yandex/ClickHouse/pull/3516) -* The `system.metrics` table now has the `VersionInteger` metric, and `system.build_options` has the added line `VERSION_INTEGER`, which contains the numeric form of the ClickHouse version, such as `18016000`. [#3644](https://github.com/yandex/ClickHouse/pull/3644) -* Removed the ability to compare the `Date` type with a number to avoid potential errors like `date = 2018-12-17`, where quotes around the date are omitted by mistake. [#3687](https://github.com/yandex/ClickHouse/pull/3687) -* Fixed the behavior of stateful functions like `rowNumberInAllBlocks`. They previously output a result that was one number larger due to starting during query analysis. [Amos Bird](https://github.com/yandex/ClickHouse/pull/3729) -* If the `force_restore_data` file can't be deleted, an error message is displayed. [Amos Bird](https://github.com/yandex/ClickHouse/pull/3794) +[Pavlo Bashynskyi](https://github.com/ClickHouse/ClickHouse/pull/3612) +* Correct return code for the clickhouse-server init script. [#3516](https://github.com/ClickHouse/ClickHouse/pull/3516) +* The `system.metrics` table now has the `VersionInteger` metric, and `system.build_options` has the added line `VERSION_INTEGER`, which contains the numeric form of the ClickHouse version, such as `18016000`. [#3644](https://github.com/ClickHouse/ClickHouse/pull/3644) +* Removed the ability to compare the `Date` type with a number to avoid potential errors like `date = 2018-12-17`, where quotes around the date are omitted by mistake. [#3687](https://github.com/ClickHouse/ClickHouse/pull/3687) +* Fixed the behavior of stateful functions like `rowNumberInAllBlocks`. They previously output a result that was one number larger due to starting during query analysis. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3729) +* If the `force_restore_data` file can't be deleted, an error message is displayed. 
[Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3794) ### Build improvements: -* Updated the `jemalloc` library, which fixes a potential memory leak. [Amos Bird](https://github.com/yandex/ClickHouse/pull/3557) -* Profiling with `jemalloc` is enabled by default in order to debug builds. [2cc82f5c](https://github.com/yandex/ClickHouse/commit/2cc82f5cbe266421cd4c1165286c2c47e5ffcb15) -* Added the ability to run integration tests when only `Docker` is installed on the system. [#3650](https://github.com/yandex/ClickHouse/pull/3650) -* Added the fuzz expression test in SELECT queries. [#3442](https://github.com/yandex/ClickHouse/pull/3442) -* Added a stress test for commits, which performs functional tests in parallel and in random order to detect more race conditions. [#3438](https://github.com/yandex/ClickHouse/pull/3438) -* Improved the method for starting clickhouse-server in a Docker image. [Elghazal Ahmed](https://github.com/yandex/ClickHouse/pull/3663) -* For a Docker image, added support for initializing databases using files in the `/docker-entrypoint-initdb.d` directory. [Konstantin Lebedev](https://github.com/yandex/ClickHouse/pull/3695) -* Fixes for builds on ARM. [#3709](https://github.com/yandex/ClickHouse/pull/3709) +* Updated the `jemalloc` library, which fixes a potential memory leak. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3557) +* Profiling with `jemalloc` is enabled by default in debug builds. [2cc82f5c](https://github.com/ClickHouse/ClickHouse/commit/2cc82f5cbe266421cd4c1165286c2c47e5ffcb15) +* Added the ability to run integration tests when only `Docker` is installed on the system. [#3650](https://github.com/ClickHouse/ClickHouse/pull/3650) +* Added the fuzz expression test in SELECT queries. [#3442](https://github.com/ClickHouse/ClickHouse/pull/3442) +* Added a stress test for commits, which performs functional tests in parallel and in random order to detect more race conditions. [#3438](https://github.com/ClickHouse/ClickHouse/pull/3438) +* Improved the method for starting clickhouse-server in a Docker image. [Elghazal Ahmed](https://github.com/ClickHouse/ClickHouse/pull/3663) +* For a Docker image, added support for initializing databases using files in the `/docker-entrypoint-initdb.d` directory. [Konstantin Lebedev](https://github.com/ClickHouse/ClickHouse/pull/3695) +* Fixes for builds on ARM. [#3709](https://github.com/ClickHouse/ClickHouse/pull/3709) ### Backward incompatible changes: -* Removed the ability to compare the `Date` type with a number. Instead of `toDate('2018-12-18') = 17883`, you must use explicit type conversion `= toDate(17883)` [#3687](https://github.com/yandex/ClickHouse/pull/3687) +* Removed the ability to compare the `Date` type with a number. Instead of `toDate('2018-12-18') = 17883`, you must use an explicit type conversion: `= toDate(17883)`. [#3687](https://github.com/ClickHouse/ClickHouse/pull/3687) ## ClickHouse release 18.14.19, 2018-12-19 ### Bug fixes: -* Fixed an error that led to problems with updating dictionaries with the ODBC source. [#3825](https://github.com/yandex/ClickHouse/issues/3825), [#3829](https://github.com/yandex/ClickHouse/issues/3829) -* Databases are correctly specified when executing DDL `ON CLUSTER` queries. [#3460](https://github.com/yandex/ClickHouse/pull/3460) -* Fixed a segfault if the `max_temporary_non_const_columns` limit was exceeded. [#3788](https://github.com/yandex/ClickHouse/pull/3788) +* Fixed an error that led to problems with updating dictionaries with the ODBC source.
[#3825](https://github.com/ClickHouse/ClickHouse/issues/3825), [#3829](https://github.com/ClickHouse/ClickHouse/issues/3829) +* Databases are correctly specified when executing DDL `ON CLUSTER` queries. [#3460](https://github.com/ClickHouse/ClickHouse/pull/3460) +* Fixed a segfault if the `max_temporary_non_const_columns` limit was exceeded. [#3788](https://github.com/ClickHouse/ClickHouse/pull/3788) ### Build improvements: @@ -1551,296 +1551,296 @@ This release contains exactly the same set of patches as 19.3.6. ## ClickHouse release 18.14.18, 2018-12-04 ### Bug fixes: -* Fixed error in `dictGet...` function for dictionaries of type `range`, if one of the arguments is constant and other is not. [#3751](https://github.com/yandex/ClickHouse/pull/3751) -* Fixed error that caused messages `netlink: '...': attribute type 1 has an invalid length` to be printed in Linux kernel log, that was happening only on fresh enough versions of Linux kernel. [#3749](https://github.com/yandex/ClickHouse/pull/3749) -* Fixed segfault in function `empty` for argument of `FixedString` type. [Daniel, Dao Quang Minh](https://github.com/yandex/ClickHouse/pull/3703) -* Fixed excessive memory allocation when using large value of `max_query_size` setting (a memory chunk of `max_query_size` bytes was preallocated at once). [#3720](https://github.com/yandex/ClickHouse/pull/3720) +* Fixed an error in the `dictGet...` function for dictionaries of type `range` when one of the arguments is constant and the other is not. [#3751](https://github.com/ClickHouse/ClickHouse/pull/3751) +* Fixed an error that caused the messages `netlink: '...': attribute type 1 has an invalid length` to be printed in the Linux kernel log; this happened only on sufficiently recent Linux kernel versions. [#3749](https://github.com/ClickHouse/ClickHouse/pull/3749) +* Fixed a segfault in the `empty` function for arguments of the `FixedString` type. [Daniel, Dao Quang Minh](https://github.com/ClickHouse/ClickHouse/pull/3703) +* Fixed excessive memory allocation when using a large value of the `max_query_size` setting (a memory chunk of `max_query_size` bytes was preallocated at once). [#3720](https://github.com/ClickHouse/ClickHouse/pull/3720) ### Build changes: -* Fixed build with LLVM/Clang libraries of version 7 from the OS packages (these libraries are used for runtime query compilation). [#3582](https://github.com/yandex/ClickHouse/pull/3582) +* Fixed the build with LLVM/Clang libraries of version 7 from the OS packages (these libraries are used for runtime query compilation). [#3582](https://github.com/ClickHouse/ClickHouse/pull/3582) ## ClickHouse release 18.14.17, 2018-11-30 ### Bug fixes: -* Fixed cases when the ODBC bridge process did not terminate with the main server process. [#3642](https://github.com/yandex/ClickHouse/pull/3642) -* Fixed synchronous insertion into the `Distributed` table with a columns list that differs from the column list of the remote table. [#3673](https://github.com/yandex/ClickHouse/pull/3673) -* Fixed a rare race condition that can lead to a crash when dropping a MergeTree table. [#3643](https://github.com/yandex/ClickHouse/pull/3643) -* Fixed a query deadlock in case when query thread creation fails with the `Resource temporarily unavailable` error. [#3643](https://github.com/yandex/ClickHouse/pull/3643) -* Fixed parsing of the `ENGINE` clause when the `CREATE AS table` syntax was used and the `ENGINE` clause was specified before the `AS table` (the error resulted in ignoring the specified engine).
[#3692](https://github.com/yandex/ClickHouse/pull/3692) +* Fixed cases when the ODBC bridge process did not terminate with the main server process. [#3642](https://github.com/ClickHouse/ClickHouse/pull/3642) +* Fixed synchronous insertion into the `Distributed` table with a column list that differs from the column list of the remote table. [#3673](https://github.com/ClickHouse/ClickHouse/pull/3673) +* Fixed a rare race condition that can lead to a crash when dropping a MergeTree table. [#3643](https://github.com/ClickHouse/ClickHouse/pull/3643) +* Fixed a query deadlock when query thread creation fails with the `Resource temporarily unavailable` error. [#3643](https://github.com/ClickHouse/ClickHouse/pull/3643) +* Fixed parsing of the `ENGINE` clause when the `CREATE AS table` syntax was used and the `ENGINE` clause was specified before the `AS table` (the error resulted in ignoring the specified engine). [#3692](https://github.com/ClickHouse/ClickHouse/pull/3692) ## ClickHouse release 18.14.15, 2018-11-21 ### Bug fixes: -* The size of memory chunk was overestimated while deserializing the column of type `Array(String)` that leads to "Memory limit exceeded" errors. The issue appeared in version 18.12.13. [#3589](https://github.com/yandex/ClickHouse/issues/3589) +* The size of a memory chunk was overestimated while deserializing a column of type `Array(String)`, which led to "Memory limit exceeded" errors. The issue appeared in version 18.12.13. [#3589](https://github.com/ClickHouse/ClickHouse/issues/3589) ## ClickHouse release 18.14.14, 2018-11-20 ### Bug fixes: -* Fixed `ON CLUSTER` queries when cluster configured as secure (flag `<secure>`). [#3599](https://github.com/yandex/ClickHouse/pull/3599) +* Fixed `ON CLUSTER` queries when the cluster is configured as secure (flag `<secure>`). [#3599](https://github.com/ClickHouse/ClickHouse/pull/3599) ### Build changes: -* Fixed problems (llvm-7 from system, macos) [#3582](https://github.com/yandex/ClickHouse/pull/3582) +* Fixed build problems (llvm-7 from the system, macOS). [#3582](https://github.com/ClickHouse/ClickHouse/pull/3582) ## ClickHouse release 18.14.13, 2018-11-08 ### Bug fixes: -* Fixed the `Block structure mismatch in MergingSorted stream` error. [#3162](https://github.com/yandex/ClickHouse/issues/3162) -* Fixed `ON CLUSTER` queries in case when secure connections were turned on in the cluster config (the `<secure>` flag). [#3465](https://github.com/yandex/ClickHouse/pull/3465) -* Fixed an error in queries that used `SAMPLE`, `PREWHERE` and alias columns. [#3543](https://github.com/yandex/ClickHouse/pull/3543) -* Fixed a rare `unknown compression method` error when the `min_bytes_to_use_direct_io` setting was enabled. [3544](https://github.com/yandex/ClickHouse/pull/3544) +* Fixed the `Block structure mismatch in MergingSorted stream` error. [#3162](https://github.com/ClickHouse/ClickHouse/issues/3162) +* Fixed `ON CLUSTER` queries when secure connections were turned on in the cluster config (the `<secure>` flag). [#3465](https://github.com/ClickHouse/ClickHouse/pull/3465) +* Fixed an error in queries that used `SAMPLE`, `PREWHERE` and alias columns. [#3543](https://github.com/ClickHouse/ClickHouse/pull/3543) +* Fixed a rare `unknown compression method` error when the `min_bytes_to_use_direct_io` setting was enabled. [#3544](https://github.com/ClickHouse/ClickHouse/pull/3544) ### Performance improvements: -* Fixed performance regression of queries with `GROUP BY` of columns of UInt16 or Date type when executing on AMD EPYC processors.
[Igor Lapko](https://github.com/yandex/ClickHouse/pull/3512) -* Fixed performance regression of queries that process long strings. [#3530](https://github.com/yandex/ClickHouse/pull/3530) +* Fixed performance regression of queries with `GROUP BY` of columns of UInt16 or Date type when executing on AMD EPYC processors. [Igor Lapko](https://github.com/ClickHouse/ClickHouse/pull/3512) +* Fixed performance regression of queries that process long strings. [#3530](https://github.com/ClickHouse/ClickHouse/pull/3530) ### Build improvements: -* Improvements for simplifying the Arcadia build. [#3475](https://github.com/yandex/ClickHouse/pull/3475), [#3535](https://github.com/yandex/ClickHouse/pull/3535) +* Improvements for simplifying the Arcadia build. [#3475](https://github.com/ClickHouse/ClickHouse/pull/3475), [#3535](https://github.com/ClickHouse/ClickHouse/pull/3535) ## ClickHouse release 18.14.12, 2018-11-02 ### Bug fixes: -* Fixed a crash on joining two unnamed subqueries. [#3505](https://github.com/yandex/ClickHouse/pull/3505) -* Fixed generating incorrect queries (with an empty `WHERE` clause) when querying external databases. [hotid](https://github.com/yandex/ClickHouse/pull/3477) -* Fixed using an incorrect timeout value in ODBC dictionaries. [Marek Vavruša](https://github.com/yandex/ClickHouse/pull/3511) +* Fixed a crash on joining two unnamed subqueries. [#3505](https://github.com/ClickHouse/ClickHouse/pull/3505) +* Fixed generating incorrect queries (with an empty `WHERE` clause) when querying external databases. [hotid](https://github.com/ClickHouse/ClickHouse/pull/3477) +* Fixed using an incorrect timeout value in ODBC dictionaries. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3511) ## ClickHouse release 18.14.11, 2018-10-29 ### Bug fixes: -* Fixed the error `Block structure mismatch in UNION stream: different number of columns` in LIMIT queries. [#2156](https://github.com/yandex/ClickHouse/issues/2156) -* Fixed errors when merging data in tables containing arrays inside Nested structures. [#3397](https://github.com/yandex/ClickHouse/pull/3397) -* Fixed incorrect query results if the `merge_tree_uniform_read_distribution` setting is disabled (it is enabled by default). [#3429](https://github.com/yandex/ClickHouse/pull/3429) -* Fixed an error on inserts to a Distributed table in Native format. [#3411](https://github.com/yandex/ClickHouse/issues/3411) +* Fixed the error `Block structure mismatch in UNION stream: different number of columns` in LIMIT queries. [#2156](https://github.com/ClickHouse/ClickHouse/issues/2156) +* Fixed errors when merging data in tables containing arrays inside Nested structures. [#3397](https://github.com/ClickHouse/ClickHouse/pull/3397) +* Fixed incorrect query results if the `merge_tree_uniform_read_distribution` setting is disabled (it is enabled by default). [#3429](https://github.com/ClickHouse/ClickHouse/pull/3429) +* Fixed an error on inserts to a Distributed table in Native format. [#3411](https://github.com/ClickHouse/ClickHouse/issues/3411) ## ClickHouse release 18.14.10, 2018-10-23 -* The `compile_expressions` setting (JIT compilation of expressions) is disabled by default. [#3410](https://github.com/yandex/ClickHouse/pull/3410) +* The `compile_expressions` setting (JIT compilation of expressions) is disabled by default. [#3410](https://github.com/ClickHouse/ClickHouse/pull/3410) * The `enable_optimize_predicate_expression` setting is disabled by default. 
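Both settings whose defaults changed above are ordinary session settings, so the new defaults can be inspected and overridden per session. A minimal sketch, assuming only the standard `SET` statement and the `system.settings` table (shown purely as an illustration of working with these defaults, not as part of the change itself):

```sql
-- Inspect the current values of the two settings whose defaults changed.
SELECT name, value, changed
FROM system.settings
WHERE name IN ('compile_expressions', 'enable_optimize_predicate_expression');

-- Opt back in for the current session if the previous behavior is needed.
SET compile_expressions = 1;
SET enable_optimize_predicate_expression = 1;
```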
## ClickHouse release 18.14.9, 2018-10-16 ### New features: -* The `WITH CUBE` modifier for `GROUP BY` (the alternative syntax `GROUP BY CUBE(...)` is also available). [#3172](https://github.com/yandex/ClickHouse/pull/3172) -* Added the `formatDateTime` function. [Alexandr Krasheninnikov](https://github.com/yandex/ClickHouse/pull/2770) -* Added the `JDBC` table engine and `jdbc` table function (requires installing clickhouse-jdbc-bridge). [Alexandr Krasheninnikov](https://github.com/yandex/ClickHouse/pull/3210) -* Added functions for working with the ISO week number: `toISOWeek`, `toISOYear`, `toStartOfISOYear`, and `toDayOfYear`. [#3146](https://github.com/yandex/ClickHouse/pull/3146) -* Now you can use `Nullable` columns for `MySQL` and `ODBC` tables. [#3362](https://github.com/yandex/ClickHouse/pull/3362) -* Nested data structures can be read as nested objects in `JSONEachRow` format. Added the `input_format_import_nested_json` setting. [Veloman Yunkan](https://github.com/yandex/ClickHouse/pull/3144) -* Parallel processing is available for many `MATERIALIZED VIEW`s when inserting data. See the `parallel_view_processing` setting. [Marek Vavruša](https://github.com/yandex/ClickHouse/pull/3208) -* Added the `SYSTEM FLUSH LOGS` query (forced log flushes to system tables such as `query_log`) [#3321](https://github.com/yandex/ClickHouse/pull/3321) -* Now you can use pre-defined `database` and `table` macros when declaring `Replicated` tables. [#3251](https://github.com/yandex/ClickHouse/pull/3251) -* Added the ability to read `Decimal` type values in engineering notation (indicating powers of ten). [#3153](https://github.com/yandex/ClickHouse/pull/3153) +* The `WITH CUBE` modifier for `GROUP BY` (the alternative syntax `GROUP BY CUBE(...)` is also available). [#3172](https://github.com/ClickHouse/ClickHouse/pull/3172) +* Added the `formatDateTime` function. [Alexandr Krasheninnikov](https://github.com/ClickHouse/ClickHouse/pull/2770) +* Added the `JDBC` table engine and `jdbc` table function (requires installing clickhouse-jdbc-bridge). [Alexandr Krasheninnikov](https://github.com/ClickHouse/ClickHouse/pull/3210) +* Added functions for working with the ISO week number: `toISOWeek`, `toISOYear`, `toStartOfISOYear`, and `toDayOfYear`. [#3146](https://github.com/ClickHouse/ClickHouse/pull/3146) +* Now you can use `Nullable` columns for `MySQL` and `ODBC` tables. [#3362](https://github.com/ClickHouse/ClickHouse/pull/3362) +* Nested data structures can be read as nested objects in `JSONEachRow` format. Added the `input_format_import_nested_json` setting. [Veloman Yunkan](https://github.com/ClickHouse/ClickHouse/pull/3144) +* Parallel processing is available for many `MATERIALIZED VIEW`s when inserting data. See the `parallel_view_processing` setting. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3208) +* Added the `SYSTEM FLUSH LOGS` query (forced log flushes to system tables such as `query_log`) [#3321](https://github.com/ClickHouse/ClickHouse/pull/3321) +* Now you can use pre-defined `database` and `table` macros when declaring `Replicated` tables. [#3251](https://github.com/ClickHouse/ClickHouse/pull/3251) +* Added the ability to read `Decimal` type values in engineering notation (indicating powers of ten). 
[#3153](https://github.com/ClickHouse/ClickHouse/pull/3153) ### Experimental features: -* Optimization of the GROUP BY clause for `LowCardinality data types.` [#3138](https://github.com/yandex/ClickHouse/pull/3138) -* Optimized calculation of expressions for `LowCardinality data types.` [#3200](https://github.com/yandex/ClickHouse/pull/3200) +* Optimization of the GROUP BY clause for `LowCardinality data types.` [#3138](https://github.com/ClickHouse/ClickHouse/pull/3138) +* Optimized calculation of expressions for `LowCardinality data types.` [#3200](https://github.com/ClickHouse/ClickHouse/pull/3200) ### Improvements: -* Significantly reduced memory consumption for queries with `ORDER BY` and `LIMIT`. See the `max_bytes_before_remerge_sort` setting. [#3205](https://github.com/yandex/ClickHouse/pull/3205) -* In the absence of `JOIN` (`LEFT`, `INNER`, ...), `INNER JOIN` is assumed. [#3147](https://github.com/yandex/ClickHouse/pull/3147) -* Qualified asterisks work correctly in queries with `JOIN`. [Winter Zhang](https://github.com/yandex/ClickHouse/pull/3202) -* The `ODBC` table engine correctly chooses the method for quoting identifiers in the SQL dialect of a remote database. [Alexandr Krasheninnikov](https://github.com/yandex/ClickHouse/pull/3210) +* Significantly reduced memory consumption for queries with `ORDER BY` and `LIMIT`. See the `max_bytes_before_remerge_sort` setting. [#3205](https://github.com/ClickHouse/ClickHouse/pull/3205) +* In the absence of `JOIN` (`LEFT`, `INNER`, ...), `INNER JOIN` is assumed. [#3147](https://github.com/ClickHouse/ClickHouse/pull/3147) +* Qualified asterisks work correctly in queries with `JOIN`. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3202) +* The `ODBC` table engine correctly chooses the method for quoting identifiers in the SQL dialect of a remote database. [Alexandr Krasheninnikov](https://github.com/ClickHouse/ClickHouse/pull/3210) * The `compile_expressions` setting (JIT compilation of expressions) is enabled by default. -* Fixed behavior for simultaneous DROP DATABASE/TABLE IF EXISTS and CREATE DATABASE/TABLE IF NOT EXISTS. Previously, a `CREATE DATABASE ... IF NOT EXISTS` query could return the error message "File ... already exists", and the `CREATE TABLE ... IF NOT EXISTS` and `DROP TABLE IF EXISTS` queries could return `Table ... is creating or attaching right now`. [#3101](https://github.com/yandex/ClickHouse/pull/3101) -* LIKE and IN expressions with a constant right half are passed to the remote server when querying from MySQL or ODBC tables. [#3182](https://github.com/yandex/ClickHouse/pull/3182) -* Comparisons with constant expressions in a WHERE clause are passed to the remote server when querying from MySQL and ODBC tables. Previously, only comparisons with constants were passed. [#3182](https://github.com/yandex/ClickHouse/pull/3182) -* Correct calculation of row width in the terminal for `Pretty` formats, including strings with hieroglyphs. [Amos Bird](https://github.com/yandex/ClickHouse/pull/3257). +* Fixed behavior for simultaneous DROP DATABASE/TABLE IF EXISTS and CREATE DATABASE/TABLE IF NOT EXISTS. Previously, a `CREATE DATABASE ... IF NOT EXISTS` query could return the error message "File ... already exists", and the `CREATE TABLE ... IF NOT EXISTS` and `DROP TABLE IF EXISTS` queries could return `Table ... is creating or attaching right now`. 
[#3101](https://github.com/ClickHouse/ClickHouse/pull/3101) +* LIKE and IN expressions with a constant right half are passed to the remote server when querying from MySQL or ODBC tables. [#3182](https://github.com/ClickHouse/ClickHouse/pull/3182) +* Comparisons with constant expressions in a WHERE clause are passed to the remote server when querying from MySQL and ODBC tables. Previously, only comparisons with constants were passed. [#3182](https://github.com/ClickHouse/ClickHouse/pull/3182) +* Correct calculation of row width in the terminal for `Pretty` formats, including strings with hieroglyphs. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/3257). * `ON CLUSTER` can be specified for `ALTER UPDATE` queries. -* Improved performance for reading data in `JSONEachRow` format. [#3332](https://github.com/yandex/ClickHouse/pull/3332) -* Added synonyms for the `LENGTH` and `CHARACTER_LENGTH` functions for compatibility. The `CONCAT` function is no longer case-sensitive. [#3306](https://github.com/yandex/ClickHouse/pull/3306) -* Added the `TIMESTAMP` synonym for the `DateTime` type. [#3390](https://github.com/yandex/ClickHouse/pull/3390) +* Improved performance for reading data in `JSONEachRow` format. [#3332](https://github.com/ClickHouse/ClickHouse/pull/3332) +* Added synonyms for the `LENGTH` and `CHARACTER_LENGTH` functions for compatibility. The `CONCAT` function is no longer case-sensitive. [#3306](https://github.com/ClickHouse/ClickHouse/pull/3306) +* Added the `TIMESTAMP` synonym for the `DateTime` type. [#3390](https://github.com/ClickHouse/ClickHouse/pull/3390) * There is always space reserved for query_id in the server logs, even if the log line is not related to a query. This makes it easier to parse server text logs with third-party tools. -* Memory consumption by a query is logged when it exceeds the next level of an integer number of gigabytes. [#3205](https://github.com/yandex/ClickHouse/pull/3205) -* Added compatibility mode for the case when the client library that uses the Native protocol sends fewer columns by mistake than the server expects for the INSERT query. This scenario was possible when using the clickhouse-cpp library. Previously, this scenario caused the server to crash. [#3171](https://github.com/yandex/ClickHouse/pull/3171) -* In a user-defined WHERE expression in `clickhouse-copier`, you can now use a `partition_key` alias (for additional filtering by source table partition). This is useful if the partitioning scheme changes during copying, but only changes slightly. [#3166](https://github.com/yandex/ClickHouse/pull/3166) -* The workflow of the `Kafka` engine has been moved to a background thread pool in order to automatically reduce the speed of data reading at high loads. [Marek Vavruša](https://github.com/yandex/ClickHouse/pull/3215). -* Support for reading `Tuple` and `Nested` values of structures like `struct` in the `Cap'n'Proto format`. [Marek Vavruša](https://github.com/yandex/ClickHouse/pull/3216) -* The list of top-level domains for the `firstSignificantSubdomain` function now includes the domain `biz`. [decaseal](https://github.com/yandex/ClickHouse/pull/3219) -* In the configuration of external dictionaries, `null_value` is interpreted as the value of the default data type. [#3330](https://github.com/yandex/ClickHouse/pull/3330) -* Support for the `intDiv` and `intDivOrZero` functions for `Decimal`. 
[b48402e8](https://github.com/yandex/ClickHouse/commit/b48402e8712e2b9b151e0eef8193811d433a1264) -* Support for the `Date`, `DateTime`, `UUID`, and `Decimal` types as a key for the `sumMap` aggregate function. [#3281](https://github.com/yandex/ClickHouse/pull/3281) -* Support for the `Decimal` data type in external dictionaries. [#3324](https://github.com/yandex/ClickHouse/pull/3324) -* Support for the `Decimal` data type in `SummingMergeTree` tables. [#3348](https://github.com/yandex/ClickHouse/pull/3348) -* Added specializations for `UUID` in `if`. [#3366](https://github.com/yandex/ClickHouse/pull/3366) -* Reduced the number of `open` and `close` system calls when reading from a `MergeTree table`. [#3283](https://github.com/yandex/ClickHouse/pull/3283) -* A `TRUNCATE TABLE` query can be executed on any replica (the query is passed to the leader replica). [Kirill Shvakov](https://github.com/yandex/ClickHouse/pull/3375) +* Memory consumption by a query is logged when it exceeds the next level of an integer number of gigabytes. [#3205](https://github.com/ClickHouse/ClickHouse/pull/3205) +* Added compatibility mode for the case when the client library that uses the Native protocol sends fewer columns by mistake than the server expects for the INSERT query. This scenario was possible when using the clickhouse-cpp library. Previously, this scenario caused the server to crash. [#3171](https://github.com/ClickHouse/ClickHouse/pull/3171) +* In a user-defined WHERE expression in `clickhouse-copier`, you can now use a `partition_key` alias (for additional filtering by source table partition). This is useful if the partitioning scheme changes during copying, but only changes slightly. [#3166](https://github.com/ClickHouse/ClickHouse/pull/3166) +* The workflow of the `Kafka` engine has been moved to a background thread pool in order to automatically reduce the speed of data reading at high loads. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3215). +* Support for reading `Tuple` and `Nested` values of structures like `struct` in the `Cap'n'Proto format`. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3216) +* The list of top-level domains for the `firstSignificantSubdomain` function now includes the domain `biz`. [decaseal](https://github.com/ClickHouse/ClickHouse/pull/3219) +* In the configuration of external dictionaries, `null_value` is interpreted as the value of the default data type. [#3330](https://github.com/ClickHouse/ClickHouse/pull/3330) +* Support for the `intDiv` and `intDivOrZero` functions for `Decimal`. [b48402e8](https://github.com/ClickHouse/ClickHouse/commit/b48402e8712e2b9b151e0eef8193811d433a1264) +* Support for the `Date`, `DateTime`, `UUID`, and `Decimal` types as a key for the `sumMap` aggregate function. [#3281](https://github.com/ClickHouse/ClickHouse/pull/3281) +* Support for the `Decimal` data type in external dictionaries. [#3324](https://github.com/ClickHouse/ClickHouse/pull/3324) +* Support for the `Decimal` data type in `SummingMergeTree` tables. [#3348](https://github.com/ClickHouse/ClickHouse/pull/3348) +* Added specializations for `UUID` in `if`. [#3366](https://github.com/ClickHouse/ClickHouse/pull/3366) +* Reduced the number of `open` and `close` system calls when reading from a `MergeTree table`. [#3283](https://github.com/ClickHouse/ClickHouse/pull/3283) +* A `TRUNCATE TABLE` query can be executed on any replica (the query is passed to the leader replica). 
[Kirill Shvakov](https://github.com/ClickHouse/ClickHouse/pull/3375) ### Bug fixes: -* Fixed an issue with `Dictionary` tables for `range_hashed` dictionaries. This error occurred in version 18.12.17. [#1702](https://github.com/yandex/ClickHouse/pull/1702) -* Fixed an error when loading `range_hashed` dictionaries (the message `Unsupported type Nullable (...)`). This error occurred in version 18.12.17. [#3362](https://github.com/yandex/ClickHouse/pull/3362) -* Fixed errors in the `pointInPolygon` function due to the accumulation of inaccurate calculations for polygons with a large number of vertices located close to each other. [#3331](https://github.com/yandex/ClickHouse/pull/3331) [#3341](https://github.com/yandex/ClickHouse/pull/3341) -* If after merging data parts, the checksum for the resulting part differs from the result of the same merge in another replica, the result of the merge is deleted and the data part is downloaded from the other replica (this is the correct behavior). But after downloading the data part, it couldn't be added to the working set because of an error that the part already exists (because the data part was deleted with some delay after the merge). This led to cyclical attempts to download the same data. [#3194](https://github.com/yandex/ClickHouse/pull/3194) -* Fixed incorrect calculation of total memory consumption by queries (because of incorrect calculation, the `max_memory_usage_for_all_queries` setting worked incorrectly and the `MemoryTracking` metric had an incorrect value). This error occurred in version 18.12.13. [Marek Vavruša](https://github.com/yandex/ClickHouse/pull/3344) -* Fixed the functionality of `CREATE TABLE ... ON CLUSTER ... AS SELECT ...` This error occurred in version 18.12.13. [#3247](https://github.com/yandex/ClickHouse/pull/3247) -* Fixed unnecessary preparation of data structures for `JOIN`s on the server that initiates the query if the `JOIN` is only performed on remote servers. [#3340](https://github.com/yandex/ClickHouse/pull/3340) -* Fixed bugs in the `Kafka` engine: deadlocks after exceptions when starting to read data, and locks upon completion [Marek Vavruša](https://github.com/yandex/ClickHouse/pull/3215). -* For `Kafka` tables, the optional `schema` parameter was not passed (the schema of the `Cap'n'Proto` format). [Vojtech Splichal](https://github.com/yandex/ClickHouse/pull/3150) -* If the ensemble of ZooKeeper servers has servers that accept the connection but then immediately close it instead of responding to the handshake, ClickHouse chooses to connect another server. Previously, this produced the error `Cannot read all data. Bytes read: 0. Bytes expected: 4.` and the server couldn't start. [8218cf3a](https://github.com/yandex/ClickHouse/commit/8218cf3a5f39a43401953769d6d12a0bb8d29da9) -* If the ensemble of ZooKeeper servers contains servers for which the DNS query returns an error, these servers are ignored. [17b8e209](https://github.com/yandex/ClickHouse/commit/17b8e209221061325ad7ba0539f03c6e65f87f29) -* Fixed type conversion between `Date` and `DateTime` when inserting data in the `VALUES` format (if `input_format_values_interpret_expressions = 1`). Previously, the conversion was performed between the numerical value of the number of days in Unix Epoch time and the Unix timestamp, which led to unexpected results. [#3229](https://github.com/yandex/ClickHouse/pull/3229) -* Corrected type conversion between `Decimal` and integer numbers. 
[#3211](https://github.com/yandex/ClickHouse/pull/3211) -* Fixed errors in the `enable_optimize_predicate_expression` setting. [Winter Zhang](https://github.com/yandex/ClickHouse/pull/3231) -* Fixed a parsing error in CSV format with floating-point numbers if a non-default CSV separator is used, such as `;` [#3155](https://github.com/yandex/ClickHouse/pull/3155) -* Fixed the `arrayCumSumNonNegative` function (it does not accumulate negative values if the accumulator is less than zero). [Aleksey Studnev](https://github.com/yandex/ClickHouse/pull/3163) -* Fixed how `Merge` tables work on top of `Distributed` tables when using `PREWHERE`. [#3165](https://github.com/yandex/ClickHouse/pull/3165) +* Fixed an issue with `Dictionary` tables for `range_hashed` dictionaries. This error occurred in version 18.12.17. [#1702](https://github.com/ClickHouse/ClickHouse/pull/1702) +* Fixed an error when loading `range_hashed` dictionaries (the message `Unsupported type Nullable (...)`). This error occurred in version 18.12.17. [#3362](https://github.com/ClickHouse/ClickHouse/pull/3362) +* Fixed errors in the `pointInPolygon` function due to the accumulation of inaccurate calculations for polygons with a large number of vertices located close to each other. [#3331](https://github.com/ClickHouse/ClickHouse/pull/3331) [#3341](https://github.com/ClickHouse/ClickHouse/pull/3341) +* If after merging data parts, the checksum for the resulting part differs from the result of the same merge in another replica, the result of the merge is deleted and the data part is downloaded from the other replica (this is the correct behavior). But after downloading the data part, it couldn't be added to the working set because of an error that the part already exists (because the data part was deleted with some delay after the merge). This led to cyclical attempts to download the same data. [#3194](https://github.com/ClickHouse/ClickHouse/pull/3194) +* Fixed incorrect calculation of total memory consumption by queries (because of incorrect calculation, the `max_memory_usage_for_all_queries` setting worked incorrectly and the `MemoryTracking` metric had an incorrect value). This error occurred in version 18.12.13. [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3344) +* Fixed the functionality of `CREATE TABLE ... ON CLUSTER ... AS SELECT ...` This error occurred in version 18.12.13. [#3247](https://github.com/ClickHouse/ClickHouse/pull/3247) +* Fixed unnecessary preparation of data structures for `JOIN`s on the server that initiates the query if the `JOIN` is only performed on remote servers. [#3340](https://github.com/ClickHouse/ClickHouse/pull/3340) +* Fixed bugs in the `Kafka` engine: deadlocks after exceptions when starting to read data, and locks upon completion [Marek Vavruša](https://github.com/ClickHouse/ClickHouse/pull/3215). +* For `Kafka` tables, the optional `schema` parameter was not passed (the schema of the `Cap'n'Proto` format). [Vojtech Splichal](https://github.com/ClickHouse/ClickHouse/pull/3150) +* If the ensemble of ZooKeeper servers has servers that accept the connection but then immediately close it instead of responding to the handshake, ClickHouse chooses to connect another server. Previously, this produced the error `Cannot read all data. Bytes read: 0. Bytes expected: 4.` and the server couldn't start. 
[8218cf3a](https://github.com/ClickHouse/ClickHouse/commit/8218cf3a5f39a43401953769d6d12a0bb8d29da9) +* If the ensemble of ZooKeeper servers contains servers for which the DNS query returns an error, these servers are ignored. [17b8e209](https://github.com/ClickHouse/ClickHouse/commit/17b8e209221061325ad7ba0539f03c6e65f87f29) +* Fixed type conversion between `Date` and `DateTime` when inserting data in the `VALUES` format (if `input_format_values_interpret_expressions = 1`). Previously, the conversion was performed between the numerical value of the number of days in Unix Epoch time and the Unix timestamp, which led to unexpected results. [#3229](https://github.com/ClickHouse/ClickHouse/pull/3229) +* Corrected type conversion between `Decimal` and integer numbers. [#3211](https://github.com/ClickHouse/ClickHouse/pull/3211) +* Fixed errors in the `enable_optimize_predicate_expression` setting. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3231) +* Fixed a parsing error in CSV format with floating-point numbers if a non-default CSV separator is used, such as `;` [#3155](https://github.com/ClickHouse/ClickHouse/pull/3155) +* Fixed the `arrayCumSumNonNegative` function (it does not accumulate negative values if the accumulator is less than zero). [Aleksey Studnev](https://github.com/ClickHouse/ClickHouse/pull/3163) +* Fixed how `Merge` tables work on top of `Distributed` tables when using `PREWHERE`. [#3165](https://github.com/ClickHouse/ClickHouse/pull/3165) * Bug fixes in the `ALTER UPDATE` query. -* Fixed bugs in the `odbc` table function that appeared in version 18.12. [#3197](https://github.com/yandex/ClickHouse/pull/3197) -* Fixed the operation of aggregate functions with `StateArray` combinators. [#3188](https://github.com/yandex/ClickHouse/pull/3188) -* Fixed a crash when dividing a `Decimal` value by zero. [69dd6609](https://github.com/yandex/ClickHouse/commit/69dd6609193beb4e7acd3e6ad216eca0ccfb8179) -* Fixed output of types for operations using `Decimal` and integer arguments. [#3224](https://github.com/yandex/ClickHouse/pull/3224) -* Fixed the segfault during `GROUP BY` on `Decimal128`. [3359ba06](https://github.com/yandex/ClickHouse/commit/3359ba06c39fcd05bfdb87d6c64154819621e13a) -* The `log_query_threads` setting (logging information about each thread of query execution) now takes effect only if the `log_queries` option (logging information about queries) is set to 1. Since the `log_query_threads` option is enabled by default, information about threads was previously logged even if query logging was disabled. [#3241](https://github.com/yandex/ClickHouse/pull/3241) -* Fixed an error in the distributed operation of the quantiles aggregate function (the error message `Not found column quantile...`). [292a8855](https://github.com/yandex/ClickHouse/commit/292a885533b8e3b41ce8993867069d14cbd5a664) -* Fixed the compatibility problem when working on a cluster of version 18.12.17 servers and older servers at the same time. For distributed queries with GROUP BY keys of both fixed and non-fixed length, if there was a large amount of data to aggregate, the returned data was not always fully aggregated (two different rows contained the same aggregation keys). [#3254](https://github.com/yandex/ClickHouse/pull/3254) -* Fixed handling of substitutions in `clickhouse-performance-test`, if the query contains only part of the substitutions declared in the test. [#3263](https://github.com/yandex/ClickHouse/pull/3263) -* Fixed an error when using `FINAL` with `PREWHERE`. 
[#3298](https://github.com/yandex/ClickHouse/pull/3298) -* Fixed an error when using `PREWHERE` over columns that were added during `ALTER`. [#3298](https://github.com/yandex/ClickHouse/pull/3298) -* Added a check for the absence of `arrayJoin` for `DEFAULT` and `MATERIALIZED` expressions. Previously, `arrayJoin` led to an error when inserting data. [#3337](https://github.com/yandex/ClickHouse/pull/3337) -* Added a check for the absence of `arrayJoin` in a `PREWHERE` clause. Previously, this led to messages like `Size ... doesn't match` or `Unknown compression method` when executing queries. [#3357](https://github.com/yandex/ClickHouse/pull/3357) -* Fixed segfault that could occur in rare cases after optimization that replaced AND chains from equality evaluations with the corresponding IN expression. [liuyimin-bytedance](https://github.com/yandex/ClickHouse/pull/3339) -* Minor corrections to `clickhouse-benchmark`: previously, client information was not sent to the server; now the number of queries executed is calculated more accurately when shutting down and for limiting the number of iterations. [#3351](https://github.com/yandex/ClickHouse/pull/3351) [#3352](https://github.com/yandex/ClickHouse/pull/3352) +* Fixed bugs in the `odbc` table function that appeared in version 18.12. [#3197](https://github.com/ClickHouse/ClickHouse/pull/3197) +* Fixed the operation of aggregate functions with `StateArray` combinators. [#3188](https://github.com/ClickHouse/ClickHouse/pull/3188) +* Fixed a crash when dividing a `Decimal` value by zero. [69dd6609](https://github.com/ClickHouse/ClickHouse/commit/69dd6609193beb4e7acd3e6ad216eca0ccfb8179) +* Fixed output of types for operations using `Decimal` and integer arguments. [#3224](https://github.com/ClickHouse/ClickHouse/pull/3224) +* Fixed the segfault during `GROUP BY` on `Decimal128`. [3359ba06](https://github.com/ClickHouse/ClickHouse/commit/3359ba06c39fcd05bfdb87d6c64154819621e13a) +* The `log_query_threads` setting (logging information about each thread of query execution) now takes effect only if the `log_queries` option (logging information about queries) is set to 1. Since the `log_query_threads` option is enabled by default, information about threads was previously logged even if query logging was disabled. [#3241](https://github.com/ClickHouse/ClickHouse/pull/3241) +* Fixed an error in the distributed operation of the quantiles aggregate function (the error message `Not found column quantile...`). [292a8855](https://github.com/ClickHouse/ClickHouse/commit/292a885533b8e3b41ce8993867069d14cbd5a664) +* Fixed the compatibility problem when working on a cluster of version 18.12.17 servers and older servers at the same time. For distributed queries with GROUP BY keys of both fixed and non-fixed length, if there was a large amount of data to aggregate, the returned data was not always fully aggregated (two different rows contained the same aggregation keys). [#3254](https://github.com/ClickHouse/ClickHouse/pull/3254) +* Fixed handling of substitutions in `clickhouse-performance-test`, if the query contains only part of the substitutions declared in the test. [#3263](https://github.com/ClickHouse/ClickHouse/pull/3263) +* Fixed an error when using `FINAL` with `PREWHERE`. [#3298](https://github.com/ClickHouse/ClickHouse/pull/3298) +* Fixed an error when using `PREWHERE` over columns that were added during `ALTER`. 
[#3298](https://github.com/ClickHouse/ClickHouse/pull/3298) +* Added a check for the absence of `arrayJoin` for `DEFAULT` and `MATERIALIZED` expressions. Previously, `arrayJoin` led to an error when inserting data. [#3337](https://github.com/ClickHouse/ClickHouse/pull/3337) +* Added a check for the absence of `arrayJoin` in a `PREWHERE` clause. Previously, this led to messages like `Size ... doesn't match` or `Unknown compression method` when executing queries. [#3357](https://github.com/ClickHouse/ClickHouse/pull/3357) +* Fixed segfault that could occur in rare cases after optimization that replaced AND chains from equality evaluations with the corresponding IN expression. [liuyimin-bytedance](https://github.com/ClickHouse/ClickHouse/pull/3339) +* Minor corrections to `clickhouse-benchmark`: previously, client information was not sent to the server; now the number of queries executed is calculated more accurately when shutting down and for limiting the number of iterations. [#3351](https://github.com/ClickHouse/ClickHouse/pull/3351) [#3352](https://github.com/ClickHouse/ClickHouse/pull/3352) ### Backward incompatible changes: -* Removed the `allow_experimental_decimal_type` option. The `Decimal` data type is available for default use. [#3329](https://github.com/yandex/ClickHouse/pull/3329) +* Removed the `allow_experimental_decimal_type` option. The `Decimal` data type is available for default use. [#3329](https://github.com/ClickHouse/ClickHouse/pull/3329) ## ClickHouse release 18.12.17, 2018-09-16 ### New features: -* `invalidate_query` (the ability to specify a query to check whether an external dictionary needs to be updated) is implemented for the `clickhouse` source. [#3126](https://github.com/yandex/ClickHouse/pull/3126) -* Added the ability to use `UInt*`, `Int*`, and `DateTime` data types (along with the `Date` type) as a `range_hashed` external dictionary key that defines the boundaries of ranges. Now `NULL` can be used to designate an open range. [Vasily Nemkov](https://github.com/yandex/ClickHouse/pull/3123) -* The `Decimal` type now supports `var*` and `stddev*` aggregate functions. [#3129](https://github.com/yandex/ClickHouse/pull/3129) -* The `Decimal` type now supports mathematical functions (`exp`, `sin` and so on.) [#3129](https://github.com/yandex/ClickHouse/pull/3129) -* The `system.part_log` table now has the `partition_id` column. [#3089](https://github.com/yandex/ClickHouse/pull/3089) +* `invalidate_query` (the ability to specify a query to check whether an external dictionary needs to be updated) is implemented for the `clickhouse` source. [#3126](https://github.com/ClickHouse/ClickHouse/pull/3126) +* Added the ability to use `UInt*`, `Int*`, and `DateTime` data types (along with the `Date` type) as a `range_hashed` external dictionary key that defines the boundaries of ranges. Now `NULL` can be used to designate an open range. [Vasily Nemkov](https://github.com/ClickHouse/ClickHouse/pull/3123) +* The `Decimal` type now supports `var*` and `stddev*` aggregate functions. [#3129](https://github.com/ClickHouse/ClickHouse/pull/3129) +* The `Decimal` type now supports mathematical functions (`exp`, `sin` and so on.) [#3129](https://github.com/ClickHouse/ClickHouse/pull/3129) +* The `system.part_log` table now has the `partition_id` column. [#3089](https://github.com/ClickHouse/ClickHouse/pull/3089) ### Bug fixes: -* `Merge` now works correctly on `Distributed` tables. 
[Winter Zhang](https://github.com/yandex/ClickHouse/pull/3159) -* Fixed incompatibility (unnecessary dependency on the `glibc` version) that made it impossible to run ClickHouse on `Ubuntu Precise` and older versions. The incompatibility arose in version 18.12.13. [#3130](https://github.com/yandex/ClickHouse/pull/3130) -* Fixed errors in the `enable_optimize_predicate_expression` setting. [Winter Zhang](https://github.com/yandex/ClickHouse/pull/3107) -* Fixed a minor issue with backwards compatibility that appeared when working with a cluster of replicas on versions earlier than 18.12.13 and simultaneously creating a new replica of a table on a server with a newer version (shown in the message `Can not clone replica, because the ... updated to new ClickHouse version`, which is logical, but shouldn't happen). [#3122](https://github.com/yandex/ClickHouse/pull/3122) +* `Merge` now works correctly on `Distributed` tables. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3159) +* Fixed incompatibility (unnecessary dependency on the `glibc` version) that made it impossible to run ClickHouse on `Ubuntu Precise` and older versions. The incompatibility arose in version 18.12.13. [#3130](https://github.com/ClickHouse/ClickHouse/pull/3130) +* Fixed errors in the `enable_optimize_predicate_expression` setting. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3107) +* Fixed a minor issue with backwards compatibility that appeared when working with a cluster of replicas on versions earlier than 18.12.13 and simultaneously creating a new replica of a table on a server with a newer version (shown in the message `Can not clone replica, because the ... updated to new ClickHouse version`, which is logical, but shouldn't happen). [#3122](https://github.com/ClickHouse/ClickHouse/pull/3122) ### Backward incompatible changes: -* The `enable_optimize_predicate_expression` option is enabled by default (which is rather optimistic). If query analysis errors occur that are related to searching for the column names, set `enable_optimize_predicate_expression` to 0. [Winter Zhang](https://github.com/yandex/ClickHouse/pull/3107) +* The `enable_optimize_predicate_expression` option is enabled by default (which is rather optimistic). If query analysis errors occur that are related to searching for the column names, set `enable_optimize_predicate_expression` to 0. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3107) ## ClickHouse release 18.12.14, 2018-09-13 ### New features: -* Added support for `ALTER UPDATE` queries. [#3035](https://github.com/yandex/ClickHouse/pull/3035) -* Added the `allow_ddl` option, which restricts the user's access to DDL queries. [#3104](https://github.com/yandex/ClickHouse/pull/3104) -* Added the `min_merge_bytes_to_use_direct_io` option for `MergeTree` engines, which allows you to set a threshold for the total size of the merge (when above the threshold, data part files will be handled using O_DIRECT). [#3117](https://github.com/yandex/ClickHouse/pull/3117) -* The `system.merges` system table now contains the `partition_id` column. [#3099](https://github.com/yandex/ClickHouse/pull/3099) +* Added support for `ALTER UPDATE` queries. [#3035](https://github.com/ClickHouse/ClickHouse/pull/3035) +* Added the `allow_ddl` option, which restricts the user's access to DDL queries. 
[#3104](https://github.com/ClickHouse/ClickHouse/pull/3104) +* Added the `min_merge_bytes_to_use_direct_io` option for `MergeTree` engines, which allows you to set a threshold for the total size of the merge (when above the threshold, data part files will be handled using O_DIRECT). [#3117](https://github.com/ClickHouse/ClickHouse/pull/3117) +* The `system.merges` system table now contains the `partition_id` column. [#3099](https://github.com/ClickHouse/ClickHouse/pull/3099) ### Improvements -* If a data part remains unchanged during mutation, it isn't downloaded by replicas. [#3103](https://github.com/yandex/ClickHouse/pull/3103) -* Autocomplete is available for names of settings when working with `clickhouse-client`. [#3106](https://github.com/yandex/ClickHouse/pull/3106) +* If a data part remains unchanged during mutation, it isn't downloaded by replicas. [#3103](https://github.com/ClickHouse/ClickHouse/pull/3103) +* Autocomplete is available for names of settings when working with `clickhouse-client`. [#3106](https://github.com/ClickHouse/ClickHouse/pull/3106) ### Bug fixes: -* Added a check for the sizes of arrays that are elements of `Nested` type fields when inserting. [#3118](https://github.com/yandex/ClickHouse/pull/3118) +* Added a check for the sizes of arrays that are elements of `Nested` type fields when inserting. [#3118](https://github.com/ClickHouse/ClickHouse/pull/3118) * Fixed an error updating external dictionaries with the `ODBC` source and `hashed` storage. This error occurred in version 18.12.13. -* Fixed a crash when creating a temporary table from a query with an `IN` condition. [Winter Zhang](https://github.com/yandex/ClickHouse/pull/3098) -* Fixed an error in aggregate functions for arrays that can have `NULL` elements. [Winter Zhang](https://github.com/yandex/ClickHouse/pull/3097) +* Fixed a crash when creating a temporary table from a query with an `IN` condition. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3098) +* Fixed an error in aggregate functions for arrays that can have `NULL` elements. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/3097) ## ClickHouse release 18.12.13, 2018-09-10 ### New features: -* Added the `DECIMAL(digits, scale)` data type (`Decimal32(scale)`, `Decimal64(scale)`, `Decimal128(scale)`). To enable it, use the setting `allow_experimental_decimal_type`. [#2846](https://github.com/yandex/ClickHouse/pull/2846) [#2970](https://github.com/yandex/ClickHouse/pull/2970) [#3008](https://github.com/yandex/ClickHouse/pull/3008) [#3047](https://github.com/yandex/ClickHouse/pull/3047) -* New `WITH ROLLUP` modifier for `GROUP BY` (alternative syntax: `GROUP BY ROLLUP(...)`). [#2948](https://github.com/yandex/ClickHouse/pull/2948) -* In queries with JOIN, the star character expands to a list of columns in all tables, in compliance with the SQL standard. You can restore the old behavior by setting `asterisk_left_columns_only` to 1 on the user configuration level. [Winter Zhang](https://github.com/yandex/ClickHouse/pull/2787) -* Added support for JOIN with table functions. [Winter Zhang](https://github.com/yandex/ClickHouse/pull/2907) -* Autocomplete by pressing Tab in clickhouse-client. [Sergey Shcherbin](https://github.com/yandex/ClickHouse/pull/2447) -* Ctrl+C in clickhouse-client clears a query that was entered. [#2877](https://github.com/yandex/ClickHouse/pull/2877) -* Added the `join_default_strictness` setting (values: `"`, `'any'`, `'all'`). This allows you to not specify `ANY` or `ALL` for `JOIN`. 
[#2982](https://github.com/yandex/ClickHouse/pull/2982) -* Each line of the server log related to query processing shows the query ID. [#2482](https://github.com/yandex/ClickHouse/pull/2482) -* Now you can get query execution logs in clickhouse-client (use the `send_logs_level` setting). With distributed query processing, logs are cascaded from all the servers. [#2482](https://github.com/yandex/ClickHouse/pull/2482) -* The `system.query_log` and `system.processes` (`SHOW PROCESSLIST`) tables now have information about all changed settings when you run a query (the nested structure of the `Settings` data). Added the `log_query_settings` setting. [#2482](https://github.com/yandex/ClickHouse/pull/2482) -* The `system.query_log` and `system.processes` tables now show information about the number of threads that are participating in query execution (see the `thread_numbers` column). [#2482](https://github.com/yandex/ClickHouse/pull/2482) -* Added `ProfileEvents` counters that measure the time spent on reading and writing over the network and reading and writing to disk, the number of network errors, and the time spent waiting when network bandwidth is limited. [#2482](https://github.com/yandex/ClickHouse/pull/2482) -* Added `ProfileEvents`counters that contain the system metrics from rusage (you can use them to get information about CPU usage in userspace and the kernel, page faults, and context switches), as well as taskstats metrics (use these to obtain information about I/O wait time, CPU wait time, and the amount of data read and recorded, both with and without page cache). [#2482](https://github.com/yandex/ClickHouse/pull/2482) -* The `ProfileEvents` counters are applied globally and for each query, as well as for each query execution thread, which allows you to profile resource consumption by query in detail. [#2482](https://github.com/yandex/ClickHouse/pull/2482) -* Added the `system.query_thread_log` table, which contains information about each query execution thread. Added the `log_query_threads` setting. [#2482](https://github.com/yandex/ClickHouse/pull/2482) -* The `system.metrics` and `system.events` tables now have built-in documentation. [#3016](https://github.com/yandex/ClickHouse/pull/3016) -* Added the `arrayEnumerateDense` function. [Amos Bird](https://github.com/yandex/ClickHouse/pull/2975) -* Added the `arrayCumSumNonNegative` and `arrayDifference` functions. [Aleksey Studnev](https://github.com/yandex/ClickHouse/pull/2942) -* Added the `retention` aggregate function. [Sundy Li](https://github.com/yandex/ClickHouse/pull/2887) -* Now you can add (merge) states of aggregate functions by using the plus operator, and multiply the states of aggregate functions by a nonnegative constant. [#3062](https://github.com/yandex/ClickHouse/pull/3062) [#3034](https://github.com/yandex/ClickHouse/pull/3034) -* Tables in the MergeTree family now have the virtual column `_partition_id`. [#3089](https://github.com/yandex/ClickHouse/pull/3089) +* Added the `DECIMAL(digits, scale)` data type (`Decimal32(scale)`, `Decimal64(scale)`, `Decimal128(scale)`). To enable it, use the setting `allow_experimental_decimal_type`. [#2846](https://github.com/ClickHouse/ClickHouse/pull/2846) [#2970](https://github.com/ClickHouse/ClickHouse/pull/2970) [#3008](https://github.com/ClickHouse/ClickHouse/pull/3008) [#3047](https://github.com/ClickHouse/ClickHouse/pull/3047) +* New `WITH ROLLUP` modifier for `GROUP BY` (alternative syntax: `GROUP BY ROLLUP(...)`). 
[#2948](https://github.com/ClickHouse/ClickHouse/pull/2948) +* In queries with JOIN, the star character expands to a list of columns in all tables, in compliance with the SQL standard. You can restore the old behavior by setting `asterisk_left_columns_only` to 1 on the user configuration level. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2787) +* Added support for JOIN with table functions. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2907) +* Autocomplete by pressing Tab in clickhouse-client. [Sergey Shcherbin](https://github.com/ClickHouse/ClickHouse/pull/2447) +* Ctrl+C in clickhouse-client clears a query that was entered. [#2877](https://github.com/ClickHouse/ClickHouse/pull/2877) +* Added the `join_default_strictness` setting (values: `"`, `'any'`, `'all'`). This allows you to not specify `ANY` or `ALL` for `JOIN`. [#2982](https://github.com/ClickHouse/ClickHouse/pull/2982) +* Each line of the server log related to query processing shows the query ID. [#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) +* Now you can get query execution logs in clickhouse-client (use the `send_logs_level` setting). With distributed query processing, logs are cascaded from all the servers. [#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) +* The `system.query_log` and `system.processes` (`SHOW PROCESSLIST`) tables now have information about all changed settings when you run a query (the nested structure of the `Settings` data). Added the `log_query_settings` setting. [#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) +* The `system.query_log` and `system.processes` tables now show information about the number of threads that are participating in query execution (see the `thread_numbers` column). [#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) +* Added `ProfileEvents` counters that measure the time spent on reading and writing over the network and reading and writing to disk, the number of network errors, and the time spent waiting when network bandwidth is limited. [#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) +* Added `ProfileEvents`counters that contain the system metrics from rusage (you can use them to get information about CPU usage in userspace and the kernel, page faults, and context switches), as well as taskstats metrics (use these to obtain information about I/O wait time, CPU wait time, and the amount of data read and recorded, both with and without page cache). [#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) +* The `ProfileEvents` counters are applied globally and for each query, as well as for each query execution thread, which allows you to profile resource consumption by query in detail. [#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) +* Added the `system.query_thread_log` table, which contains information about each query execution thread. Added the `log_query_threads` setting. [#2482](https://github.com/ClickHouse/ClickHouse/pull/2482) +* The `system.metrics` and `system.events` tables now have built-in documentation. [#3016](https://github.com/ClickHouse/ClickHouse/pull/3016) +* Added the `arrayEnumerateDense` function. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2975) +* Added the `arrayCumSumNonNegative` and `arrayDifference` functions. [Aleksey Studnev](https://github.com/ClickHouse/ClickHouse/pull/2942) +* Added the `retention` aggregate function. 
[Sundy Li](https://github.com/ClickHouse/ClickHouse/pull/2887) +* Now you can add (merge) states of aggregate functions by using the plus operator, and multiply the states of aggregate functions by a nonnegative constant. [#3062](https://github.com/ClickHouse/ClickHouse/pull/3062) [#3034](https://github.com/ClickHouse/ClickHouse/pull/3034) +* Tables in the MergeTree family now have the virtual column `_partition_id`. [#3089](https://github.com/ClickHouse/ClickHouse/pull/3089) ### Experimental features: -* Added the `LowCardinality(T)` data type. This data type automatically creates a local dictionary of values and allows data processing without unpacking the dictionary. [#2830](https://github.com/yandex/ClickHouse/pull/2830) -* Added a cache of JIT-compiled functions and a counter for the number of uses before compiling. To JIT compile expressions, enable the `compile_expressions` setting. [#2990](https://github.com/yandex/ClickHouse/pull/2990) [#3077](https://github.com/yandex/ClickHouse/pull/3077) +* Added the `LowCardinality(T)` data type. This data type automatically creates a local dictionary of values and allows data processing without unpacking the dictionary. [#2830](https://github.com/ClickHouse/ClickHouse/pull/2830) +* Added a cache of JIT-compiled functions and a counter for the number of uses before compiling. To JIT compile expressions, enable the `compile_expressions` setting. [#2990](https://github.com/ClickHouse/ClickHouse/pull/2990) [#3077](https://github.com/ClickHouse/ClickHouse/pull/3077) ### Improvements: * Fixed the problem with unlimited accumulation of the replication log when there are abandoned replicas. Added an effective recovery mode for replicas with a long lag. * Improved performance of `GROUP BY` with multiple aggregation fields when one of them is string and the others are fixed length. * Improved performance when using `PREWHERE` and with implicit transfer of expressions in `PREWHERE`. -* Improved parsing performance for text formats (`CSV`, `TSV`). [Amos Bird](https://github.com/yandex/ClickHouse/pull/2977) [#2980](https://github.com/yandex/ClickHouse/pull/2980) -* Improved performance of reading strings and arrays in binary formats. [Amos Bird](https://github.com/yandex/ClickHouse/pull/2955) -* Increased performance and reduced memory consumption for queries to `system.tables` and `system.columns` when there is a very large number of tables on a single server. [#2953](https://github.com/yandex/ClickHouse/pull/2953) -* Fixed a performance problem in the case of a large stream of queries that result in an error (the ` _dl_addr` function is visible in `perf top`, but the server isn't using much CPU). [#2938](https://github.com/yandex/ClickHouse/pull/2938) -* Conditions are cast into the View (when `enable_optimize_predicate_expression` is enabled). [Winter Zhang](https://github.com/yandex/ClickHouse/pull/2907) -* Improvements to the functionality for the `UUID` data type. [#3074](https://github.com/yandex/ClickHouse/pull/3074) [#2985](https://github.com/yandex/ClickHouse/pull/2985) -* The `UUID` data type is supported in The-Alchemist dictionaries. [#2822](https://github.com/yandex/ClickHouse/pull/2822) -* The `visitParamExtractRaw` function works correctly with nested structures. [Winter Zhang](https://github.com/yandex/ClickHouse/pull/2974) -* When the `input_format_skip_unknown_fields` setting is enabled, object fields in `JSONEachRow` format are skipped correctly. 
[BlahGeek](https://github.com/yandex/ClickHouse/pull/2958) -* For a `CASE` expression with conditions, you can now omit `ELSE`, which is equivalent to `ELSE NULL`. [#2920](https://github.com/yandex/ClickHouse/pull/2920) -* The operation timeout can now be configured when working with ZooKeeper. [urykhy](https://github.com/yandex/ClickHouse/pull/2971) -* You can specify an offset for `LIMIT n, m` as `LIMIT n OFFSET m`. [#2840](https://github.com/yandex/ClickHouse/pull/2840) -* You can use the `SELECT TOP n` syntax as an alternative for `LIMIT`. [#2840](https://github.com/yandex/ClickHouse/pull/2840) +* Improved parsing performance for text formats (`CSV`, `TSV`). [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2977) [#2980](https://github.com/ClickHouse/ClickHouse/pull/2980) +* Improved performance of reading strings and arrays in binary formats. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2955) +* Increased performance and reduced memory consumption for queries to `system.tables` and `system.columns` when there is a very large number of tables on a single server. [#2953](https://github.com/ClickHouse/ClickHouse/pull/2953) +* Fixed a performance problem in the case of a large stream of queries that result in an error (the ` _dl_addr` function is visible in `perf top`, but the server isn't using much CPU). [#2938](https://github.com/ClickHouse/ClickHouse/pull/2938) +* Conditions are cast into the View (when `enable_optimize_predicate_expression` is enabled). [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2907) +* Improvements to the functionality for the `UUID` data type. [#3074](https://github.com/ClickHouse/ClickHouse/pull/3074) [#2985](https://github.com/ClickHouse/ClickHouse/pull/2985) +* The `UUID` data type is supported in The-Alchemist dictionaries. [#2822](https://github.com/ClickHouse/ClickHouse/pull/2822) +* The `visitParamExtractRaw` function works correctly with nested structures. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2974) +* When the `input_format_skip_unknown_fields` setting is enabled, object fields in `JSONEachRow` format are skipped correctly. [BlahGeek](https://github.com/ClickHouse/ClickHouse/pull/2958) +* For a `CASE` expression with conditions, you can now omit `ELSE`, which is equivalent to `ELSE NULL`. [#2920](https://github.com/ClickHouse/ClickHouse/pull/2920) +* The operation timeout can now be configured when working with ZooKeeper. [urykhy](https://github.com/ClickHouse/ClickHouse/pull/2971) +* You can specify an offset for `LIMIT n, m` as `LIMIT n OFFSET m`. [#2840](https://github.com/ClickHouse/ClickHouse/pull/2840) +* You can use the `SELECT TOP n` syntax as an alternative for `LIMIT`. [#2840](https://github.com/ClickHouse/ClickHouse/pull/2840) * Increased the size of the queue to write to system tables, so the `SystemLog parameter queue is full` error doesn't happen as often. -* The `windowFunnel` aggregate function now supports events that meet multiple conditions. [Amos Bird](https://github.com/yandex/ClickHouse/pull/2801) -* Duplicate columns can be used in a `USING` clause for `JOIN`. [#3006](https://github.com/yandex/ClickHouse/pull/3006) -* `Pretty` formats now have a limit on column alignment by width. Use the `output_format_pretty_max_column_pad_width` setting. If a value is wider, it will still be displayed in its entirety, but the other cells in the table will not be too wide. 
[#3003](https://github.com/yandex/ClickHouse/pull/3003) -* The `odbc` table function now allows you to specify the database/schema name. [Amos Bird](https://github.com/yandex/ClickHouse/pull/2885) -* Added the ability to use a username specified in the `clickhouse-client` config file. [Vladimir Kozbin](https://github.com/yandex/ClickHouse/pull/2909) +* The `windowFunnel` aggregate function now supports events that meet multiple conditions. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2801) +* Duplicate columns can be used in a `USING` clause for `JOIN`. [#3006](https://github.com/ClickHouse/ClickHouse/pull/3006) +* `Pretty` formats now have a limit on column alignment by width. Use the `output_format_pretty_max_column_pad_width` setting. If a value is wider, it will still be displayed in its entirety, but the other cells in the table will not be too wide. [#3003](https://github.com/ClickHouse/ClickHouse/pull/3003) +* The `odbc` table function now allows you to specify the database/schema name. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2885) +* Added the ability to use a username specified in the `clickhouse-client` config file. [Vladimir Kozbin](https://github.com/ClickHouse/ClickHouse/pull/2909) * The `ZooKeeperExceptions` counter has been split into three counters: `ZooKeeperUserExceptions`, `ZooKeeperHardwareExceptions`, and `ZooKeeperOtherExceptions`. * `ALTER DELETE` queries work for materialized views. * Added randomization when running the cleanup thread periodically for `ReplicatedMergeTree` tables in order to avoid periodic load spikes when there are a very large number of `ReplicatedMergeTree` tables. -* Support for `ATTACH TABLE ... ON CLUSTER` queries. [#3025](https://github.com/yandex/ClickHouse/pull/3025) +* Support for `ATTACH TABLE ... ON CLUSTER` queries. [#3025](https://github.com/ClickHouse/ClickHouse/pull/3025) ### Bug fixes: -* Fixed an issue with `Dictionary` tables (throws the `Size of offsets doesn't match size of column` or `Unknown compression method` exception). This bug appeared in version 18.10.3. [#2913](https://github.com/yandex/ClickHouse/issues/2913) -* Fixed a bug when merging `CollapsingMergeTree` tables if one of the data parts is empty (these parts are formed during merge or `ALTER DELETE` if all data was deleted), and the `vertical` algorithm was used for the merge. [#3049](https://github.com/yandex/ClickHouse/pull/3049) -* Fixed a race condition during `DROP` or `TRUNCATE` for `Memory` tables with a simultaneous `SELECT`, which could lead to server crashes. This bug appeared in version 1.1.54388. [#3038](https://github.com/yandex/ClickHouse/pull/3038) -* Fixed the possibility of data loss when inserting in `Replicated` tables if the `Session is expired` error is returned (data loss can be detected by the `ReplicatedDataLoss` metric). This error occurred in version 1.1.54378. [#2939](https://github.com/yandex/ClickHouse/pull/2939) [#2949](https://github.com/yandex/ClickHouse/pull/2949) [#2964](https://github.com/yandex/ClickHouse/pull/2964) -* Fixed a segfault during `JOIN ... ON`. [#3000](https://github.com/yandex/ClickHouse/pull/3000) -* Fixed the error searching column names when the `WHERE` expression consists entirely of a qualified column name, such as `WHERE table.column`. [#2994](https://github.com/yandex/ClickHouse/pull/2994) -* Fixed the "Not found column" error that occurred when executing distributed queries if a single column consisting of an IN expression with a subquery is requested from a remote server. 
[#3087](https://github.com/yandex/ClickHouse/pull/3087) -* Fixed the `Block structure mismatch in UNION stream: different number of columns` error that occurred for distributed queries if one of the shards is local and the other is not, and optimization of the move to `PREWHERE` is triggered. [#2226](https://github.com/yandex/ClickHouse/pull/2226) [#3037](https://github.com/yandex/ClickHouse/pull/3037) [#3055](https://github.com/yandex/ClickHouse/pull/3055) [#3065](https://github.com/yandex/ClickHouse/pull/3065) [#3073](https://github.com/yandex/ClickHouse/pull/3073) [#3090](https://github.com/yandex/ClickHouse/pull/3090) [#3093](https://github.com/yandex/ClickHouse/pull/3093) -* Fixed the `pointInPolygon` function for certain cases of non-convex polygons. [#2910](https://github.com/yandex/ClickHouse/pull/2910) -* Fixed the incorrect result when comparing `nan` with integers. [#3024](https://github.com/yandex/ClickHouse/pull/3024) -* Fixed an error in the `zlib-ng` library that could lead to segfault in rare cases. [#2854](https://github.com/yandex/ClickHouse/pull/2854) -* Fixed a memory leak when inserting into a table with `AggregateFunction` columns, if the state of the aggregate function is not simple (allocates memory separately), and if a single insertion request results in multiple small blocks. [#3084](https://github.com/yandex/ClickHouse/pull/3084) +* Fixed an issue with `Dictionary` tables (throws the `Size of offsets doesn't match size of column` or `Unknown compression method` exception). This bug appeared in version 18.10.3. [#2913](https://github.com/ClickHouse/ClickHouse/issues/2913) +* Fixed a bug when merging `CollapsingMergeTree` tables if one of the data parts is empty (these parts are formed during merge or `ALTER DELETE` if all data was deleted), and the `vertical` algorithm was used for the merge. [#3049](https://github.com/ClickHouse/ClickHouse/pull/3049) +* Fixed a race condition during `DROP` or `TRUNCATE` for `Memory` tables with a simultaneous `SELECT`, which could lead to server crashes. This bug appeared in version 1.1.54388. [#3038](https://github.com/ClickHouse/ClickHouse/pull/3038) +* Fixed the possibility of data loss when inserting in `Replicated` tables if the `Session is expired` error is returned (data loss can be detected by the `ReplicatedDataLoss` metric). This error occurred in version 1.1.54378. [#2939](https://github.com/ClickHouse/ClickHouse/pull/2939) [#2949](https://github.com/ClickHouse/ClickHouse/pull/2949) [#2964](https://github.com/ClickHouse/ClickHouse/pull/2964) +* Fixed a segfault during `JOIN ... ON`. [#3000](https://github.com/ClickHouse/ClickHouse/pull/3000) +* Fixed the error searching column names when the `WHERE` expression consists entirely of a qualified column name, such as `WHERE table.column`. [#2994](https://github.com/ClickHouse/ClickHouse/pull/2994) +* Fixed the "Not found column" error that occurred when executing distributed queries if a single column consisting of an IN expression with a subquery is requested from a remote server. [#3087](https://github.com/ClickHouse/ClickHouse/pull/3087) +* Fixed the `Block structure mismatch in UNION stream: different number of columns` error that occurred for distributed queries if one of the shards is local and the other is not, and optimization of the move to `PREWHERE` is triggered. 
[#2226](https://github.com/ClickHouse/ClickHouse/pull/2226) [#3037](https://github.com/ClickHouse/ClickHouse/pull/3037) [#3055](https://github.com/ClickHouse/ClickHouse/pull/3055) [#3065](https://github.com/ClickHouse/ClickHouse/pull/3065) [#3073](https://github.com/ClickHouse/ClickHouse/pull/3073) [#3090](https://github.com/ClickHouse/ClickHouse/pull/3090) [#3093](https://github.com/ClickHouse/ClickHouse/pull/3093) +* Fixed the `pointInPolygon` function for certain cases of non-convex polygons. [#2910](https://github.com/ClickHouse/ClickHouse/pull/2910) +* Fixed the incorrect result when comparing `nan` with integers. [#3024](https://github.com/ClickHouse/ClickHouse/pull/3024) +* Fixed an error in the `zlib-ng` library that could lead to segfault in rare cases. [#2854](https://github.com/ClickHouse/ClickHouse/pull/2854) +* Fixed a memory leak when inserting into a table with `AggregateFunction` columns, if the state of the aggregate function is not simple (allocates memory separately), and if a single insertion request results in multiple small blocks. [#3084](https://github.com/ClickHouse/ClickHouse/pull/3084) * Fixed a race condition when creating and deleting the same `Buffer` or `MergeTree` table simultaneously. -* Fixed the possibility of a segfault when comparing tuples made up of certain non-trivial types, such as tuples. [#2989](https://github.com/yandex/ClickHouse/pull/2989) -* Fixed the possibility of a segfault when running certain `ON CLUSTER` queries. [Winter Zhang](https://github.com/yandex/ClickHouse/pull/2960) -* Fixed an error in the `arrayDistinct` function for `Nullable` array elements. [#2845](https://github.com/yandex/ClickHouse/pull/2845) [#2937](https://github.com/yandex/ClickHouse/pull/2937) -* The `enable_optimize_predicate_expression` option now correctly supports cases with `SELECT *`. [Winter Zhang](https://github.com/yandex/ClickHouse/pull/2929) -* Fixed the segfault when re-initializing the ZooKeeper session. [#2917](https://github.com/yandex/ClickHouse/pull/2917) +* Fixed the possibility of a segfault when comparing tuples made up of certain non-trivial types, such as tuples. [#2989](https://github.com/ClickHouse/ClickHouse/pull/2989) +* Fixed the possibility of a segfault when running certain `ON CLUSTER` queries. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2960) +* Fixed an error in the `arrayDistinct` function for `Nullable` array elements. [#2845](https://github.com/ClickHouse/ClickHouse/pull/2845) [#2937](https://github.com/ClickHouse/ClickHouse/pull/2937) +* The `enable_optimize_predicate_expression` option now correctly supports cases with `SELECT *`. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2929) +* Fixed the segfault when re-initializing the ZooKeeper session. [#2917](https://github.com/ClickHouse/ClickHouse/pull/2917) * Fixed potential blocking when working with ZooKeeper. * Fixed incorrect code for adding nested data structures in a `SummingMergeTree`. -* When allocating memory for states of aggregate functions, alignment is correctly taken into account, which makes it possible to use operations that require alignment when implementing states of aggregate functions. [chenxing-xc](https://github.com/yandex/ClickHouse/pull/2808) +* When allocating memory for states of aggregate functions, alignment is correctly taken into account, which makes it possible to use operations that require alignment when implementing states of aggregate functions. 
[chenxing-xc](https://github.com/ClickHouse/ClickHouse/pull/2808) ### Security fix: -* Safe use of ODBC data sources. Interaction with ODBC drivers uses a separate `clickhouse-odbc-bridge` process. Errors in third-party ODBC drivers no longer cause problems with server stability or vulnerabilities. [#2828](https://github.com/yandex/ClickHouse/pull/2828) [#2879](https://github.com/yandex/ClickHouse/pull/2879) [#2886](https://github.com/yandex/ClickHouse/pull/2886) [#2893](https://github.com/yandex/ClickHouse/pull/2893) [#2921](https://github.com/yandex/ClickHouse/pull/2921) -* Fixed incorrect validation of the file path in the `catBoostPool` table function. [#2894](https://github.com/yandex/ClickHouse/pull/2894) -* The contents of system tables (`tables`, `databases`, `parts`, `columns`, `parts_columns`, `merges`, `mutations`, `replicas`, and `replication_queue`) are filtered according to the user's configured access to databases (`allow_databases`). [Winter Zhang](https://github.com/yandex/ClickHouse/pull/2856) +* Safe use of ODBC data sources. Interaction with ODBC drivers uses a separate `clickhouse-odbc-bridge` process. Errors in third-party ODBC drivers no longer cause problems with server stability or vulnerabilities. [#2828](https://github.com/ClickHouse/ClickHouse/pull/2828) [#2879](https://github.com/ClickHouse/ClickHouse/pull/2879) [#2886](https://github.com/ClickHouse/ClickHouse/pull/2886) [#2893](https://github.com/ClickHouse/ClickHouse/pull/2893) [#2921](https://github.com/ClickHouse/ClickHouse/pull/2921) +* Fixed incorrect validation of the file path in the `catBoostPool` table function. [#2894](https://github.com/ClickHouse/ClickHouse/pull/2894) +* The contents of system tables (`tables`, `databases`, `parts`, `columns`, `parts_columns`, `merges`, `mutations`, `replicas`, and `replication_queue`) are filtered according to the user's configured access to databases (`allow_databases`). [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2856) ### Backward incompatible changes: @@ -1850,46 +1850,46 @@ This release contains exactly the same set of patches as 19.3.6. * Most integration tests can now be run by commit. * Code style checks can also be run by commit. -* The `memcpy` implementation is chosen correctly when building on CentOS7/Fedora. [Etienne Champetier](https://github.com/yandex/ClickHouse/pull/2912) -* When using clang to build, some warnings from `-Weverything` have been added, in addition to the regular `-Wall-Wextra -Werror`. [#2957](https://github.com/yandex/ClickHouse/pull/2957) +* The `memcpy` implementation is chosen correctly when building on CentOS7/Fedora. [Etienne Champetier](https://github.com/ClickHouse/ClickHouse/pull/2912) +* When using clang to build, some warnings from `-Weverything` have been added, in addition to the regular `-Wall-Wextra -Werror`. [#2957](https://github.com/ClickHouse/ClickHouse/pull/2957) * Debugging the build uses the `jemalloc` debug option. -* The interface of the library for interacting with ZooKeeper is declared abstract. [#2950](https://github.com/yandex/ClickHouse/pull/2950) +* The interface of the library for interacting with ZooKeeper is declared abstract. [#2950](https://github.com/ClickHouse/ClickHouse/pull/2950) ## ClickHouse release 18.10.3, 2018-08-13 ### New features: -* HTTPS can be used for replication. [#2760](https://github.com/yandex/ClickHouse/pull/2760) -* Added the functions `murmurHash2_64`, `murmurHash3_32`, `murmurHash3_64`, and `murmurHash3_128` in addition to the existing `murmurHash2_32`. 
[#2791](https://github.com/yandex/ClickHouse/pull/2791) -* Support for Nullable types in the ClickHouse ODBC driver (`ODBCDriver2` output format). [#2834](https://github.com/yandex/ClickHouse/pull/2834) +* HTTPS can be used for replication. [#2760](https://github.com/ClickHouse/ClickHouse/pull/2760) +* Added the functions `murmurHash2_64`, `murmurHash3_32`, `murmurHash3_64`, and `murmurHash3_128` in addition to the existing `murmurHash2_32`. [#2791](https://github.com/ClickHouse/ClickHouse/pull/2791) +* Support for Nullable types in the ClickHouse ODBC driver (`ODBCDriver2` output format). [#2834](https://github.com/ClickHouse/ClickHouse/pull/2834) * Support for `UUID` in the key columns. ### Improvements: -* Clusters can be removed without restarting the server when they are deleted from the config files. [#2777](https://github.com/yandex/ClickHouse/pull/2777) -* External dictionaries can be removed without restarting the server when they are removed from config files. [#2779](https://github.com/yandex/ClickHouse/pull/2779) -* Added `SETTINGS` support for the `Kafka` table engine. [Alexander Marshalov](https://github.com/yandex/ClickHouse/pull/2781) -* Improvements for the `UUID` data type (not yet complete). [#2618](https://github.com/yandex/ClickHouse/pull/2618) -* Support for empty parts after merges in the `SummingMergeTree`, `CollapsingMergeTree` and `VersionedCollapsingMergeTree` engines. [#2815](https://github.com/yandex/ClickHouse/pull/2815) -* Old records of completed mutations are deleted (`ALTER DELETE`). [#2784](https://github.com/yandex/ClickHouse/pull/2784) -* Added the `system.merge_tree_settings` table. [Kirill Shvakov](https://github.com/yandex/ClickHouse/pull/2841) -* The `system.tables` table now has dependency columns: `dependencies_database` and `dependencies_table`. [Winter Zhang](https://github.com/yandex/ClickHouse/pull/2851) -* Added the `max_partition_size_to_drop` config option. [#2782](https://github.com/yandex/ClickHouse/pull/2782) -* Added the `output_format_json_escape_forward_slashes` option. [Alexander Bocharov](https://github.com/yandex/ClickHouse/pull/2812) -* Added the `max_fetch_partition_retries_count` setting. [#2831](https://github.com/yandex/ClickHouse/pull/2831) -* Added the `prefer_localhost_replica` setting for disabling the preference for a local replica and going to a local replica without inter-process interaction. [#2832](https://github.com/yandex/ClickHouse/pull/2832) -* The `quantileExact` aggregate function returns `nan` in the case of aggregation on an empty `Float32` or `Float64` set. [Sundy Li](https://github.com/yandex/ClickHouse/pull/2855) +* Clusters can be removed without restarting the server when they are deleted from the config files. [#2777](https://github.com/ClickHouse/ClickHouse/pull/2777) +* External dictionaries can be removed without restarting the server when they are removed from config files. [#2779](https://github.com/ClickHouse/ClickHouse/pull/2779) +* Added `SETTINGS` support for the `Kafka` table engine. [Alexander Marshalov](https://github.com/ClickHouse/ClickHouse/pull/2781) +* Improvements for the `UUID` data type (not yet complete). [#2618](https://github.com/ClickHouse/ClickHouse/pull/2618) +* Support for empty parts after merges in the `SummingMergeTree`, `CollapsingMergeTree` and `VersionedCollapsingMergeTree` engines. [#2815](https://github.com/ClickHouse/ClickHouse/pull/2815) +* Old records of completed mutations are deleted (`ALTER DELETE`). 
[#2784](https://github.com/ClickHouse/ClickHouse/pull/2784) +* Added the `system.merge_tree_settings` table. [Kirill Shvakov](https://github.com/ClickHouse/ClickHouse/pull/2841) +* The `system.tables` table now has dependency columns: `dependencies_database` and `dependencies_table`. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2851) +* Added the `max_partition_size_to_drop` config option. [#2782](https://github.com/ClickHouse/ClickHouse/pull/2782) +* Added the `output_format_json_escape_forward_slashes` option. [Alexander Bocharov](https://github.com/ClickHouse/ClickHouse/pull/2812) +* Added the `max_fetch_partition_retries_count` setting. [#2831](https://github.com/ClickHouse/ClickHouse/pull/2831) +* Added the `prefer_localhost_replica` setting for disabling the preference for a local replica and going to a local replica without inter-process interaction. [#2832](https://github.com/ClickHouse/ClickHouse/pull/2832) +* The `quantileExact` aggregate function returns `nan` in the case of aggregation on an empty `Float32` or `Float64` set. [Sundy Li](https://github.com/ClickHouse/ClickHouse/pull/2855) ### Bug fixes: * Removed unnecessary escaping of the connection string parameters for ODBC, which made it impossible to establish a connection. This error occurred in version 18.6.0. -* Fixed the logic for processing `REPLACE PARTITION` commands in the replication queue. If there are two `REPLACE` commands for the same partition, the incorrect logic could cause one of them to remain in the replication queue and not be executed. [#2814](https://github.com/yandex/ClickHouse/pull/2814) -* Fixed a merge bug when all data parts were empty (parts that were formed from a merge or from `ALTER DELETE` if all data was deleted). This bug appeared in version 18.1.0. [#2930](https://github.com/yandex/ClickHouse/pull/2930) -* Fixed an error for concurrent `Set` or `Join`. [Amos Bird](https://github.com/yandex/ClickHouse/pull/2823) -* Fixed the `Block structure mismatch in UNION stream: different number of columns` error that occurred for `UNION ALL` queries inside a sub-query if one of the `SELECT` queries contains duplicate column names. [Winter Zhang](https://github.com/yandex/ClickHouse/pull/2094) +* Fixed the logic for processing `REPLACE PARTITION` commands in the replication queue. If there are two `REPLACE` commands for the same partition, the incorrect logic could cause one of them to remain in the replication queue and not be executed. [#2814](https://github.com/ClickHouse/ClickHouse/pull/2814) +* Fixed a merge bug when all data parts were empty (parts that were formed from a merge or from `ALTER DELETE` if all data was deleted). This bug appeared in version 18.1.0. [#2930](https://github.com/ClickHouse/ClickHouse/pull/2930) +* Fixed an error for concurrent `Set` or `Join`. [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2823) +* Fixed the `Block structure mismatch in UNION stream: different number of columns` error that occurred for `UNION ALL` queries inside a sub-query if one of the `SELECT` queries contains duplicate column names. [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2094) * Fixed a memory leak if an exception occurred when connecting to a MySQL server. * Fixed incorrect clickhouse-client response code in case of a query error. -* Fixed incorrect behavior of materialized views containing DISTINCT. [#2795](https://github.com/yandex/ClickHouse/issues/2795) +* Fixed incorrect behavior of materialized views containing DISTINCT. 
[#2795](https://github.com/ClickHouse/ClickHouse/issues/2795) ### Backward incompatible changes @@ -1897,10 +1897,10 @@ This release contains exactly the same set of patches as 19.3.6. ### Build changes: -* The allocator has been replaced: `jemalloc` is now used instead of `tcmalloc`. In some scenarios, this increases speed up to 20%. However, there are queries that have slowed by up to 20%. Memory consumption has been reduced by approximately 10% in some scenarios, with improved stability. With highly competitive loads, CPU usage in userspace and in system shows just a slight increase. [#2773](https://github.com/yandex/ClickHouse/pull/2773) -* Use of libressl from a submodule. [#1983](https://github.com/yandex/ClickHouse/pull/1983) [#2807](https://github.com/yandex/ClickHouse/pull/2807) -* Use of unixodbc from a submodule. [#2789](https://github.com/yandex/ClickHouse/pull/2789) -* Use of mariadb-connector-c from a submodule. [#2785](https://github.com/yandex/ClickHouse/pull/2785) +* The allocator has been replaced: `jemalloc` is now used instead of `tcmalloc`. In some scenarios, this increases speed up to 20%. However, there are queries that have slowed by up to 20%. Memory consumption has been reduced by approximately 10% in some scenarios, with improved stability. With highly competitive loads, CPU usage in userspace and in system shows just a slight increase. [#2773](https://github.com/ClickHouse/ClickHouse/pull/2773) +* Use of libressl from a submodule. [#1983](https://github.com/ClickHouse/ClickHouse/pull/1983) [#2807](https://github.com/ClickHouse/ClickHouse/pull/2807) +* Use of unixodbc from a submodule. [#2789](https://github.com/ClickHouse/ClickHouse/pull/2789) +* Use of mariadb-connector-c from a submodule. [#2785](https://github.com/ClickHouse/ClickHouse/pull/2785) * Added functional test files to the repository that depend on the availability of test data (for the time being, without the test data itself). ## ClickHouse release 18.6.0, 2018-08-02 @@ -1909,49 +1909,49 @@ This release contains exactly the same set of patches as 19.3.6. * Added support for ON expressions for the JOIN ON syntax: `JOIN ON Expr([table.]column ...) = Expr([table.]column, ...) [AND Expr([table.]column, ...) = Expr([table.]column, ...) ...]` -The expression must be a chain of equalities joined by the AND operator. Each side of the equality can be an arbitrary expression over the columns of one of the tables. The use of fully qualified column names is supported (`table.name`, `database.table.name`, `table_alias.name`, `subquery_alias.name`) for the right table. [#2742](https://github.com/yandex/ClickHouse/pull/2742) -* HTTPS can be enabled for replication. [#2760](https://github.com/yandex/ClickHouse/pull/2760) +The expression must be a chain of equalities joined by the AND operator. Each side of the equality can be an arbitrary expression over the columns of one of the tables. The use of fully qualified column names is supported (`table.name`, `database.table.name`, `table_alias.name`, `subquery_alias.name`) for the right table. [#2742](https://github.com/ClickHouse/ClickHouse/pull/2742) +* HTTPS can be enabled for replication. [#2760](https://github.com/ClickHouse/ClickHouse/pull/2760) ### Improvements: -* The server passes the patch component of its version to the client. Data about the patch version component is in `system.processes` and `query_log`. [#2646](https://github.com/yandex/ClickHouse/pull/2646) +* The server passes the patch component of its version to the client. 
Data about the patch version component is in `system.processes` and `query_log`. [#2646](https://github.com/ClickHouse/ClickHouse/pull/2646) ## ClickHouse release 18.5.1, 2018-07-31 ### New features: -* Added the hash function `murmurHash2_32` [#2756](https://github.com/yandex/ClickHouse/pull/2756). +* Added the hash function `murmurHash2_32` [#2756](https://github.com/ClickHouse/ClickHouse/pull/2756). ### Improvements: -* Now you can use the `from_env` [#2741](https://github.com/yandex/ClickHouse/pull/2741) attribute to set values in config files from environment variables. -* Added case-insensitive versions of the `coalesce`, `ifNull`, and `nullIf functions` [#2752](https://github.com/yandex/ClickHouse/pull/2752). +* Now you can use the `from_env` [#2741](https://github.com/ClickHouse/ClickHouse/pull/2741) attribute to set values in config files from environment variables. +* Added case-insensitive versions of the `coalesce`, `ifNull`, and `nullIf functions` [#2752](https://github.com/ClickHouse/ClickHouse/pull/2752). ### Bug fixes: -* Fixed a possible bug when starting a replica [#2759](https://github.com/yandex/ClickHouse/pull/2759). +* Fixed a possible bug when starting a replica [#2759](https://github.com/ClickHouse/ClickHouse/pull/2759). ## ClickHouse release 18.4.0, 2018-07-28 ### New features: -* Added system tables: `formats`, `data_type_families`, `aggregate_function_combinators`, `table_functions`, `table_engines`, `collations` [#2721](https://github.com/yandex/ClickHouse/pull/2721). -* Added the ability to use a table function instead of a table as an argument of a `remote` or `cluster table function` [#2708](https://github.com/yandex/ClickHouse/pull/2708). -* Support for `HTTP Basic` authentication in the replication protocol [#2727](https://github.com/yandex/ClickHouse/pull/2727). -* The `has` function now allows searching for a numeric value in an array of `Enum` values [Maxim Khrisanfov](https://github.com/yandex/ClickHouse/pull/2699). -* Support for adding arbitrary message separators when reading from `Kafka` [Amos Bird](https://github.com/yandex/ClickHouse/pull/2701). +* Added system tables: `formats`, `data_type_families`, `aggregate_function_combinators`, `table_functions`, `table_engines`, `collations` [#2721](https://github.com/ClickHouse/ClickHouse/pull/2721). +* Added the ability to use a table function instead of a table as an argument of a `remote` or `cluster table function` [#2708](https://github.com/ClickHouse/ClickHouse/pull/2708). +* Support for `HTTP Basic` authentication in the replication protocol [#2727](https://github.com/ClickHouse/ClickHouse/pull/2727). +* The `has` function now allows searching for a numeric value in an array of `Enum` values [Maxim Khrisanfov](https://github.com/ClickHouse/ClickHouse/pull/2699). +* Support for adding arbitrary message separators when reading from `Kafka` [Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2701). ### Improvements: -* The `ALTER TABLE t DELETE WHERE` query does not rewrite data parts that were not affected by the WHERE condition [#2694](https://github.com/yandex/ClickHouse/pull/2694). +* The `ALTER TABLE t DELETE WHERE` query does not rewrite data parts that were not affected by the WHERE condition [#2694](https://github.com/ClickHouse/ClickHouse/pull/2694). * The `use_minimalistic_checksums_in_zookeeper` option for `ReplicatedMergeTree` tables is enabled by default. This setting was added in version 1.1.54378, 2018-04-16. Versions that are older than 1.1.54378 can no longer be installed. 
-* Support for running `KILL` and `OPTIMIZE` queries that specify `ON CLUSTER` [Winter Zhang](https://github.com/yandex/ClickHouse/pull/2689). +* Support for running `KILL` and `OPTIMIZE` queries that specify `ON CLUSTER` [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2689). ### Bug fixes: -* Fixed the error `Column ... is not under an aggregate function and not in GROUP BY` for aggregation with an IN expression. This bug appeared in version 18.1.0. ([bbdd780b](https://github.com/yandex/ClickHouse/commit/bbdd780be0be06a0f336775941cdd536878dd2c2)) -* Fixed a bug in the `windowFunnel aggregate function` [Winter Zhang](https://github.com/yandex/ClickHouse/pull/2735). -* Fixed a bug in the `anyHeavy` aggregate function ([a2101df2](https://github.com/yandex/ClickHouse/commit/a2101df25a6a0fba99aa71f8793d762af2b801ee)) +* Fixed the error `Column ... is not under an aggregate function and not in GROUP BY` for aggregation with an IN expression. This bug appeared in version 18.1.0. ([bbdd780b](https://github.com/ClickHouse/ClickHouse/commit/bbdd780be0be06a0f336775941cdd536878dd2c2)) +* Fixed a bug in the `windowFunnel aggregate function` [Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2735). +* Fixed a bug in the `anyHeavy` aggregate function ([a2101df2](https://github.com/ClickHouse/ClickHouse/commit/a2101df25a6a0fba99aa71f8793d762af2b801ee)) * Fixed server crash when using the `countArray()` aggregate function. ### Backward incompatible changes: @@ -1962,29 +1962,29 @@ The expression must be a chain of equalities joined by the AND operator. Each si ### New features: -* Support for the `ALTER TABLE t DELETE WHERE` query for non-replicated MergeTree tables ([#2634](https://github.com/yandex/ClickHouse/pull/2634)). -* Support for arbitrary types for the `uniq*` family of aggregate functions ([#2010](https://github.com/yandex/ClickHouse/issues/2010)). -* Support for arbitrary types in comparison operators ([#2026](https://github.com/yandex/ClickHouse/issues/2026)). -* The `users.xml` file allows setting a subnet mask in the format `10.0.0.1/255.255.255.0`. This is necessary for using masks for IPv6 networks with zeros in the middle ([#2637](https://github.com/yandex/ClickHouse/pull/2637)). -* Added the `arrayDistinct` function ([#2670](https://github.com/yandex/ClickHouse/pull/2670)). -* The SummingMergeTree engine can now work with AggregateFunction type columns ([Constantin S. Pan](https://github.com/yandex/ClickHouse/pull/2566)). +* Support for the `ALTER TABLE t DELETE WHERE` query for non-replicated MergeTree tables ([#2634](https://github.com/ClickHouse/ClickHouse/pull/2634)). +* Support for arbitrary types for the `uniq*` family of aggregate functions ([#2010](https://github.com/ClickHouse/ClickHouse/issues/2010)). +* Support for arbitrary types in comparison operators ([#2026](https://github.com/ClickHouse/ClickHouse/issues/2026)). +* The `users.xml` file allows setting a subnet mask in the format `10.0.0.1/255.255.255.0`. This is necessary for using masks for IPv6 networks with zeros in the middle ([#2637](https://github.com/ClickHouse/ClickHouse/pull/2637)). +* Added the `arrayDistinct` function ([#2670](https://github.com/ClickHouse/ClickHouse/pull/2670)). +* The SummingMergeTree engine can now work with AggregateFunction type columns ([Constantin S. Pan](https://github.com/ClickHouse/ClickHouse/pull/2566)). ### Improvements: * Changed the numbering scheme for release versions. 
Now the first part contains the year of release (A.D., Moscow timezone, minus 2000), the second part contains the number for major changes (increases for most releases), and the third part is the patch version. Releases are still backwards compatible, unless otherwise stated in the changelog. -* Faster conversions of floating-point numbers to a string ([Amos Bird](https://github.com/yandex/ClickHouse/pull/2664)). -* If some rows were skipped during an insert due to parsing errors (this is possible with the `input_allow_errors_num` and `input_allow_errors_ratio` settings enabled), the number of skipped rows is now written to the server log ([Leonardo Cecchi](https://github.com/yandex/ClickHouse/pull/2669)). +* Faster conversions of floating-point numbers to a string ([Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2664)). +* If some rows were skipped during an insert due to parsing errors (this is possible with the `input_allow_errors_num` and `input_allow_errors_ratio` settings enabled), the number of skipped rows is now written to the server log ([Leonardo Cecchi](https://github.com/ClickHouse/ClickHouse/pull/2669)). ### Bug fixes: -* Fixed the TRUNCATE command for temporary tables ([Amos Bird](https://github.com/yandex/ClickHouse/pull/2624)). -* Fixed a rare deadlock in the ZooKeeper client library that occurred when there was a network error while reading the response ([c315200](https://github.com/yandex/ClickHouse/commit/c315200e64b87e44bdf740707fc857d1fdf7e947)). -* Fixed an error during a CAST to Nullable types ([#1322](https://github.com/yandex/ClickHouse/issues/1322)). -* Fixed the incorrect result of the `maxIntersection()` function when the boundaries of intervals coincided ([Michael Furmur](https://github.com/yandex/ClickHouse/pull/2657)). -* Fixed incorrect transformation of the OR expression chain in a function argument ([chenxing-xc](https://github.com/yandex/ClickHouse/pull/2663)). -* Fixed performance degradation for queries containing `IN (subquery)` expressions inside another subquery ([#2571](https://github.com/yandex/ClickHouse/issues/2571)). -* Fixed incompatibility between servers with different versions in distributed queries that use a `CAST` function that isn't in uppercase letters ([fe8c4d6](https://github.com/yandex/ClickHouse/commit/fe8c4d64e434cacd4ceef34faa9005129f2190a5)). -* Added missing quoting of identifiers for queries to an external DBMS ([#2635](https://github.com/yandex/ClickHouse/issues/2635)). +* Fixed the TRUNCATE command for temporary tables ([Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2624)). +* Fixed a rare deadlock in the ZooKeeper client library that occurred when there was a network error while reading the response ([c315200](https://github.com/ClickHouse/ClickHouse/commit/c315200e64b87e44bdf740707fc857d1fdf7e947)). +* Fixed an error during a CAST to Nullable types ([#1322](https://github.com/ClickHouse/ClickHouse/issues/1322)). +* Fixed the incorrect result of the `maxIntersection()` function when the boundaries of intervals coincided ([Michael Furmur](https://github.com/ClickHouse/ClickHouse/pull/2657)). +* Fixed incorrect transformation of the OR expression chain in a function argument ([chenxing-xc](https://github.com/ClickHouse/ClickHouse/pull/2663)). +* Fixed performance degradation for queries containing `IN (subquery)` expressions inside another subquery ([#2571](https://github.com/ClickHouse/ClickHouse/issues/2571)). 
+* Fixed incompatibility between servers with different versions in distributed queries that use a `CAST` function that isn't in uppercase letters ([fe8c4d6](https://github.com/ClickHouse/ClickHouse/commit/fe8c4d64e434cacd4ceef34faa9005129f2190a5)). +* Added missing quoting of identifiers for queries to an external DBMS ([#2635](https://github.com/ClickHouse/ClickHouse/issues/2635)). ### Backward incompatible changes: @@ -1994,44 +1994,44 @@ The expression must be a chain of equalities joined by the AND operator. Each si ### New features: -* Added the `histogram` aggregate function ([Mikhail Surin](https://github.com/yandex/ClickHouse/pull/2521)). -* Now `OPTIMIZE TABLE ... FINAL` can be used without specifying partitions for `ReplicatedMergeTree` ([Amos Bird](https://github.com/yandex/ClickHouse/pull/2600)). +* Added the `histogram` aggregate function ([Mikhail Surin](https://github.com/ClickHouse/ClickHouse/pull/2521)). +* Now `OPTIMIZE TABLE ... FINAL` can be used without specifying partitions for `ReplicatedMergeTree` ([Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2600)). ### Bug fixes: * Fixed a problem with a very small timeout for sockets (one second) for reading and writing when sending and downloading replicated data, which made it impossible to download larger parts if there is a load on the network or disk (it resulted in cyclical attempts to download parts). This error occurred in version 1.1.54388. * Fixed issues when using chroot in ZooKeeper if you inserted duplicate data blocks in the table. -* The `has` function now works correctly for an array with Nullable elements ([#2115](https://github.com/yandex/ClickHouse/issues/2115)). +* The `has` function now works correctly for an array with Nullable elements ([#2115](https://github.com/ClickHouse/ClickHouse/issues/2115)). * The `system.tables` table now works correctly when used in distributed queries. The `metadata_modification_time` and `engine_full` columns are now non-virtual. Fixed an error that occurred if only these columns were queried from the table. -* Fixed how an empty `TinyLog` table works after inserting an empty data block ([#2563](https://github.com/yandex/ClickHouse/issues/2563)). +* Fixed how an empty `TinyLog` table works after inserting an empty data block ([#2563](https://github.com/ClickHouse/ClickHouse/issues/2563)). * The `system.zookeeper` table works if the value of the node in ZooKeeper is NULL. ## ClickHouse release 1.1.54390, 2018-07-06 ### New features: -* Queries can be sent in `multipart/form-data` format (in the `query` field), which is useful if external data is also sent for query processing ([Olga Hvostikova](https://github.com/yandex/ClickHouse/pull/2490)). -* Added the ability to enable or disable processing single or double quotes when reading data in CSV format. You can configure this in the `format_csv_allow_single_quotes` and `format_csv_allow_double_quotes` settings ([Amos Bird](https://github.com/yandex/ClickHouse/pull/2574)). -* Now `OPTIMIZE TABLE ... FINAL` can be used without specifying the partition for non-replicated variants of `MergeTree` ([Amos Bird](https://github.com/yandex/ClickHouse/pull/2599)). +* Queries can be sent in `multipart/form-data` format (in the `query` field), which is useful if external data is also sent for query processing ([Olga Hvostikova](https://github.com/ClickHouse/ClickHouse/pull/2490)). +* Added the ability to enable or disable processing single or double quotes when reading data in CSV format. 
You can configure this in the `format_csv_allow_single_quotes` and `format_csv_allow_double_quotes` settings ([Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2574)). +* Now `OPTIMIZE TABLE ... FINAL` can be used without specifying the partition for non-replicated variants of `MergeTree` ([Amos Bird](https://github.com/ClickHouse/ClickHouse/pull/2599)). ### Improvements: -* Improved performance, reduced memory consumption, and correct memory consumption tracking with use of the IN operator when a table index could be used ([#2584](https://github.com/yandex/ClickHouse/pull/2584)). +* Improved performance, reduced memory consumption, and correct memory consumption tracking with use of the IN operator when a table index could be used ([#2584](https://github.com/ClickHouse/ClickHouse/pull/2584)). * Removed redundant checking of checksums when adding a data part. This is important when there are a large number of replicas, because in these cases the total number of checks was equal to N^2. -* Added support for `Array(Tuple(...))` arguments for the `arrayEnumerateUniq` function ([#2573](https://github.com/yandex/ClickHouse/pull/2573)). -* Added `Nullable` support for the `runningDifference` function ([#2594](https://github.com/yandex/ClickHouse/pull/2594)). -* Improved query analysis performance when there is a very large number of expressions ([#2572](https://github.com/yandex/ClickHouse/pull/2572)). -* Faster selection of data parts for merging in `ReplicatedMergeTree` tables. Faster recovery of the ZooKeeper session ([#2597](https://github.com/yandex/ClickHouse/pull/2597)). -* The `format_version.txt` file for `MergeTree` tables is re-created if it is missing, which makes sense if ClickHouse is launched after copying the directory structure without files ([Ciprian Hacman](https://github.com/yandex/ClickHouse/pull/2593)). +* Added support for `Array(Tuple(...))` arguments for the `arrayEnumerateUniq` function ([#2573](https://github.com/ClickHouse/ClickHouse/pull/2573)). +* Added `Nullable` support for the `runningDifference` function ([#2594](https://github.com/ClickHouse/ClickHouse/pull/2594)). +* Improved query analysis performance when there is a very large number of expressions ([#2572](https://github.com/ClickHouse/ClickHouse/pull/2572)). +* Faster selection of data parts for merging in `ReplicatedMergeTree` tables. Faster recovery of the ZooKeeper session ([#2597](https://github.com/ClickHouse/ClickHouse/pull/2597)). +* The `format_version.txt` file for `MergeTree` tables is re-created if it is missing, which makes sense if ClickHouse is launched after copying the directory structure without files ([Ciprian Hacman](https://github.com/ClickHouse/ClickHouse/pull/2593)). ### Bug fixes: * Fixed a bug when working with ZooKeeper that could make it impossible to recover the session and readonly states of tables before restarting the server. * Fixed a bug when working with ZooKeeper that could result in old nodes not being deleted if the session is interrupted. -* Fixed an error in the `quantileTDigest` function for Float arguments (this bug was introduced in version 1.1.54388) ([Mikhail Surin](https://github.com/yandex/ClickHouse/pull/2553)). -* Fixed a bug in the index for MergeTree tables if the primary key column is located inside the function for converting types between signed and unsigned integers of the same size ([#2603](https://github.com/yandex/ClickHouse/pull/2603)). 
-* Fixed segfault if `macros` are used but they aren't in the config file ([#2570](https://github.com/yandex/ClickHouse/pull/2570)). -* Fixed switching to the default database when reconnecting the client ([#2583](https://github.com/yandex/ClickHouse/pull/2583)). +* Fixed an error in the `quantileTDigest` function for Float arguments (this bug was introduced in version 1.1.54388) ([Mikhail Surin](https://github.com/ClickHouse/ClickHouse/pull/2553)). +* Fixed a bug in the index for MergeTree tables if the primary key column is located inside the function for converting types between signed and unsigned integers of the same size ([#2603](https://github.com/ClickHouse/ClickHouse/pull/2603)). +* Fixed segfault if `macros` are used but they aren't in the config file ([#2570](https://github.com/ClickHouse/ClickHouse/pull/2570)). +* Fixed switching to the default database when reconnecting the client ([#2583](https://github.com/ClickHouse/ClickHouse/pull/2583)). * Fixed a bug that occurred when the `use_index_for_in_with_subqueries` setting was disabled. ### Security fix: @@ -2044,50 +2044,50 @@ The expression must be a chain of equalities joined by the AND operator. Each si * Support for the `ALTER TABLE t DELETE WHERE` query for replicated tables. Added the `system.mutations` table to track progress of this type of queries. * Support for the `ALTER TABLE t [REPLACE|ATTACH] PARTITION` query for \*MergeTree tables. -* Support for the `TRUNCATE TABLE` query ([Winter Zhang](https://github.com/yandex/ClickHouse/pull/2260)) +* Support for the `TRUNCATE TABLE` query ([Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2260)) * Several new `SYSTEM` queries for replicated tables (`RESTART REPLICAS`, `SYNC REPLICA`, `[STOP|START] [MERGES|FETCHES|SENDS REPLICATED|REPLICATION QUEUES]`). -* Added the ability to write to a table with the MySQL engine and the corresponding table function ([sundy-li](https://github.com/yandex/ClickHouse/pull/2294)). -* Added the `url()` table function and the `URL` table engine ([Alexander Sapin](https://github.com/yandex/ClickHouse/pull/2501)). -* Added the `windowFunnel` aggregate function ([sundy-li](https://github.com/yandex/ClickHouse/pull/2352)). -* New `startsWith` and `endsWith` functions for strings ([Vadim Plakhtinsky](https://github.com/yandex/ClickHouse/pull/2429)). -* The `numbers()` table function now allows you to specify the offset ([Winter Zhang](https://github.com/yandex/ClickHouse/pull/2535)). +* Added the ability to write to a table with the MySQL engine and the corresponding table function ([sundy-li](https://github.com/ClickHouse/ClickHouse/pull/2294)). +* Added the `url()` table function and the `URL` table engine ([Alexander Sapin](https://github.com/ClickHouse/ClickHouse/pull/2501)). +* Added the `windowFunnel` aggregate function ([sundy-li](https://github.com/ClickHouse/ClickHouse/pull/2352)). +* New `startsWith` and `endsWith` functions for strings ([Vadim Plakhtinsky](https://github.com/ClickHouse/ClickHouse/pull/2429)). +* The `numbers()` table function now allows you to specify the offset ([Winter Zhang](https://github.com/ClickHouse/ClickHouse/pull/2535)). * The password to `clickhouse-client` can be entered interactively. -* Server logs can now be sent to syslog ([Alexander Krasheninnikov](https://github.com/yandex/ClickHouse/pull/2459)). -* Support for logging in dictionaries with a shared library source ([Alexander Sapin](https://github.com/yandex/ClickHouse/pull/2472)). 
-* Support for custom CSV delimiters ([Ivan Zhukov](https://github.com/yandex/ClickHouse/pull/2263)) +* Server logs can now be sent to syslog ([Alexander Krasheninnikov](https://github.com/ClickHouse/ClickHouse/pull/2459)). +* Support for logging in dictionaries with a shared library source ([Alexander Sapin](https://github.com/ClickHouse/ClickHouse/pull/2472)). +* Support for custom CSV delimiters ([Ivan Zhukov](https://github.com/ClickHouse/ClickHouse/pull/2263)) * Added the `date_time_input_format` setting. If you switch this setting to `'best_effort'`, DateTime values will be read in a wide range of formats. * Added the `clickhouse-obfuscator` utility for data obfuscation. Usage example: publishing data used in performance tests. ### Experimental features: -* Added the ability to calculate `and` arguments only where they are needed ([Anastasia Tsarkova](https://github.com/yandex/ClickHouse/pull/2272)) -* JIT compilation to native code is now available for some expressions ([pyos](https://github.com/yandex/ClickHouse/pull/2277)). +* Added the ability to calculate `and` arguments only where they are needed ([Anastasia Tsarkova](https://github.com/ClickHouse/ClickHouse/pull/2272)) +* JIT compilation to native code is now available for some expressions ([pyos](https://github.com/ClickHouse/ClickHouse/pull/2277)). ### Bug fixes: * Duplicates no longer appear for a query with `DISTINCT` and `ORDER BY`. * Queries with `ARRAY JOIN` and `arrayFilter` no longer return an incorrect result. -* Fixed an error when reading an array column from a Nested structure ([#2066](https://github.com/yandex/ClickHouse/issues/2066)). +* Fixed an error when reading an array column from a Nested structure ([#2066](https://github.com/ClickHouse/ClickHouse/issues/2066)). * Fixed an error when analyzing queries with a HAVING clause like `HAVING tuple IN (...)`. * Fixed an error when analyzing queries with recursive aliases. -* Fixed an error when reading from ReplacingMergeTree with a condition in PREWHERE that filters all rows ([#2525](https://github.com/yandex/ClickHouse/issues/2525)). +* Fixed an error when reading from ReplacingMergeTree with a condition in PREWHERE that filters all rows ([#2525](https://github.com/ClickHouse/ClickHouse/issues/2525)). * User profile settings were not applied when using sessions in the HTTP interface. * Fixed how settings are applied from the command line parameters in clickhouse-local. * The ZooKeeper client library now uses the session timeout received from the server. * Fixed a bug in the ZooKeeper client library when the client waited for the server response longer than the timeout. -* Fixed pruning of parts for queries with conditions on partition key columns ([#2342](https://github.com/yandex/ClickHouse/issues/2342)). -* Merges are now possible after `CLEAR COLUMN IN PARTITION` ([#2315](https://github.com/yandex/ClickHouse/issues/2315)). -* Type mapping in the ODBC table function has been fixed ([sundy-li](https://github.com/yandex/ClickHouse/pull/2268)). -* Type comparisons have been fixed for `DateTime` with and without the time zone ([Alexander Bocharov](https://github.com/yandex/ClickHouse/pull/2400)). +* Fixed pruning of parts for queries with conditions on partition key columns ([#2342](https://github.com/ClickHouse/ClickHouse/issues/2342)). +* Merges are now possible after `CLEAR COLUMN IN PARTITION` ([#2315](https://github.com/ClickHouse/ClickHouse/issues/2315)). 
+* Type mapping in the ODBC table function has been fixed ([sundy-li](https://github.com/ClickHouse/ClickHouse/pull/2268)). +* Type comparisons have been fixed for `DateTime` with and without the time zone ([Alexander Bocharov](https://github.com/ClickHouse/ClickHouse/pull/2400)). * Fixed syntactic parsing and formatting of the `CAST` operator. -* Fixed insertion into a materialized view for the Distributed table engine ([Babacar Diassé](https://github.com/yandex/ClickHouse/pull/2411)). -* Fixed a race condition when writing data from the `Kafka` engine to materialized views ([Yangkuan Liu](https://github.com/yandex/ClickHouse/pull/2448)). +* Fixed insertion into a materialized view for the Distributed table engine ([Babacar Diassé](https://github.com/ClickHouse/ClickHouse/pull/2411)). +* Fixed a race condition when writing data from the `Kafka` engine to materialized views ([Yangkuan Liu](https://github.com/ClickHouse/ClickHouse/pull/2448)). * Fixed SSRF in the remote() table function. -* Fixed exit behavior of `clickhouse-client` in multiline mode ([#2510](https://github.com/yandex/ClickHouse/issues/2510)). +* Fixed exit behavior of `clickhouse-client` in multiline mode ([#2510](https://github.com/ClickHouse/ClickHouse/issues/2510)). ### Improvements: -* Background tasks in replicated tables are now performed in a thread pool instead of in separate threads ([Silviu Caragea](https://github.com/yandex/ClickHouse/pull/1722)). +* Background tasks in replicated tables are now performed in a thread pool instead of in separate threads ([Silviu Caragea](https://github.com/ClickHouse/ClickHouse/pull/1722)). * Improved LZ4 compression performance. * Faster analysis for queries with a large number of JOINs and sub-queries. * The DNS cache is now updated automatically when there are too many network errors. @@ -2095,8 +2095,8 @@ The expression must be a chain of equalities joined by the AND operator. Each si * Corrected the discrepancy in the event counters `Query`, `SelectQuery`, and `InsertQuery`. * Expressions like `tuple IN (SELECT tuple)` are allowed if the tuple types match. * A server with replicated tables can start even if you haven't configured ZooKeeper. -* When calculating the number of available CPU cores, limits on cgroups are now taken into account ([Atri Sharma](https://github.com/yandex/ClickHouse/pull/2325)). -* Added chown for config directories in the systemd config file ([Mikhail Shiryaev](https://github.com/yandex/ClickHouse/pull/2421)). +* When calculating the number of available CPU cores, limits on cgroups are now taken into account ([Atri Sharma](https://github.com/ClickHouse/ClickHouse/pull/2325)). +* Added chown for config directories in the systemd config file ([Mikhail Shiryaev](https://github.com/ClickHouse/ClickHouse/pull/2421)). ### Build changes: @@ -2104,10 +2104,10 @@ The expression must be a chain of equalities joined by the AND operator. Each si * Added the ability to build llvm from submodule. * The version of the librdkafka library has been updated to v0.11.4. * Added the ability to use the system libcpuid library. The library version has been updated to 0.4.0. -* Fixed the build using the vectorclass library ([Babacar Diassé](https://github.com/yandex/ClickHouse/pull/2274)). +* Fixed the build using the vectorclass library ([Babacar Diassé](https://github.com/ClickHouse/ClickHouse/pull/2274)). * Cmake now generates files for ninja by default (like when using `-G Ninja`). 
-* Added the ability to use the libtinfo library instead of libtermcap ([Georgy Kondratiev](https://github.com/yandex/ClickHouse/pull/2519)). -* Fixed a header file conflict in Fedora Rawhide ([#2520](https://github.com/yandex/ClickHouse/issues/2520)). +* Added the ability to use the libtinfo library instead of libtermcap ([Georgy Kondratiev](https://github.com/ClickHouse/ClickHouse/pull/2519)). +* Fixed a header file conflict in Fedora Rawhide ([#2520](https://github.com/ClickHouse/ClickHouse/issues/2520)). ### Backward incompatible changes: @@ -2215,7 +2215,7 @@ The expression must be a chain of equalities joined by the AND operator. Each si * Added the `system.macros` table and auto updating of macros when the config file is changed. * Added the `SYSTEM RELOAD CONFIG` query. -* Added the `maxIntersections(left_col, right_col)` aggregate function, which returns the maximum number of simultaneously intersecting intervals `[left; right]`. The `maxIntersectionsPosition(left, right)` function returns the beginning of the "maximum" interval. ([Michael Furmur](https://github.com/yandex/ClickHouse/pull/2012)). +* Added the `maxIntersections(left_col, right_col)` aggregate function, which returns the maximum number of simultaneously intersecting intervals `[left; right]`. The `maxIntersectionsPosition(left, right)` function returns the beginning of the "maximum" interval. ([Michael Furmur](https://github.com/ClickHouse/ClickHouse/pull/2012)). ### Improvements: diff --git a/README.md b/README.md index 45c626604fd..93e4af1a43d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![ClickHouse — open source distributed column-oriented DBMS](https://github.com/yandex/ClickHouse/raw/master/website/images/logo-400x240.png)](https://clickhouse.yandex) +[![ClickHouse — open source distributed column-oriented DBMS](https://github.com/ClickHouse/ClickHouse/raw/master/website/images/logo-400x240.png)](https://clickhouse.yandex) ClickHouse is an open-source column-oriented database management system that allows generating analytical data reports in real time. diff --git a/SECURITY.md b/SECURITY.md index c3682d6c499..62a70166dab 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -14,4 +14,4 @@ currently being supported with security updates: ## Reporting a Vulnerability To report a potential vulnerability in ClickHouse please use the security advisory feature of GitHub: -https://github.com/yandex/ClickHouse/security/advisories +https://github.com/ClickHouse/ClickHouse/security/advisories diff --git a/dbms/src/Common/SensitiveDataMasker.h b/dbms/src/Common/SensitiveDataMasker.h index 942b97181c5..99a5c8c72b4 100644 --- a/dbms/src/Common/SensitiveDataMasker.h +++ b/dbms/src/Common/SensitiveDataMasker.h @@ -55,7 +55,7 @@ public: size_t wipeSensitiveData(std::string & data) const; /// setInstance is not thread-safe and should be called once in single-thread mode. 
- /// https://github.com/yandex/ClickHouse/pull/6810#discussion_r321183367 + /// https://github.com/ClickHouse/ClickHouse/pull/6810#discussion_r321183367 static void setInstance(std::unique_ptr sensitive_data_masker_); static SensitiveDataMasker * getInstance(); diff --git a/dbms/src/Interpreters/PredicateExpressionsOptimizer.h b/dbms/src/Interpreters/PredicateExpressionsOptimizer.h index 302c2e62a65..ca2c8b8766d 100644 --- a/dbms/src/Interpreters/PredicateExpressionsOptimizer.h +++ b/dbms/src/Interpreters/PredicateExpressionsOptimizer.h @@ -20,7 +20,7 @@ class Context; * - Query after optimization : * SELECT id_1, name_1 FROM (SELECT id_1, name_1 FROM table_a WHERE id_1 = 1 UNION ALL SELECT id_2, name_2 FROM table_b WHERE id_2 = 1) * WHERE id_1 = 1 - * For more details : https://github.com/yandex/ClickHouse/pull/2015#issuecomment-374283452 + * For more details : https://github.com/ClickHouse/ClickHouse/pull/2015#issuecomment-374283452 */ class PredicateExpressionsOptimizer { diff --git a/dbms/src/Interpreters/RowRefs.h b/dbms/src/Interpreters/RowRefs.h index ffcaf370b00..03309831322 100644 --- a/dbms/src/Interpreters/RowRefs.h +++ b/dbms/src/Interpreters/RowRefs.h @@ -242,7 +242,7 @@ private: // Lookups can be stored in a HashTable because it is memmovable // A std::variant contains a currently active type id (memmovable), together with a union of the types // The types are all std::unique_ptr, which contains a single pointer, which is memmovable. - // Source: https://github.com/yandex/ClickHouse/issues/4906 + // Source: https://github.com/ClickHouse/ClickHouse/issues/4906 Lookups lookups; }; diff --git a/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp b/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp index 045aa8a6461..abb7e8e88ce 100644 --- a/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp +++ b/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp @@ -45,7 +45,7 @@ void ReplicatedMergeTreePartCheckThread::start() void ReplicatedMergeTreePartCheckThread::stop() { - //based on discussion on https://github.com/yandex/ClickHouse/pull/1489#issuecomment-344756259 + //based on discussion on https://github.com/ClickHouse/ClickHouse/pull/1489#issuecomment-344756259 //using the schedule pool there is no problem in case stop is called two time in row and the start multiple times std::lock_guard lock(start_stop_mutex); diff --git a/dbms/tests/instructions/developer_instruction_ru.md b/dbms/tests/instructions/developer_instruction_ru.md index 411287e4072..7aa110b1b94 100644 --- a/dbms/tests/instructions/developer_instruction_ru.md +++ b/dbms/tests/instructions/developer_instruction_ru.md @@ -12,7 +12,7 @@ Если аккаунта нет - зарегистрируйтесь на https://github.com/. Создайте ssh ключи, если их нет, и загрузите публичные ключи на GitHub. Это потребуется для отправки изменений. Для работы с GitHub можно использовать такие же ssh ключи, как и для работы с другими ssh серверами - скорее всего, они уже у вас есть. -Создайте fork репозитория ClickHouse. Для этого, на странице https://github.com/yandex/ClickHouse нажмите на кнопку "fork" в правом верхнем углу. Вы получите полную копию репозитория ClickHouse на своём аккаунте, которая называется "форк". Процесс разработки состоит в том, чтобы внести нужные изменения в свой форк репозитория, а затем создать "pull request" для принятия изменений в основной репозиторий. +Создайте fork репозитория ClickHouse. 
Для этого, на странице https://github.com/ClickHouse/ClickHouse нажмите на кнопку "fork" в правом верхнем углу. Вы получите полную копию репозитория ClickHouse на своём аккаунте, которая называется "форк". Процесс разработки состоит в том, чтобы внести нужные изменения в свой форк репозитория, а затем создать "pull request" для принятия изменений в основной репозиторий. Для работы с git репозиториями, установите `git`. @@ -61,7 +61,7 @@ and the repository exists. Вы также можете клонировать репозиторий по протоколу https: ``` -git clone https://github.com/yandex/ClickHouse.git +git clone https://github.com/ClickHouse/ClickHouse.git ``` Этот вариант не подходит для отправки изменений на сервер. Вы можете временно его использовать, а затем добавить ssh ключи и заменить адрес репозитория с помощью команды `git remote`. @@ -228,7 +228,7 @@ sudo -u clickhouse ClickHouse/build/dbms/programs/clickhouse server --config-fil Разработка тестов: https://clickhouse.yandex/docs/ru/development/tests/ -Список задач: https://github.com/yandex/ClickHouse/blob/master/dbms/tests/instructions/easy_tasks_sorted_ru.md +Список задач: https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/instructions/easy_tasks_sorted_ru.md # Тестовые данные diff --git a/dbms/tests/instructions/easy_tasks_sorted_ru.md b/dbms/tests/instructions/easy_tasks_sorted_ru.md index 43d86b709c3..6b9c0e18f9d 100644 --- a/dbms/tests/instructions/easy_tasks_sorted_ru.md +++ b/dbms/tests/instructions/easy_tasks_sorted_ru.md @@ -134,7 +134,7 @@ Geohash - способ преобразования географических Энтропию следует считать по гистограмме. Пример расчёта гистограммы смотрите в реализации функции `quantileExact`. -https://github.com/yandex/ClickHouse/issues/3266 +https://github.com/ClickHouse/ClickHouse/issues/3266 ## Функции создания и обновления состояния агрегатной функции по одному кортежу аргументов. @@ -152,7 +152,7 @@ https://github.com/yandex/ClickHouse/issues/3266 ## Корректное сравнение Date и DateTime. -https://github.com/yandex/ClickHouse/issues/2011 +https://github.com/ClickHouse/ClickHouse/issues/2011 Нужно сравнивать Date и DateTime так, как будто Date расширено до DateTime на начало суток в том же часовом поясе. diff --git a/dbms/tests/integration/test_block_structure_mismatch/test.py b/dbms/tests/integration/test_block_structure_mismatch/test.py index ad96a9aa853..fa9272b8401 100644 --- a/dbms/tests/integration/test_block_structure_mismatch/test.py +++ b/dbms/tests/integration/test_block_structure_mismatch/test.py @@ -9,7 +9,7 @@ cluster = ClickHouseCluster(__file__) node1 = cluster.add_instance('node1', main_configs=['configs/remote_servers.xml']) node2 = cluster.add_instance('node2', main_configs=['configs/remote_servers.xml']) -#test reproducing issue https://github.com/yandex/ClickHouse/issues/3162 +#test reproducing issue https://github.com/ClickHouse/ClickHouse/issues/3162 @pytest.fixture(scope="module") def started_cluster(): try: diff --git a/dbms/tests/performance/if_array_string.xml b/dbms/tests/performance/if_array_string.xml index 3654cedd879..c135cf9c8ce 100644 --- a/dbms/tests/performance/if_array_string.xml +++ b/dbms/tests/performance/if_array_string.xml @@ -17,5 +17,5 @@ SELECT count() FROM system.numbers WHERE NOT ignore(rand() % 2 ? ['Hello', 'World'] : materialize(['a', 'b', 'c'])) SELECT count() FROM system.numbers WHERE NOT ignore(rand() % 2 ? materialize(['Hello', 'World']) : materialize(['a', 'b', 'c'])) SELECT count() FROM system.numbers WHERE NOT ignore(rand() % 2 ? 
materialize(['', '']) : emptyArrayString()) - SELECT count() FROM system.numbers WHERE NOT ignore(rand() % 2 ? materialize(['https://github.com/yandex/ClickHouse/pull/1070', 'https://www.google.ru/search?newwindow=1&site=&source=hp&q=zookeeper+wire+protocol+exists&oq=zookeeper+wire+protocol+exists&gs_l=psy-ab.3...330.6300.0.6687.33.28.0.0.0.0.386.4838.0j5j9j5.19.0....0...1.1.64.psy-ab..14.17.4448.0..0j35i39k1j0i131k1j0i22i30k1j0i19k1j33i21k1.r_3uFoNOrSU']) : emptyArrayString()) + SELECT count() FROM system.numbers WHERE NOT ignore(rand() % 2 ? materialize(['https://github.com/ClickHouse/ClickHouse/pull/1070', 'https://www.google.ru/search?newwindow=1&site=&source=hp&q=zookeeper+wire+protocol+exists&oq=zookeeper+wire+protocol+exists&gs_l=psy-ab.3...330.6300.0.6687.33.28.0.0.0.0.386.4838.0j5j9j5.19.0....0...1.1.64.psy-ab..14.17.4448.0..0j35i39k1j0i131k1j0i22i30k1j0i19k1j33i21k1.r_3uFoNOrSU']) : emptyArrayString()) diff --git a/dbms/tests/queries/0_stateless/00504_insert_miss_columns.sh b/dbms/tests/queries/0_stateless/00504_insert_miss_columns.sh index 49ecb5ef315..35290bfbb45 100755 --- a/dbms/tests/queries/0_stateless/00504_insert_miss_columns.sh +++ b/dbms/tests/queries/0_stateless/00504_insert_miss_columns.sh @@ -3,7 +3,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . $CURDIR/../shell_config.sh -# https://github.com/yandex/ClickHouse/issues/1300 +# https://github.com/ClickHouse/ClickHouse/issues/1300 $CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS advertiser"; $CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS advertiser_test"; diff --git a/dbms/tests/queries/0_stateless/00506_union_distributed.sql b/dbms/tests/queries/0_stateless/00506_union_distributed.sql index 7fd921a6a65..0bd4dd43ac9 100644 --- a/dbms/tests/queries/0_stateless/00506_union_distributed.sql +++ b/dbms/tests/queries/0_stateless/00506_union_distributed.sql @@ -1,5 +1,5 @@ --- https://github.com/yandex/ClickHouse/issues/1059 +-- https://github.com/ClickHouse/ClickHouse/issues/1059 DROP TABLE IF EXISTS union1; DROP TABLE IF EXISTS union2; diff --git a/docker/client/README.md b/docker/client/README.md index 385858d7f7b..b8cb51adb1e 100644 --- a/docker/client/README.md +++ b/docker/client/README.md @@ -4,4 +4,4 @@ For more information see [ClickHouse Server Docker Image](https://hub.docker.com ## License -View [license information](https://github.com/yandex/ClickHouse/blob/master/LICENSE) for the software contained in this image. +View [license information](https://github.com/ClickHouse/ClickHouse/blob/master/LICENSE) for the software contained in this image. diff --git a/docker/server/README.md b/docker/server/README.md index 4ed73a0fb9b..b1225e8a27a 100644 --- a/docker/server/README.md +++ b/docker/server/README.md @@ -59,4 +59,4 @@ EOSQL ## License -View [license information](https://github.com/yandex/ClickHouse/blob/master/LICENSE) for the software contained in this image. +View [license information](https://github.com/ClickHouse/ClickHouse/blob/master/LICENSE) for the software contained in this image. diff --git a/docker/test/README.md b/docker/test/README.md index 0833aacb822..563cfd837e9 100644 --- a/docker/test/README.md +++ b/docker/test/README.md @@ -2,4 +2,4 @@ ## License -View [license information](https://github.com/yandex/ClickHouse/blob/master/LICENSE) for the software contained in this image. +View [license information](https://github.com/ClickHouse/ClickHouse/blob/master/LICENSE) for the software contained in this image. 
diff --git a/docs/en/getting_started/example_datasets/metrica.md b/docs/en/getting_started/example_datasets/metrica.md index 44001b383d7..88d4c86430f 100644 --- a/docs/en/getting_started/example_datasets/metrica.md +++ b/docs/en/getting_started/example_datasets/metrica.md @@ -48,4 +48,4 @@ $ clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" ``` ## Queries -Examples of queries to these tables (they are named `test.hits` and `test.visits`) can be found among [stateful tests](https://github.com/yandex/ClickHouse/tree/master/dbms/tests/queries/1_stateful) and in some [performance tests](https://github.com/yandex/ClickHouse/tree/master/dbms/tests/performance/test_hits) of ClickHouse. +Examples of queries to these tables (they are named `test.hits` and `test.visits`) can be found among [stateful tests](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/tests/queries/1_stateful) and in some [performance tests](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/tests/performance/test_hits) of ClickHouse. diff --git a/docs/en/interfaces/tcp.md b/docs/en/interfaces/tcp.md index c17e8c15b5e..0c3e0e7885f 100644 --- a/docs/en/interfaces/tcp.md +++ b/docs/en/interfaces/tcp.md @@ -1,5 +1,5 @@ # Native Interface (TCP) -The native protocol is used in the [command-line client](cli.md), for interserver communication during distributed query processing, and also in other C++ programs. Unfortunately, native ClickHouse protocol does not have formal specification yet, but it can be reverse engineered from ClickHouse source code (starting [around here](https://github.com/yandex/ClickHouse/tree/master/dbms/src/Client)) and/or by intercepting and analyzing TCP traffic. +The native protocol is used in the [command-line client](cli.md), for interserver communication during distributed query processing, and also in other C++ programs. Unfortunately, native ClickHouse protocol does not have formal specification yet, but it can be reverse engineered from ClickHouse source code (starting [around here](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/src/Client)) and/or by intercepting and analyzing TCP traffic. [Original article](https://clickhouse.yandex/docs/en/interfaces/tcp/) diff --git a/docs/en/operations/backup.md b/docs/en/operations/backup.md index f6cb381e602..fcbfb7fb40c 100644 --- a/docs/en/operations/backup.md +++ b/docs/en/operations/backup.md @@ -1,6 +1,6 @@ # Data Backup -While [replication](table_engines/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [you can't just drop tables with a MergeTree-like engine containing more than 50 Gb of data](https://github.com/yandex/ClickHouse/blob/v18.14.18-stable/dbms/programs/server/config.xml#L322-L330). However, these safeguards don't cover all possible cases and can be circumvented. +While [replication](table_engines/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. 
ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [you can't just drop tables with a MergeTree-like engine containing more than 50 Gb of data](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/dbms/programs/server/config.xml#L322-L330). However, these safeguards don't cover all possible cases and can be circumvented. In order to effectively mitigate possible human errors, you should carefully prepare a strategy for backing up and restoring your data **in advance**. diff --git a/docs/en/operations/settings/query_complexity.md b/docs/en/operations/settings/query_complexity.md index c00f2132ebd..1e71fb5e333 100644 --- a/docs/en/operations/settings/query_complexity.md +++ b/docs/en/operations/settings/query_complexity.md @@ -37,7 +37,7 @@ Memory consumption is also restricted by the parameters `max_memory_usage_for_us The maximum amount of RAM to use for running a user's queries on a single server. -Default values are defined in [Settings.h](https://github.com/yandex/ClickHouse/blob/master/dbms/src/Core/Settings.h#L288). By default, the amount is not restricted (`max_memory_usage_for_user = 0`). +Default values are defined in [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Core/Settings.h#L288). By default, the amount is not restricted (`max_memory_usage_for_user = 0`). See also the description of [max_memory_usage](#settings_max_memory_usage). @@ -45,7 +45,7 @@ See also the description of [max_memory_usage](#settings_max_memory_usage). The maximum amount of RAM to use for running all queries on a single server. -Default values are defined in [Settings.h](https://github.com/yandex/ClickHouse/blob/master/dbms/src/Core/Settings.h#L289). By default, the amount is not restricted (`max_memory_usage_for_all_queries = 0`). +Default values are defined in [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Core/Settings.h#L289). By default, the amount is not restricted (`max_memory_usage_for_all_queries = 0`). See also the description of [max_memory_usage](#settings_max_memory_usage). diff --git a/docs/en/operations/table_engines/mergetree.md b/docs/en/operations/table_engines/mergetree.md index 2c7dbbd4b23..31006df639d 100644 --- a/docs/en/operations/table_engines/mergetree.md +++ b/docs/en/operations/table_engines/mergetree.md @@ -78,7 +78,7 @@ For a description of parameters, see the [CREATE query description](../../query_ For more details, see [TTL for columns and tables](#table_engine-mergetree-ttl) - `SETTINGS` — Additional parameters that control the behavior of the `MergeTree`: - - `index_granularity` — The granularity of an index. The number of data rows between the "marks" of an index. By default, 8192. For the list of available parameters, see [MergeTreeSettings.h](https://github.com/yandex/ClickHouse/blob/master/dbms/src/Storages/MergeTree/MergeTreeSettings.h). + - `index_granularity` — The granularity of an index. The number of data rows between the "marks" of an index. By default, 8192. For the list of available parameters, see [MergeTreeSettings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Storages/MergeTree/MergeTreeSettings.h). - `use_minimalistic_part_header_in_zookeeper` — Storage method of the data parts headers in ZooKeeper. If `use_minimalistic_part_header_in_zookeeper=1`, then ZooKeeper stores less data. 
For more information, see the [setting description](../server_settings/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) in "Server configuration parameters". - `min_merge_bytes_to_use_direct_io` — The minimum data volume for merge operation that is required for using direct I/O access to the storage disk. When merging data parts, ClickHouse calculates the total storage volume of all the data to be merged. If the volume exceeds `min_merge_bytes_to_use_direct_io` bytes, ClickHouse reads and writes the data to the storage disk using the direct I/O interface (`O_DIRECT` option). If `min_merge_bytes_to_use_direct_io = 0`, then direct I/O is disabled. Default value: `10 * 1024 * 1024 * 1024` bytes. diff --git a/docs/en/query_language/alter.md b/docs/en/query_language/alter.md index 53461d5edcd..5c1d6331add 100644 --- a/docs/en/query_language/alter.md +++ b/docs/en/query_language/alter.md @@ -374,7 +374,7 @@ All the rules above are also true for the [OPTIMIZE](misc.md#misc_operations-opt OPTIMIZE TABLE table_not_partitioned PARTITION tuple() FINAL; ``` -The examples of `ALTER ... PARTITION` queries are demonstrated in the tests [`00502_custom_partitioning_local`](https://github.com/yandex/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00502_custom_partitioning_local.sql) and [`00502_custom_partitioning_replicated_zookeeper`](https://github.com/yandex/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00502_custom_partitioning_replicated_zookeeper.sql). +The examples of `ALTER ... PARTITION` queries are demonstrated in the tests [`00502_custom_partitioning_local`](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00502_custom_partitioning_local.sql) and [`00502_custom_partitioning_replicated_zookeeper`](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00502_custom_partitioning_replicated_zookeeper.sql). ### Synchronicity of ALTER Queries diff --git a/docs/en/query_language/create.md b/docs/en/query_language/create.md index 773eee88d55..2a8c546233c 100644 --- a/docs/en/query_language/create.md +++ b/docs/en/query_language/create.md @@ -149,7 +149,7 @@ ENGINE = If a codec is specified, the default codec doesn't apply. Codecs can be combined in a pipeline, for example, `CODEC(Delta, ZSTD)`. To select the best codecs combination for you project, pass benchmarks, similar to described in the Altinity [New Encodings to Improve ClickHouse Efficiency](https://www.altinity.com/blog/2019/7/new-encodings-to-improve-clickhouse) article. !!!warning - You cannot decompress ClickHouse database files with external utilities, for example, `lz4`. Use the special utility, [clickhouse-compressor](https://github.com/yandex/ClickHouse/tree/master/dbms/programs/compressor). + You cannot decompress ClickHouse database files with external utilities, for example, `lz4`. Use the special utility, [clickhouse-compressor](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/programs/compressor). Compression is supported for the table engines: diff --git a/docs/en/query_language/operators.md b/docs/en/query_language/operators.md index 534336ca0a9..0a7a81550a2 100644 --- a/docs/en/query_language/operators.md +++ b/docs/en/query_language/operators.md @@ -126,7 +126,7 @@ FROM test.Orders; └───────────┴────────────┴──────────┴───────────┴─────────────┴─────────────┘ ``` -You can see more examples in [tests](https://github.com/yandex/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00619_extract.sql). 
+You can see more examples in [tests](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00619_extract.sql). ## Logical Negation Operator diff --git a/docs/fa/interfaces/tcp.md b/docs/fa/interfaces/tcp.md index bd902aedc58..d6fad9bd5c7 100644 --- a/docs/fa/interfaces/tcp.md +++ b/docs/fa/interfaces/tcp.md @@ -2,7 +2,7 @@ # رابط بومی (TCP) -پروتکل بومی در [خط فرمان خط] (cli.md)، برای برقراری ارتباط بین سرور در طی پردازش پرس و جو توزیع شده، و همچنین در سایر برنامه های C ++ استفاده می شود. متاسفانه، پروتکل ClickHouse بومی هنوز مشخصات رسمی ندارد، اما می توان آن را از کد منبع ClickHouse (شروع [از اینجا](https://github.com/yandex/ClickHouse/tree/master/dbms/src/Client)) و / یا با رهگیری و تجزیه و تحلیل ترافیک TCP. +پروتکل بومی در [خط فرمان خط] (cli.md)، برای برقراری ارتباط بین سرور در طی پردازش پرس و جو توزیع شده، و همچنین در سایر برنامه های C ++ استفاده می شود. متاسفانه، پروتکل ClickHouse بومی هنوز مشخصات رسمی ندارد، اما می توان آن را از کد منبع ClickHouse (شروع [از اینجا](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/src/Client)) و / یا با رهگیری و تجزیه و تحلیل ترافیک TCP. [مقاله اصلی](https://clickhouse.yandex/docs/fa/interfaces/tcp/) diff --git a/docs/ru/getting_started/example_datasets/metrica.md b/docs/ru/getting_started/example_datasets/metrica.md index aade4d0f38d..096c48e9d30 100644 --- a/docs/ru/getting_started/example_datasets/metrica.md +++ b/docs/ru/getting_started/example_datasets/metrica.md @@ -48,4 +48,4 @@ $ clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" ``` ## Запросы -Примеры запросов к этим таблицам (они называются `test.hits` и `test.visits`) можно найти среди [stateful тестов](https://github.com/yandex/ClickHouse/tree/master/dbms/tests/queries/1_stateful) и в некоторых [performance тестах](https://github.com/yandex/ClickHouse/tree/master/dbms/tests/performance/test_hits) ClickHouse. +Примеры запросов к этим таблицам (они называются `test.hits` и `test.visits`) можно найти среди [stateful тестов](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/tests/queries/1_stateful) и в некоторых [performance тестах](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/tests/performance/test_hits) ClickHouse. diff --git a/docs/ru/interfaces/tcp.md b/docs/ru/interfaces/tcp.md index da0ea735644..449c15b1bcc 100644 --- a/docs/ru/interfaces/tcp.md +++ b/docs/ru/interfaces/tcp.md @@ -1,5 +1,5 @@ # Родной интерфейс (TCP) -Нативный протокол используется в [клиенте командной строки](cli.md), для взаимодействия между серверами во время обработки распределенных запросов, а также в других программах на C++. К сожалению, у родного протокола ClickHouse пока нет формальной спецификации, но в нем можно разобраться с использованием исходного кода ClickHouse (начиная с [примерно этого места](https://github.com/yandex/ClickHouse/tree/master/dbms/src/Client)) и/или путем перехвата и анализа TCP трафика. +Нативный протокол используется в [клиенте командной строки](cli.md), для взаимодействия между серверами во время обработки распределенных запросов, а также в других программах на C++. К сожалению, у родного протокола ClickHouse пока нет формальной спецификации, но в нем можно разобраться с использованием исходного кода ClickHouse (начиная с [примерно этого места](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/src/Client)) и/или путем перехвата и анализа TCP трафика. 
[Оригинальная статья](https://clickhouse.yandex/docs/ru/interfaces/tcp/) diff --git a/docs/ru/operations/backup.md b/docs/ru/operations/backup.md index e2162182673..1f7402fd25e 100644 --- a/docs/ru/operations/backup.md +++ b/docs/ru/operations/backup.md @@ -1,6 +1,6 @@ # Резервное копирование данных -[Репликация](table_engines/replication.md) обеспечивает защиту от аппаратных сбоев, но не защищает от человеческих ошибок: случайного удаления данных, удаления не той таблицы, которую надо было, или таблицы на не том кластере, а также программных ошибок, которые приводят к неправильной обработке данных или их повреждению. Во многих случаях подобные ошибки влияют на все реплики. ClickHouse имеет встроенные средства защиты для предотвращения некоторых типов ошибок — например, по умолчанию [не получится удалить таблицы *MergeTree, содержащие более 50 Гб данных, одной командой](https://github.com/yandex/ClickHouse/blob/v18.14.18-stable/dbms/programs/server/config.xml#L322-L330). Однако эти средства защиты не охватывают все возможные случаи и могут быть обойдены. +[Репликация](table_engines/replication.md) обеспечивает защиту от аппаратных сбоев, но не защищает от человеческих ошибок: случайного удаления данных, удаления не той таблицы, которую надо было, или таблицы на не том кластере, а также программных ошибок, которые приводят к неправильной обработке данных или их повреждению. Во многих случаях подобные ошибки влияют на все реплики. ClickHouse имеет встроенные средства защиты для предотвращения некоторых типов ошибок — например, по умолчанию [не получится удалить таблицы *MergeTree, содержащие более 50 Гб данных, одной командой](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/dbms/programs/server/config.xml#L322-L330). Однако эти средства защиты не охватывают все возможные случаи и могут быть обойдены. Для того чтобы эффективно уменьшить возможные человеческие ошибки, следует тщательно подготовить стратегию резервного копирования и восстановления данных **заранее**. diff --git a/docs/ru/operations/settings/query_complexity.md b/docs/ru/operations/settings/query_complexity.md index 4ebd9868ecc..864764add48 100644 --- a/docs/ru/operations/settings/query_complexity.md +++ b/docs/ru/operations/settings/query_complexity.md @@ -38,7 +38,7 @@ Максимальный возможный объем оперативной памяти для запросов пользователя на одном сервере. -Значения по умолчанию определены в файле [Settings.h](https://github.com/yandex/ClickHouse/blob/master/dbms/src/Core/Settings.h#L288). По умолчанию размер не ограничен (`max_memory_usage_for_user = 0`). +Значения по умолчанию определены в файле [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Core/Settings.h#L288). По умолчанию размер не ограничен (`max_memory_usage_for_user = 0`). Смотрите также описание настройки [max_memory_usage](#settings_max_memory_usage). @@ -46,7 +46,7 @@ Максимальный возможный объем оперативной памяти для всех запросов на одном сервере. -Значения по умолчанию определены в файле [Settings.h](https://github.com/yandex/ClickHouse/blob/master/dbms/src/Core/Settings.h#L289). По умолчанию размер не ограничен (`max_memory_usage_for_all_queries = 0`). +Значения по умолчанию определены в файле [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Core/Settings.h#L289). По умолчанию размер не ограничен (`max_memory_usage_for_all_queries = 0`). Смотрите также описание настройки [max_memory_usage](#settings_max_memory_usage). 
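The `query_complexity.md` hunks above (in both the English and Russian docs) describe the profile-level limits `max_memory_usage_for_user` and `max_memory_usage_for_all_queries`. The related per-query limit, `max_memory_usage`, can also be set from a client session. A minimal sketch, assuming a server with default settings; the 10 GB figure is illustrative only:

```
SET max_memory_usage = 10000000000;  -- cap a single query at roughly 10 GB (illustrative value)

-- A deliberately memory-hungry query; if it exceeds the cap, it is aborted with a
-- memory-limit error instead of exhausting the server.
SELECT count(DISTINCT number) FROM numbers(1000000000);
```

The per-user and per-server limits referenced in the hunks above are configured in the server's settings profiles rather than per session.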
diff --git a/docs/ru/operations/table_engines/mergetree.md b/docs/ru/operations/table_engines/mergetree.md index 3215b1dbd08..54debe40089 100644 --- a/docs/ru/operations/table_engines/mergetree.md +++ b/docs/ru/operations/table_engines/mergetree.md @@ -75,7 +75,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] - `SETTINGS` — дополнительные параметры, регулирующие поведение `MergeTree`: - - `index_granularity` — гранулярность индекса. Число строк данных между «засечками» индекса. По умолчанию — 8192. Список всех доступных параметров можно посмотреть в [MergeTreeSettings.h](https://github.com/yandex/ClickHouse/blob/master/dbms/src/Storages/MergeTree/MergeTreeSettings.h). + - `index_granularity` — гранулярность индекса. Число строк данных между «засечками» индекса. По умолчанию — 8192. Список всех доступных параметров можно посмотреть в [MergeTreeSettings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Storages/MergeTree/MergeTreeSettings.h). - `min_merge_bytes_to_use_direct_io` — минимальный объем данных, необходимый для прямого (небуферизованного) чтения/записи (direct I/O) на диск. При слиянии частей данных ClickHouse вычисляет общий объем хранения всех данных, подлежащих слиянию. Если общий объем хранения всех данных для чтения превышает `min_bytes_to_use_direct_io` байт, тогда ClickHouse использует флаг `O_DIRECT` при чтении данных с диска. Если `min_merge_bytes_to_use_direct_io = 0`, тогда прямой ввод-вывод отключен. Значение по умолчанию: `10 * 1024 * 1024 * 1024` байт. - `merge_with_ttl_timeout` - Минимальное время в секундах для повторного выполнения слияний с TTL. По умолчанию - 86400 (1 день). diff --git a/docs/ru/query_language/alter.md b/docs/ru/query_language/alter.md index 01d8bbc3179..685ad914722 100644 --- a/docs/ru/query_language/alter.md +++ b/docs/ru/query_language/alter.md @@ -399,7 +399,7 @@ ALTER TABLE hits MOVE PARTITION '2019-09-01' TO DISK 'fast_ssd' OPTIMIZE TABLE table_not_partitioned PARTITION tuple() FINAL; ``` -Примеры запросов `ALTER ... PARTITION` можно посмотреть в тестах: [`00502_custom_partitioning_local`](https://github.com/yandex/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00502_custom_partitioning_local.sql) и [`00502_custom_partitioning_replicated_zookeeper`](https://github.com/yandex/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00502_custom_partitioning_replicated_zookeeper.sql). +Примеры запросов `ALTER ... PARTITION` можно посмотреть в тестах: [`00502_custom_partitioning_local`](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00502_custom_partitioning_local.sql) и [`00502_custom_partitioning_replicated_zookeeper`](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00502_custom_partitioning_replicated_zookeeper.sql). ### Синхронность запросов ALTER diff --git a/docs/ru/query_language/create.md b/docs/ru/query_language/create.md index 95d794b691e..c10cfff7685 100644 --- a/docs/ru/query_language/create.md +++ b/docs/ru/query_language/create.md @@ -144,7 +144,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Высокие уровни сжатия полезны для асимметричных сценариев, например, для таких, в которых требуется однократное сжатие и многократная распаковка. Более высокие уровни обеспечивают лучшее сжатие, но более высокое потребление вычислительных ресурсов. !!! warning "Предупреждение" - Базу данных ClickHouse не получится распаковать с помощью внешних утилит типа `lz4`. 
Используйте специальную программу [clickhouse-compressor](https://github.com/yandex/ClickHouse/tree/master/dbms/programs/compressor). + Базу данных ClickHouse не получится распаковать с помощью внешних утилит типа `lz4`. Используйте специальную программу [clickhouse-compressor](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/programs/compressor). Пример использования: diff --git a/docs/ru/query_language/operators.md b/docs/ru/query_language/operators.md index c39409e356b..74aa2270e90 100644 --- a/docs/ru/query_language/operators.md +++ b/docs/ru/query_language/operators.md @@ -126,7 +126,7 @@ FROM test.Orders; └───────────┴────────────┴──────────┴───────────┴─────────────┴─────────────┘ ``` -Больше примеров приведено в [тестах](https://github.com/yandex/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00619_extract.sql). +Больше примеров приведено в [тестах](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/tests/queries/0_stateless/00619_extract.sql). ## Оператор логического отрицания diff --git a/docs/tools/github.py b/docs/tools/github.py index 13aa4984fdf..e07d8a0683a 100644 --- a/docs/tools/github.py +++ b/docs/tools/github.py @@ -12,7 +12,7 @@ import util def choose_latest_releases(): seen = collections.OrderedDict() - candidates = requests.get('https://api.github.com/repos/yandex/ClickHouse/tags?per_page=100').json() + candidates = requests.get('https://api.github.com/repos/ClickHouse/ClickHouse/tags?per_page=100').json() for tag in candidates: name = tag.get('name', '') if 'v18' in name or 'stable' not in name: diff --git a/docs/zh/interfaces/tcp.md b/docs/zh/interfaces/tcp.md index 7eab7dea1b1..d07f5d14f1d 100644 --- a/docs/zh/interfaces/tcp.md +++ b/docs/zh/interfaces/tcp.md @@ -1,5 +1,5 @@ # 原生客户端接口(TCP) -本机协议用于 [命令行客户端](cli.md),用于分布式查询处理期间的服务器间通信,以及其他C ++程序。 不幸的是,本机ClickHouse协议还没有正式的规范,但它可以从ClickHouse源代码进行逆向工程 [从这里开始](https://github.com/yandex/ClickHouse/tree/master/dbms/src/Client))和/或拦截和分析TCP流量。 +本机协议用于 [命令行客户端](cli.md),用于分布式查询处理期间的服务器间通信,以及其他C ++程序。 不幸的是,本机ClickHouse协议还没有正式的规范,但它可以从ClickHouse源代码进行逆向工程 [从这里开始](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/src/Client))和/或拦截和分析TCP流量。 [来源文章](https://clickhouse.yandex/docs/zh/interfaces/tcp/) diff --git a/docs/zh/operations/settings/query_complexity.md b/docs/zh/operations/settings/query_complexity.md index 0250a37685e..1e3c94bc2e1 100644 --- a/docs/zh/operations/settings/query_complexity.md +++ b/docs/zh/operations/settings/query_complexity.md @@ -45,7 +45,7 @@ Memory consumption is also restricted by the parameters `max_memory_usage_for_us The maximum amount of RAM to use for running a user's queries on a single server. -Default values are defined in [Settings.h](https://github.com/yandex/ClickHouse/blob/master/dbms/src/Interpreters/Settings.h#L244). By default, the amount is not restricted (`max_memory_usage_for_user = 0`). +Default values are defined in [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Interpreters/Settings.h#L244). By default, the amount is not restricted (`max_memory_usage_for_user = 0`). See also the description of [max_memory_usage](#settings_max_memory_usage). @@ -53,7 +53,7 @@ See also the description of [max_memory_usage](#settings_max_memory_usage). The maximum amount of RAM to use for running all queries on a single server. -Default values are defined in [Settings.h](https://github.com/yandex/ClickHouse/blob/master/dbms/src/Interpreters/Settings.h#L245). By default, the amount is not restricted (`max_memory_usage_for_all_queries = 0`). 
+Default values are defined in [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Interpreters/Settings.h#L245). By default, the amount is not restricted (`max_memory_usage_for_all_queries = 0`). See also the description of [max_memory_usage](#settings_max_memory_usage). diff --git a/docs/zh/operations/table_engines/mergetree.md b/docs/zh/operations/table_engines/mergetree.md index 5ddf837708a..2afb50af155 100644 --- a/docs/zh/operations/table_engines/mergetree.md +++ b/docs/zh/operations/table_engines/mergetree.md @@ -69,7 +69,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] `SAMPLE BY intHash32(UserID) ORDER BY (CounterID, EventDate, intHash32(UserID))` 。 - `SETTINGS` — 影响 `MergeTree` 性能的额外参数: - - `index_granularity` — 索引粒度。即索引中相邻『标记』间的数据行数。默认值,8192 。该列表中所有可用的参数可以从这里查看 [MergeTreeSettings.h](https://github.com/yandex/ClickHouse/blob/master/dbms/src/Storages/MergeTree/MergeTreeSettings.h) 。 + - `index_granularity` — 索引粒度。即索引中相邻『标记』间的数据行数。默认值,8192 。该列表中所有可用的参数可以从这里查看 [MergeTreeSettings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Storages/MergeTree/MergeTreeSettings.h) 。 - `use_minimalistic_part_header_in_zookeeper` — 数据片段头在 ZooKeeper 中的存储方式。如果设置了 `use_minimalistic_part_header_in_zookeeper=1` ,ZooKeeper 会存储更少的数据。更多信息参考『服务配置参数』这章中的 [设置描述](../server_settings/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) 。 - `min_merge_bytes_to_use_direct_io` — 使用直接 I/O 来操作磁盘的合并操作时要求的最小数据量。合并数据片段时,ClickHouse 会计算要被合并的所有数据的总存储空间。如果大小超过了 `min_merge_bytes_to_use_direct_io` 设置的字节数,则 ClickHouse 将使用直接 I/O 接口(`O_DIRECT` 选项)对磁盘读写。如果设置 `min_merge_bytes_to_use_direct_io = 0` ,则会禁用直接 I/O。默认值:`10 * 1024 * 1024 * 1024` 字节。 diff --git a/docs/zh/query_language/create.md b/docs/zh/query_language/create.md index d3a6c2e841b..62630673540 100644 --- a/docs/zh/query_language/create.md +++ b/docs/zh/query_language/create.md @@ -124,7 +124,7 @@ ENGINE = If a codec is specified, the default codec doesn't apply. Codecs can be combined in a pipeline, for example, `CODEC(Delta, ZSTD)`. To select the best codecs combination for you project, pass benchmarks, similar to described in the Altinity [New Encodings to Improve ClickHouse Efficiency](https://www.altinity.com/blog/2019/7/new-encodings-to-improve-clickhouse) article. !!!warning - You cannot decompress ClickHouse database files with external utilities, for example, `lz4`. Use the special utility, [clickhouse-compressor](https://github.com/yandex/ClickHouse/tree/master/dbms/programs/compressor). + You cannot decompress ClickHouse database files with external utilities, for example, `lz4`. Use the special utility, [clickhouse-compressor](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/programs/compressor). Compression is supported for the table engines: diff --git a/utils/ci/default-config b/utils/ci/default-config index 26e82ddcceb..5427ee7280d 100644 --- a/utils/ci/default-config +++ b/utils/ci/default-config @@ -13,7 +13,7 @@ PROJECT_ROOT=$(cd $SCRIPTPATH/.. 
&& pwd) # get-sources SOURCES_METHOD=local # clone, local, tarball -SOURCES_CLONE_URL="https://github.com/yandex/ClickHouse.git" +SOURCES_CLONE_URL="https://github.com/ClickHouse/ClickHouse.git" SOURCES_BRANCH="master" SOURCES_COMMIT=HEAD # do checkout of this commit after clone diff --git a/utils/report/clickhouse-report b/utils/report/clickhouse-report index 1a461200575..b0dbabbda51 100755 --- a/utils/report/clickhouse-report +++ b/utils/report/clickhouse-report @@ -1,11 +1,11 @@ #!/bin/sh -x # Usages: # sh -x clickhouse-report > ch.`hostname`.`date '+%Y%M%''d%H%M%''S'`.dmp 2>&1 -# curl https://raw.githubusercontent.com/yandex/ClickHouse/master/utils/report/clickhouse-report | sh -x > ch.`hostname`.`date '+%Y%M%''d%H%M%''S'`.dmp 2>&1 +# curl https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/utils/report/clickhouse-report | sh -x > ch.`hostname`.`date '+%Y%M%''d%H%M%''S'`.dmp 2>&1 # Also dump some system info (can contain some private data) and get trace from running clickhouse-server process # sh -x clickhouse-report system gdb > ch.`hostname`.`date '+%Y%M%''d%H%M%''S'`.dmp 2>&1 -# curl https://raw.githubusercontent.com/yandex/ClickHouse/master/utils/report/clickhouse-report | sh -s -x system gdb > ch.`hostname`.`date '+%Y%M%''d%H%M%''S'`.dmp 2>&1 +# curl https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/utils/report/clickhouse-report | sh -s -x system gdb > ch.`hostname`.`date '+%Y%M%''d%H%M%''S'`.dmp 2>&1 for i in "$@" ; do diff --git a/website/README.md b/website/README.md index 26bb1dceab5..5652cbcc6a1 100644 --- a/website/README.md +++ b/website/README.md @@ -1,2 +1,2 @@ -ClickHouse website is built alongside it's documentation via [docs/tools](https://github.com/yandex/ClickHouse/tree/master/docs/tools), see [README.md there](https://github.com/yandex/ClickHouse/tree/master/docs/tools/README.md). +ClickHouse website is built alongside it's documentation via [docs/tools](https://github.com/ClickHouse/ClickHouse/tree/master/docs/tools), see [README.md there](https://github.com/ClickHouse/ClickHouse/tree/master/docs/tools/README.md). diff --git a/website/deprecated/reference_en.html b/website/deprecated/reference_en.html index 2e6fe0ac30b..cebcf819101 100644 --- a/website/deprecated/reference_en.html +++ b/website/deprecated/reference_en.html @@ -453,7 +453,7 @@ By default, access is allowed from everywhere for the default user without a pas ===Installing from source=== -To build, follow the instructions in build.md (for Linux) or in build_osx.md (for Mac OS X). +To build, follow the instructions in build.md (for Linux) or in build_osx.md (for Mac OS X). You can compile packages and install them. You can also use programs without installing packages. @@ -550,7 +550,7 @@ Congratulations, it works! If you are Yandex employee, you can use Yandex.Metrica test data to explore the system's capabilities. You can find instructions for using the test data here. -Otherwise, you could use one of available public datasets, described here. +Otherwise, you could use one of available public datasets, described here. ==If you have questions== diff --git a/website/deprecated/reference_ru.html b/website/deprecated/reference_ru.html index 89c91d7d1c1..12bc967e3e1 100644 --- a/website/deprecated/reference_ru.html +++ b/website/deprecated/reference_ru.html @@ -464,7 +464,7 @@ ClickHouse содержит настройки ограничения досту ===Установка из исходников=== -Для сборки воспользуйтесь инструкцией build.md (для Linux) или build_osx.md (для Mac OS X). 
+Для сборки воспользуйтесь инструкцией build.md (для Linux) или build_osx.md (для Mac OS X). Вы можете собрать пакеты и установить их. Также вы можете использовать программы без установки пакетов. @@ -564,7 +564,7 @@ Connected to ClickHouse server version 0.0.18749. Если вы сотрудник Яндекса, вы можете воспользоваться тестовыми данными Яндекс.Метрики для изучения возможностей системы. Как загрузить тестовые данные, написано здесь. -Если вы внешний пользователь системы, вы можете воспользоваться использовать общедоступные данные, способы загрузки которых указаны здесь. +Если вы внешний пользователь системы, вы можете воспользоваться использовать общедоступные данные, способы загрузки которых указаны здесь. ==Если возникли вопросы== diff --git a/website/index.html b/website/index.html index f2ef2f698f0..5c832745858 100644 --- a/website/index.html +++ b/website/index.html @@ -429,7 +429,7 @@ clickhouse-client target="_blank"> official Docker images of ClickHouse . Alternatively you can build ClickHouse from sources according to the Follow official Twitter account. -
  • Open a GitHub issue if you have a bug report or feature request.
  • Or email Yandex ClickHouse team directly at turn on JavaScript to see email address. @@ -476,7 +476,7 @@ clickhouse-client if you are interested and we'll get in touch. Short reports about previous meetups are published in official ClickHouse blog.


    ClickHouse source code is published under Apache 2.0 License. Software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

    @@ -486,7 +486,7 @@ clickhouse-client
    Fork me on GitHub
    @@ -494,7 +494,7 @@ clickhouse-client

    Contents

    Introduction

    -==What is ClickHouse?== -ClickHouse is a columnar DBMS for OLAP. - -In a "normal" row-oriented DBMS, data is stored in this order: - -
    -5123456789123456789     1       Eurobasket - Greece - Bosnia and Herzegovina - example.com      1       2011-09-01 01:03:02     6274717   1294101174      11409   612345678912345678      0       33      6       http://www.example.com/basketball/team/123/match/456789.html http://www.example.com/basketball/team/123/match/987654.html       0       1366    768     32      10      3183      0       0       13      0\0     1       1       0       0                       2011142 -1      0               0       01321     613     660     2011-09-01 08:01:17     0       0       0       0       utf-8   1466    0       0       0       5678901234567890123               277789954       0       0       0       0       0
    -5234985259563631958     0       Consulting, Tax assessment, Accounting, Law       1       2011-09-01 01:03:02     6320881   2111222333      213     6458937489576391093     0       3       2       http://www.example.ru/         0       800     600       16      10      2       153.1   0       0       10      63      1       1       0       0                       2111678 000       0       588     368     240     2011-09-01 01:03:17     4       0       60310   0       windows-1251    1466    0       000               778899001       0       0       0       0       0
    -...
    -
    - -In other words, all the values related to a row are stored next to each other. Examples of a row-oriented DBMS are MySQL, Postgres, MS SQL Server, and others. - -In a column-oriented DBMS, data is stored like this: - -
    -WatchID:    5385521489354350662     5385521490329509958     5385521489953706054     5385521490476781638     5385521490583269446     5385521490218868806     5385521491437850694   5385521491090174022      5385521490792669254     5385521490420695110     5385521491532181574     5385521491559694406     5385521491459625030     5385521492275175494   5385521492781318214      5385521492710027334     5385521492955615302     5385521493708759110     5385521494506434630     5385521493104611398
    -JavaEnable: 1       0       1       0       0       0       1       0       1       1       1       1       1       1       0       1       0       0       1       1
    -Title:      Yandex  Announcements - Investor Relations - Yandex     Yandex — Contact us — Moscow    Yandex — Mission        Ru      Yandex — History — History of Yandex    Yandex Financial Releases - Investor Relations - Yandex Yandex — Locations      Yandex Board of Directors - Corporate Governance - Yandex       Yandex — Technologies
    -GoodEvent:  1       1       1       1       1       1       1       1       1       1       1       1       1       1       1       1       1       1       1       1
    -EventTime:  2016-05-18 05:19:20     2016-05-18 08:10:20     2016-05-18 07:38:00     2016-05-18 01:13:08     2016-05-18 00:04:06     2016-05-18 04:21:30     2016-05-18 00:34:16     2016-05-18 07:35:49     2016-05-18 11:41:59     2016-05-18 01:13:32
    -...
    -
    - -These examples only show the order that data is arranged in. -The values from different columns are stored separately, and data from the same column is stored together. -Examples of a column-oriented DBMS: Vertica, Paraccel (Actian Matrix) (Amazon Redshift), Sybase IQ, Exasol, Infobright, InfiniDB, MonetDB (VectorWise) (Actian Vector), LucidDB, SAP HANA, Google Dremel, Google PowerDrill, Druid, kdb+ and others. - - -Different orders for storing data are better suited to different scenarios. -The data access scenario refers to what queries are made, how often, and in what proportion; how much data is read for each type of query - rows, columns, and bytes; the relationship between reading and updating data; the working size of the data and how locally it is used; whether transactions are used, and how isolated they are; requirements for data replication and logical integrity; requirements for latency and throughput for each type of query, and so on. - -The higher the load on the system, the more important it is to customize the system to the scenario, and the more specific this customization becomes. There is no system that is equally well-suited to significantly different scenarios. If a system is adaptable to a wide set of scenarios, under a high load, the system will handle all the scenarios equally poorly, or will work well for just one of the scenarios. - -We'll say that the following is true for the OLAP (online analytical processing) scenario: -- The vast majority of requests are for read access. -- Data is updated in fairly large batches (> 1000 rows), not by single rows; or it is not updated at all. -- Data is added to the DB but is not modified. -- For reads, quite a large number of rows are extracted from the DB, but only a small subset of columns. -- Tables are "wide," meaning they contain a large number of columns. -- Queries are relatively rare (usually hundreds of queries per server or less per second). -- For simple queries, latencies around 50 ms are allowed. -- Column values are fairly small - numbers and short strings (for example, 60 bytes per URL). -- Requires high throughput when processing a single query (up to billions of rows per second per server). -- There are no transactions. -- Low requirements for data consistency. -- There is one large table per query. All tables are small, except for one. -- A query result is significantly smaller than the source data. That is, data is filtered or aggregated. The result fits in a single server's RAM. - -It is easy to see that the OLAP scenario is very different from other popular scenarios (such as OLTP or Key-Value access). So it doesn't make sense to try to use OLTP or a Key-Value DB for processing analytical queries if you want to get decent performance. For example, if you try to use MongoDB or Elliptics for analytics, you will get very poor performance compared to OLAP databases. - -Columnar-oriented databases are better suited to OLAP scenarios (at least 100 times better in processing speed for most queries), for the following reasons: - -1. For I/O. -1.1. For an analytical query, only a small number of table columns need to be read. In a column-oriented database, you can read just the data you need. For example, if you need 5 columns out of 100, you can expect a 20-fold reduction in I/O. -1.2. Since data is read in packets, it is easier to compress. Data in columns is also easier to compress. This further reduces the I/O volume. -1.3. Due to the reduced I/O, more data fits in the system cache. 
- -For example, the query "count the number of records for each advertising platform" requires reading one "advertising platform ID" column, which takes up 1 byte uncompressed. If most of the traffic was not from advertising platforms, you can expect at least 10-fold compression of this column. When using a quick compression algorithm, data decompression is possible at a speed of at least several gigabytes of uncompressed data per second. In other words, this query can be processed at a speed of approximately several billion rows per second on a single server. This speed is actually achieved in practice. - - -
    -milovidov@████████.yandex.ru:~$ clickhouse-client
    -ClickHouse client version 0.0.52053.
    -Connecting to localhost:9000.
    -Connected to ClickHouse server version 0.0.52053.
    -
    -:) SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20
    -
    -SELECT
    -    CounterID,
    -    count()
    -FROM hits
    -GROUP BY CounterID
    -ORDER BY count() DESC
    -LIMIT 20
    -
    -┌─CounterID─┬──count()─┐
    -│    114208 │ 56057344 │
    -│    115080 │ 51619590 │
    -│      3228 │ 44658301 │
    -│     38230 │ 42045932 │
    -│    145263 │ 42042158 │
    -│     91244 │ 38297270 │
    -│    154139 │ 26647572 │
    -│    150748 │ 24112755 │
    -│    242232 │ 21302571 │
    -│    338158 │ 13507087 │
    -│     62180 │ 12229491 │
    -│     82264 │ 12187441 │
    -│    232261 │ 12148031 │
    -│    146272 │ 11438516 │
    -│    168777 │ 11403636 │
    -│   4120072 │ 11227824 │
    -│  10938808 │ 10519739 │
    -│     74088 │  9047015 │
    -│    115079 │  8837972 │
    -│    337234 │  8205961 │
    -└───────────┴──────────┘
    -
    -20 rows in set. Elapsed: 0.153 sec. Processed 1.00 billion rows, 4.00 GB (6.53 billion rows/s., 26.10 GB/s.)
    -
    -:)
    -
    - -2. For CPU. -Since executing a query requires processing a large number of rows, it helps to dispatch all operations for entire vectors instead of for separate rows, or to implement the query engine so that there is almost no dispatching cost. If you don't do this, with any half-decent disk subsystem, the query interpreter inevitably stalls the CPU. -It makes sense to both store data in columns and process it, when possible, by columns. - -There are two ways to do this: -1. A vector engine. All operations are written for vectors, instead of for separate values. This means you don't need to call operations very often, and dispatching costs are negligible. Operation code contains an optimized internal cycle. -2. Code generation. The code generated for the query has all the indirect calls in it. - -This is not done in "normal" databases, because it doesn't make sense when running simple queries. However, there are exceptions. For example, MemSQL uses code generation to reduce latency when processing SQL queries. (For comparison, analytical DBMSs require optimization of throughput, not latency.) - -Note that for CPU efficiency, the query language must be declarative (SQL or MDX), or at least a vector (J, K). The query should only contain implicit loops, allowing for optimization. - - -==Distinctive features of ClickHouse== - -1. True column-oriented DBMS. -2. Data compression. -3. Disk storage of data. -4. Parallel processing on multiple cores. -5. Distributed processing on multiple servers. -6. SQL support. -7. Vector engine. -8. Real-time data updates. -9. Indexes. -10. Suitable for online queries. -11. Support for approximated calculations. -12. Support for nested data structures. Support for arrays as data types. -13. Support for restrictions on query complexity, along with quotas. -14. Data replication and support for data integrity on replicas. - - -Let's look at some of these features in detail. - -

    1. True column-oriented DBMS.

    - -In a true column-oriented DBMS, there isn't any "garbage" stored with the values. For example, constant-length values must be supported, to avoid storing their length "number" next to the values. As an example, a billion UInt8-type values should actually consume around 1 GB uncompressed, or this will strongly affect the CPU use. It is very important to store data compactly (without any "garbage") even when uncompressed, since the speed of decompression (CPU usage) depends mainly on the volume of uncompressed data. - -This is worth noting because there are systems that can store values of separate columns separately, but that can't effectively process analytical queries due to their optimization for other scenarios. Examples are HBase, BigTable, Cassandra, and HyperTable. In these systems, you will get throughput around a hundred thousand rows per second, but not hundreds of millions of rows per second. - -Also note that ClickHouse is a DBMS, not a single database. ClickHouse allows creating tables and databases at runtime, loading data, and running queries without reconfiguring and restarting the server. -
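As a rough illustration of the last point, a minimal sketch of creating a database and table at runtime and using them immediately, with no configuration change or server restart; the `sandbox.events` names and columns are hypothetical:

```
CREATE DATABASE IF NOT EXISTS sandbox;

CREATE TABLE IF NOT EXISTS sandbox.events
(
    event_date Date,
    user_id    UInt64,
    message    String
) ENGINE = Log;

INSERT INTO sandbox.events VALUES ('2019-01-16', 1, 'hello');

SELECT count() FROM sandbox.events;
```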

    2. Data compression.

    - -Some column-oriented DBMSs (InfiniDB CE and MonetDB) do not use data compression. However, data compression really improves performance. - -

    3. Disk storage of data.

    - -Many column-oriented DBMSs (SAP HANA, and Google PowerDrill) can only work in RAM. But even on thousands of servers, the RAM is too small for storing all the pageviews and sessions in Yandex.Metrica. - -

    4. Parallel processing on multiple cores.

    - -Large queries are parallelized in a natural way. - -

    5. Distributed processing on multiple servers.

    - -Almost none of the columnar DBMSs listed above have support for distributed processing. -In ClickHouse, data can reside on different shards. Each shard can be a group of replicas that are used for fault tolerance. The query is processed on all the shards in parallel. This is transparent for the user. - -

    6. SQL support.

- -If you are familiar with standard SQL, we can't really talk about full SQL support: NULLs are not supported, and all the functions have different names. However, this is a declarative query language based on SQL that is indistinguishable from SQL in many instances. -JOINs are supported. Subqueries are supported in the FROM, IN, and JOIN clauses, as well as scalar subqueries. -Correlated subqueries are not supported. - -
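As a short illustration of the supported subquery forms (a sketch only; the table names hits_all and known_users are hypothetical and not taken from this document):

%%
SELECT CounterID, count() AS cnt
FROM (SELECT CounterID, UserID FROM hits_all)       -- subquery in FROM
WHERE UserID IN (SELECT UserID FROM known_users)    -- subquery in IN
GROUP BY CounterID
ORDER BY cnt DESC
LIMIT 10
%%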

    7. Vector engine.

    - -Data is not only stored by columns, but is processed by vectors - parts of columns. This allows us to achieve high CPU performance. - -

    8. Real-time data updates.

    - -ClickHouse supports primary key tables. In order to quickly perform queries on the range of the primary key, the data is sorted incrementally using the merge tree. Due to this, data can continually be added to the table. There is no locking when adding data. - -

    9. Indexes.

- -Having a primary key allows, for example, extracting data for specific clients (Metrica counters) for a specific time range, with low latency of less than a few dozen milliseconds. - -
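For example, assuming a hypothetical hits table whose primary key is (CounterID, EventDate), a query of the following shape only reads the narrow primary key range it needs:

%%
SELECT count()
FROM hits
WHERE CounterID = 34
    AND EventDate >= toDate('2014-03-17') AND EventDate <= toDate('2014-03-23')
%%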

    10. Suitable for online queries.

    - -This lets us use the system as the back-end for a web interface. Low latency means queries can be processed without delay, while the Yandex.Metrica interface page is loading (in online mode). - -

    11. Support for approximated calculations.

    - -1. The system contains aggregate functions for approximated calculation of the number of various values, medians, and quantiles. -2. Supports running a query based on a part (sample) of data and getting an approximated result. In this case, proportionally less data is retrieved from the disk. -3. Supports running an aggregation for a limited number of random keys, instead of for all keys. Under certain conditions for key distribution in the data, this provides a reasonably accurate result while using fewer resources. - -
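A minimal sketch of all three techniques in one query, assuming a hypothetical visits table that was created with a sampling expression:

%%
SELECT
    uniq(UserID) AS approx_unique_users,           -- approximate count of distinct values
    quantile(0.5)(Duration) AS median_duration     -- approximate median
FROM visits
SAMPLE 0.1                                         -- read roughly a tenth of the data
WHERE EventDate = today()
%%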

    14. Data replication and support for data integrity on replicas.

    - -Uses asynchronous multimaster replication. After being written to any available replica, data is distributed to all the remaining replicas. The system maintains identical data on different replicas. Data is restored automatically after a failure, or using a "button" for complex cases. -For more information, see the section "Data replication". - -==ClickHouse features that can be considered disadvantages== - -1. No transactions. - -2. For aggregation, query results must fit in the RAM on a single server. However, the volume of source data for a query may be indefinitely large. - -3. Lack of full-fledged UPDATE/DELETE implementation. - -==The Yandex.Metrica task== - -We need to get custom reports based on hits and sessions, with custom segments set by the user. Data for the reports is updated in real-time. Queries must be run immediately (in online mode). We must be able to build reports for any time period. Complex aggregates must be calculated, such as the number of unique visitors. -At this time (April 2014), Yandex.Metrica receives approximately 12 billion events (pageviews and mouse clicks) daily. All these events must be stored in order to build custom reports. A single query may require scanning hundreds of millions of rows over a few seconds, or millions of rows in no more than a few hundred milliseconds. - -===Aggregated and non-aggregated data=== - -There is a popular opinion that in order to effectively calculate statistics, you must aggregate data, since this reduces the volume of data. - -But data aggregation is a very limited solution, for the following reasons: -- You must have a pre-defined list of reports the user will need. The user can't make custom reports. -- When aggregating a large quantity of keys, the volume of data is not reduced, and aggregation is useless. -- For a large number of reports, there are too many aggregation variations (combinatorial explosion). -- When aggregating keys with high cardinality (such as URLs), the volume of data is not reduced by much (less than twofold). For this reason, the volume of data with aggregation might grow instead of shrink. -- Users will not view all the reports we calculate for them. A large portion of calculations are useless. -- The logical integrity of data may be violated for various aggregations. - -If we do not aggregate anything and work with non-aggregated data, this might actually reduce the volume of calculations. - -However, with aggregation, a significant part of the work is taken offline and completed relatively calmly. In contrast, online calculations require calculating as fast as possible, since the user is waiting for the result. - -Yandex.Metrica has a specialized system for aggregating data called Metrage, which is used for the majority of reports. Starting in 2009, Yandex.Metrica also used a specialized OLAP database for non-aggregated data called OLAPServer, which was previously used for the report builder. OLAPServer worked well for non-aggregated data, but it had many restrictions that did not allow it to be used for all reports as desired. These included the lack of support for data types (only numbers), and the inability to incrementally update data in real-time (it could only be done by rewriting data daily). OLAPServer is not a DBMS, but a specialized DB. - -To remove the limitations of OLAPServer and solve the problem of working with non-aggregated data for all reports, we developed the ClickHouse DBMS. 
- -==Usage in Yandex.Metrica and other Yandex services== - -ClickHouse is used for multiple purposes in Yandex.Metrica. Its main task is to build reports in online mode using non-aggregated data. It uses a cluster of 374 servers, which store over 20.3 trillion rows in the database. The volume of compressed data, without counting duplication and replication, is about 2 PB. The volume of uncompressed data (in TSV format) would be approximately 17 PB. - -ClickHouse is also used for: -- Storing WebVisor data. -- Processing intermediate data. -- Building global reports with Analytics. -- Running queries for debugging the Metrica engine. -- Analyzing logs from the API and the user interface. - - -ClickHouse has at least a dozen installations in other Yandex services: in search verticals, Market, Direct, business analytics, mobile development, AdFox, personal services, and others. - - -==Possible counterparts== - -There are no analogs to ClickHouse available. -At this time (May 2016), there aren't any available open-source and free systems that have all the features listed above. However, these features are absolutely necessary for Yandex.Metrica. - - -==Possible silly questions== - -

1. Why not use systems like map-reduce?

- -Systems like map-reduce are distributed computing systems, where the reduce phase is performed using distributed sorting. -In this respect, map-reduce is similar to other systems like YAMR, Hadoop, and YT. - -These systems are not suitable for online queries because of their high latency, so they can't be used as the backend for a web interface. -Such systems are also not suitable for real-time data updates. -Distributed sorting is not the optimal way to perform reduce operations if the result of the operation and all intermediate results (if any) fit in the RAM of a single server, as is usually the case for online analytical queries. -In this case, the optimal way to perform reduce operations is with a hash table. A common optimization for map-reduce tasks is the combine operation (partial reduce), which uses in-memory hash tables. This optimization is done manually by the user. -Distributed sorting is the main reason for the long latencies of simple map-reduce jobs. - -Systems similar to map-reduce allow running arbitrary code on the cluster. But for OLAP use cases, declarative query languages are better suited, since they allow investigations to be carried out faster. For example, for Hadoop there are Hive and Pig. There are others: Cloudera Impala, Shark (deprecated) and Spark SQL for Spark, Presto, Apache Drill. -However, the performance of such tasks is highly sub-optimal compared to specialized systems, and their relatively high latency does not allow using them as the backend for a web interface. -YT allows you to store separate groups of columns. But YT is not a truly columnar storage system, as the system has no fixed-length data types (so a number cannot be stored efficiently, without "garbage"), and there is no vector engine. Tasks in YT are performed by arbitrary code in streaming mode, so they cannot be sufficiently optimized (up to hundreds of millions of rows per second per server). In 2014-2016, YT is developing "dynamic table sorting" functionality using a merge tree, strongly typed values, and SQL-like language support. Dynamically sorted tables are not suited for OLAP tasks, since the data is stored in rows. Query language development in YT is still in the incubation phase, which does not allow it to focus on this functionality. YT developers are considering dynamically sorted tables for use in OLTP and key-value scenarios. - -==Performance== - -According to internal testing results, ClickHouse shows the best performance for comparable operating scenarios among systems of its class that were available for testing. This includes the highest throughput for long queries, and the lowest latency on short queries. Testing results are shown on this page. - - -===Throughput for a single large query=== - -Throughput can be measured in rows per second or in megabytes per second. If the data is placed in the page cache, a query that is not too complex is processed on modern hardware at a speed of approximately 2-10 GB/s of uncompressed data on a single server (for the simplest cases, the speed may reach 30 GB/s). If data is not placed in the page cache, the speed depends on the disk subsystem and the data compression rate. For example, if the disk subsystem allows reading data at 400 MB/s, and the data compression rate is 3, the speed will be around 1.2 GB/s. To get the speed in rows per second, divide the speed in bytes per second by the total size of the columns used in the query.
For example, if 10 bytes of columns are extracted, the speed will be around 100-200 million rows per second. - -The processing speed increases almost linearly for distributed processing, but only if the number of rows resulting from aggregation or sorting is not too large. - -===Latency when processing short queries.=== - -If a query uses a primary key and does not select too many rows to process (hundreds of thousands), and does not use too many columns, we can expect less than 50 milliseconds of latency (single digits of milliseconds in the best case) if data is placed in the page cache. Otherwise, latency is calculated from the number of seeks. If you use rotating drives, for a system that is not overloaded, the latency is calculated by this formula: seek time (10 ms) * number of columns queried * number of data parts. - -===Throughput when processing a large quantity of short queries.=== - -Under the same conditions, ClickHouse can handle several hundred queries per second on a single server (up to several thousand in the best case). Since this scenario is not typical for analytical DBMSs, we recommend expecting a maximum of 100 queries per second. - -===Performance on data insertion.=== - -We recommend inserting data in packets of at least 1000 rows, or no more than a single request per second. When inserting to a MergeTree table from a tab-separated dump, the insertion speed will be from 50 to 200 MB/s. If the inserted rows are around 1 Kb in size, the speed will be from 50,000 to 200,000 rows per second. If the rows are small, the performance will be higher in rows per second (on Yandex Banner System data -> 500,000 rows per second, on Graphite data -> 1,000,000 rows per second). To improve performance, you can make multiple INSERT queries in parallel, and performance will increase linearly. - -
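To illustrate the seek-based latency formula from the section above with hypothetical numbers: reading 5 columns from 4 data parts on rotating drives gives roughly 10 ms * 5 * 4 = 200 ms of expected latency.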
    - -
    -

    Getting started

    -
    - -
- -==System requirements== - -This is not a cross-platform system. It requires Linux Ubuntu Precise (12.04) or newer, with the x86_64 architecture and support for the SSE 4.2 instruction set. -To test for SSE 4.2 support, run: -%%grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported"%% - -We recommend using Ubuntu Trusty, Ubuntu Xenial, or Ubuntu Precise. -The terminal must use UTF-8 encoding (the default in Ubuntu). - - -==Installation== - -For testing and development, the system can be installed on a single server or on a desktop computer. - - -===Installing from packages=== - -In %%/etc/apt/sources.list%% (or in a separate %%/etc/apt/sources.list.d/clickhouse.list%% file), add the repository: - -On Ubuntu Trusty (14.04): - -%% -deb http://repo.yandex.ru/clickhouse/trusty stable main -%% - -For other Ubuntu versions, replace %%trusty%% with %%xenial%% or %%precise%%. - -Then run: - -%% -sudo apt-key adv --keyserver keyserver.ubuntu.com --recv E0C56BD4 # optional -sudo apt-get update -sudo apt-get install -y clickhouse-client clickhouse-server -%% - -You can also download and install packages manually from here: -http://repo.yandex.ru/clickhouse/trusty/pool/main/c/clickhouse/, -http://repo.yandex.ru/clickhouse/xenial/pool/main/c/clickhouse/, -http://repo.yandex.ru/clickhouse/precise/pool/main/c/clickhouse/. - -ClickHouse contains access restriction settings. They are located in the 'users.xml' file (next to 'config.xml'). -By default, access is allowed from everywhere for the default user without a password. See 'user/default/networks'. For more information, see the section "Configuration files". - - -===Installing from source=== - -To build, follow the instructions in build.md (for Linux) or in build_osx.md (for Mac OS X). - -You can compile packages and install them. You can also use programs without installing packages. - -Client: src/dbms/programs/clickhouse-client -Server: src/dbms/programs/clickhouse-server - -For the server, create a directory for data, such as: - -%% -/var/lib/clickhouse/data/default/ -/var/lib/clickhouse/metadata/default/ -%% - -(Configured in the server config.) -Run 'chown' for the desired user. - -Note the path to logs in the server config (src/dbms/programs/server/config.xml). - - -===Other methods of installation=== - -The Docker image is located here: https://hub.docker.com/r/yandex/clickhouse-server/ - -There is a Gentoo overlay located here: https://github.com/kmeaw/clickhouse-overlay - - -===Launch=== - -To start the server (as a daemon), run: - -
    -sudo service clickhouse-server start
    -
    - -View the logs in the catalog - -%% -/var/log/clickhouse-server/ -%% - -If the server doesn't start, check the configurations in the file - -%% -/etc/clickhouse-server/config.xml -%% - -You can also launch the server from the console: - -
    -clickhouse-server --config-file=/etc/clickhouse-server/config.xml
    -
    - -In this case, the log will be printed to the console, which is convenient during development. If the configuration file is in the current directory, you don't need to specify the '--config-file' parameter. By default, it uses './config.xml'. - -You can use the command-line client to connect to the server: - -
    -clickhouse-client
    -
    - -The default parameters indicate connecting with localhost:9000 on behalf of the user 'default' without a password. -The client can be used for connecting to a remote server. For example: - -
    -clickhouse-client --host=example.com
    -
    - -For more information, see the section "Command-line client". - -Checking the system: - -
    -milovidov@milovidov-Latitude-E6320:~/work/metrica/src/dbms/src/Client$ ./clickhouse-client
    -ClickHouse client version 0.0.18749.
    -Connecting to localhost:9000.
    -Connected to ClickHouse server version 0.0.18749.
    -
    -:) SELECT 1
    -
    -SELECT 1
    -
    -┌─1─┐
    -│ 1 │
    -└───┘
    -
    -1 rows in set. Elapsed: 0.003 sec.
    -
    -:)
    -
- -Congratulations, it works! - -==Test data== - -If you are a Yandex employee, you can use Yandex.Metrica test data to explore the system's capabilities. You can find instructions for using the test data here. - -Otherwise, you could use one of the available public datasets, described here. - - -==If you have questions== - -If you are a Yandex employee, use the internal ClickHouse mailing list. -You can subscribe to this list to get announcements, information on new developments, and questions that other users have. - -Otherwise, you could ask questions on Stack Overflow, discuss in Google Groups, or send a private message to the developers at clickhouse-feedback@yandex-team.com. - - - -
    -
    -

    Interfaces

    -
    - -
- -To explore the system's capabilities, download data to tables, or make manual queries, use the clickhouse-client program. - -To enable remote access to the server, add the following to config.xml: -
    <listen_host>::</listen_host>
    - -==HTTP interface== - - -The HTTP interface lets you use ClickHouse on any platform from any programming language. We use it for working from Java and Perl, as well as shell scripts. In other departments, the HTTP interface is used from Perl, Python, and Go. The HTTP interface is more limited than the native interface, but it has better compatibility. - -By default, clickhouse-server listens for HTTP on port 8123 (this can be changed in the config). -If you make a GET / request without parameters, it returns the string "Ok" (with a line break at the end). You can use this in health-check scripts. - -
    -$ curl 'http://localhost:8123/'
    -Ok.
    -
- -Send the request as a URL 'query' parameter, or as a POST. Or send the beginning of the request in the 'query' parameter, and the rest in the POST (we'll explain later why this is necessary). The URL length is limited to 16 KB, so take this limit into account when sending long queries in the 'query' parameter. - -If successful, you receive the 200 response code and the result in the response body. -If an error occurs, you receive the 500 response code and an error description text in the response body. - -When using the GET method, 'readonly' is set. In other words, for queries that modify data, you can only use the POST method. You can send the query itself either in the POST body, or in the URL parameter. - -Examples: - -
    -$ curl 'http://localhost:8123/?query=SELECT%201'
    -1
    -
    -$ wget -O- -q 'http://localhost:8123/?query=SELECT 1'
    -1
    -
    -$ GET 'http://localhost:8123/?query=SELECT 1'
    -1
    -
    -$ echo -ne 'GET /?query=SELECT%201 HTTP/1.0\r\n\r\n' | nc localhost 8123
    -HTTP/1.0 200 OK
    -Connection: Close
    -Date: Fri, 16 Nov 2012 19:21:50 GMT
    -
    -1
    -
    - -As you can see, curl is not very convenient because spaces have to be URL-escaped. Although wget escapes everything on its own, we don't recommend it because it doesn't work well over HTTP 1.1 when using keep-alive and Transfer-Encoding: chunked. - -
    -$ echo 'SELECT 1' | curl 'http://localhost:8123/' --data-binary @-
    -1
    -
    -$ echo 'SELECT 1' | curl 'http://localhost:8123/?query=' --data-binary @-
    -1
    -
    -$ echo '1' | curl 'http://localhost:8123/?query=SELECT' --data-binary @-
    -1
    -
    - -If part of the query is sent in the parameter, and part in the POST, a line break is inserted between these two data parts. -Example (this won't work): - -
    -$ echo 'ECT 1' | curl 'http://localhost:8123/?query=SEL' --data-binary @-
    -Code: 59, e.displayText() = DB::Exception: Syntax error: failed at position 0: SEL
    -ECT 1
    -, expected One of: SHOW TABLES, SHOW DATABASES, SELECT, INSERT, CREATE, ATTACH, RENAME, DROP, DETACH, USE, SET, OPTIMIZE., e.what() = DB::Exception
    -
    - -By default, data is returned in TabSeparated format (for more information, see the "Formats" section). -You use the FORMAT clause of the query to request any other format. - -
    -$ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @-
    -┏━━━┓
    -┃ 1 ┃
    -┡━━━┩
    -│ 1 │
    -└───┘
    -
    - -The POST method of transmitting data is necessary for INSERT queries. In this case, you can write the beginning of the query in the URL parameter, and use POST to pass the data to insert. The data to insert could be, for example, a tab-separated dump from MySQL. In this way, the INSERT query replaces LOAD DATA LOCAL INFILE from MySQL. - -Examples: - -Creating a table: - -
    -echo 'CREATE TABLE t (a UInt8) ENGINE = Memory' | POST 'http://localhost:8123/'
    -
    - -Using the familiar INSERT query for data insertion: - -
    -echo 'INSERT INTO t VALUES (1),(2),(3)' | POST 'http://localhost:8123/'
    -
    - -Data can be sent separately from the query: - -
    -echo '(4),(5),(6)' | POST 'http://localhost:8123/?query=INSERT INTO t VALUES'
    -
    - -You can specify any data format. The 'Values' format is the same as what is used when writing INSERT INTO t VALUES: - -
    -echo '(7),(8),(9)' | POST 'http://localhost:8123/?query=INSERT INTO t FORMAT Values'
    -
    - -To insert data from a tab-separated dump, specify the corresponding format: - -
    -echo -ne '10\n11\n12\n' | POST 'http://localhost:8123/?query=INSERT INTO t FORMAT TabSeparated'
    -
    - -Reading the table contents. Data is output in random order due to parallel query processing: - -
    -$ GET 'http://localhost:8123/?query=SELECT a FROM t'
    -7
    -8
    -9
    -10
    -11
    -12
    -1
    -2
    -3
    -4
    -5
    -6
    -
    - -Deleting the table. - -
    -echo 'DROP TABLE t' | POST 'http://localhost:8123/'
    -
    - -For successful requests that don't return a data table, an empty response body is returned. - -You can use compression when transmitting data. The compressed data has a non-standard format, and you will need to use a special clickhouse-compressor program to work with it (%%sudo apt-get install clickhouse-utils%%). - -If you specified 'compress=1' in the URL, the server will compress the data it sends you. -If you specified 'decompress=1' in the URL, the server will decompress the same data that you pass in the POST method. - -You can use this to reduce network traffic when transmitting a large amount of data, or for creating dumps that are immediately compressed. - -You can use the 'database' URL parameter to specify the default database. - -
    -$ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?database=system' --data-binary @-
    -0
    -1
    -2
    -3
    -4
    -5
    -6
    -7
    -8
    -9
    -
- -By default, the database that is registered in the server settings is used as the default database. By default, this is the database called 'default'. Alternatively, you can always specify the database using a dot before the table name. - -The username and password can be indicated in one of three ways: -1. Using HTTP Basic Authentication. Example: -
    -echo 'SELECT 1' | curl 'http://user:password@localhost:8123/' -d @-
    -
    -2. In the 'user' and 'password' URL parameters. Example: -
    -echo 'SELECT 1' | curl 'http://localhost:8123/?user=user&password=password' -d @-
    -
    -3. Using 'X-ClickHouse-User' and 'X-ClickHouse-Key' headers. Example: -
    -echo 'SELECT 1' | curl -H "X-ClickHouse-User: user" -H "X-ClickHouse-Key: password"  'http://localhost:8123/' -d @-
    -
    -If the user name is not indicated, the username 'default' is used. If the password is not indicated, an empty password is used. - - -You can also use the URL parameters to specify any settings for processing a single query, or entire profiles of settings. Example: - -%%http://localhost:8123/?profile=web&max_rows_to_read=1000000000&query=SELECT+1%% - -For more information, see the section "Settings". - -
    -$ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:8123/?' --data-binary @-
    -0
    -1
    -2
    -3
    -4
    -5
    -6
    -7
    -8
    -9
    -
- -For information about other parameters, see the section "SET". - -In contrast to the native interface, the HTTP interface does not support the concept of sessions or session settings, does not allow aborting a query (to be exact, it allows this in only a few cases), and does not show the progress of query processing. Parsing and data formatting are performed on the server side, and using the network might be inefficient. - -The optional 'query_id' parameter can be passed as the query ID (any string). For more information, see the section "Settings, replace_running_query". - -The optional 'quota_key' parameter can be passed as the quota key (any string). It can also be passed as the 'X-ClickHouse-Quota' header. For more information, see the section "Quotas". - -The HTTP interface allows passing external data (external temporary tables) for querying. For more information, see the section "External data for query processing". - - -==JDBC driver== - -There is an official JDBC driver for ClickHouse. See here. - - -==Third-party client libraries== - -There exist third-party client libraries for Python (1, 2), PHP (1, 2, 3), Go (1, 2), Node.js (1, 2), Perl (1, 2, 3), Ruby (1), R (1), .NET (1), C++ (1). - -These libraries were not tested by us. The ordering is arbitrary. - - -==Third-party GUI== - -There is an open-source project, Tabix, by the SMI2 company, which implements a graphical web interface for ClickHouse. - -Tabix key features: -- works with ClickHouse directly from the browser, without installing additional software; -- a query editor that supports highlighting of ClickHouse SQL syntax, auto-completion for all objects (including dictionaries), and context-sensitive help for built-in functions; -- graphs, charts, and geo-referenced maps for query results; -- an interactive pivot table designer for query results; -- graphical tools for ClickHouse analysis; -- two color themes: light and dark. - -Tabix documentation - -==Native interface (TCP)== - -The native interface is used in the "clickhouse-client" command-line client, for interaction between servers during distributed query processing, and also in C++ programs. We will only cover the command-line client. - -==Command-line client== - -
    -$ clickhouse-client
    -ClickHouse client version 0.0.26176.
    -Connecting to localhost:9000.
    -Connected to ClickHouse server version 0.0.26176.
    -
    -:) SELECT 1
    -
    - -The "clickhouse-client" program accepts the following parameters, which are all optional: - ---host, -h - server name, by default - 'localhost'. -You can use either the name or the IPv4 or IPv6 address. - ---port - The port to connect to, by default - '9000'. -Note that the HTTP interface and the native interface use different ports. - ---user, -u - The username, by default - 'default'. - ---password - The password, by default - empty string. - ---query, -q - Query to process when using non-interactive mode. - ---database, -d - Select the current default database, by default - the current DB from the server settings (by default, the 'default' DB). - ---multiline, -m - If specified, allow multiline queries (do not send request on Enter). - ---multiquery, -n - If specified, allow processing multiple queries separated by semicolons. -Only works in non-interactive mode. - ---format, -f - Use the specified default format to output the result. - ---vertical, -E - If specified, use the Vertical format by default to output the result. This is the same as '--format=Vertical'. In this format, each value is printed on a separate line, which is helpful when displaying wide tables. - ---time, -t - If specified, print the query execution time to 'stderr' in non-interactive mode. - ---stacktrace - If specified, also prints the stack trace if an exception occurs. - ---config-file - Name of the configuration file that has additional settings or changed defaults for the settings listed above. -By default, files are searched for in this order: -./clickhouse-client.xml -~/.clickhouse-client/config.xml -/etc/clickhouse-client/config.xml -Settings are only taken from the first file found. - -You can also specify any settings that will be used for processing queries. For example, %%clickhouse-client --max_threads=1%%. For more information, see the section "Settings". - -The client can be used in interactive and non-interactive (batch) mode. -To use batch mode, specify the 'query' parameter, or send data to 'stdin' (it verifies that 'stdin' is not a terminal), or both. -Similar to the HTTP interface, when using the 'query' parameter and sending data to 'stdin', the request is a concatenation of the 'query' parameter, a line break, and the data in 'stdin'. This is convenient for large INSERT queries. - -Examples for insert data via clickhouse-client: - -%% -echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; - -cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; -3, 'some text', '2016-08-14 00:00:00' -4, 'some more text', '2016-08-14 00:00:01' -_EOF - -cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; -%% - -In batch mode, the default data format is TabSeparated. You can set the format in the FORMAT clause of the query. - -By default, you can only process a single query in batch mode. To make multiple queries from a "script," use the 'multiquery' parameter. This works for all queries except INSERT. Query results are output consecutively without additional separators. -Similarly, to process a large number of queries, you can run 'clickhouse-client' for each query. Note that it may take tens of milliseconds to launch the 'clickhouse-client' program. - -In interactive mode, you get a command line where you can enter queries. - -If 'multiline' is not specified (the default): -To run a query, press Enter. 
The semicolon is not necessary at the end of the query. To enter a multiline query, enter a backslash %%\%% before the line break - after you press Enter, you will be asked to enter the next line of the query. - -If 'multiline' is specified: -To run a query, end it with a semicolon and press Enter. If the semicolon was omitted at the end of the entered line, you will be asked to enter the next line of the query. - -Only a single query is run, so everything after the semicolon is ignored. - -You can specify %%\G%% instead of or after the semicolon. This indicates using Vertical format. In this format, each value is printed on a separate line, which is convenient for wide tables. This unusual feature was added for compatibility with the MySQL CLI. - -The command line is based on 'readline' (and 'history') (or 'libedit', or even nothing, depending on build). In other words, it uses the familiar keyboard shortcuts and keeps a history. The history is written to /.clickhouse-client-history. - -By default, the format used is PrettyCompact. You can change the format in the FORMAT clause of the query, or by specifying '\G' at the end of the query, using the '--format' or '--vertical' argument in the command line, or using the client configuration file. - -To exit the client, press Ctrl+D (or Ctrl+C), or enter one of the following : -"exit", "quit", "logout", "учше", "йгше", "дщпщге", "exit;", "quit;", "logout;", "учшеж", "йгшеж", "дщпщгеж", "q", "й", "\q", "\Q", ":q", "\й", "\Й", "Жй" - -When processing a query, the client shows: -1. Progress, which is updated no more than 10 times per second (by default). For quick queries, the progress might not have time to be displayed. -2. The formatted query after parsing, for debugging. -3. The result in the specified format. -4. The number of lines in the result, the time passed, and the average speed of query processing. - -To cancel a lengthy query, press Ctrl+C. However, you will still need to wait a little for the server to abort the request. It is not possible to cancel a query at certain stages. If you don't wait and press Ctrl+C a second time, the client will exit. - -The command-line client allows passing external data (external temporary tables) for querying. For more information, see the section "External data for request processing". - - -
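As a small illustration of changing the output format from within the query itself (any query works; system.numbers is used here only because it is a built-in table):

%%
SELECT number, number * 2 AS doubled
FROM system.numbers
LIMIT 3
FORMAT Vertical
%%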
    -
    -

    Query language

    -
    - -
    - -==Syntax== - -There are two types of parsers in the system: a full SQL parser (a recursive descent parser), and a data format parser (a fast stream parser). In all cases except the INSERT query, only the full SQL parser is used. -The INSERT query uses both parsers: - -%%INSERT INTO t VALUES (1, 'Hello, world'), (2, 'abc'), (3, 'def')%% - -The %%INSERT INTO t VALUES%% fragment is parsed by the full parser, and the data %%(1, 'Hello, world'), (2, 'abc'), (3, 'def')%% is parsed by the fast stream parser. -Data can have any format. When a query is received, the server calculates no more than 'max_query_size' bytes of the request in RAM (by default, 1 MB), and the rest is stream parsed. This means the system doesn't have problems with large INSERT queries, like MySQL does. - -When using the Values format in an INSERT query, it may seem that data is parsed the same as expressions in a SELECT query, but this is not true. The Values format is much more limited. - -Next we will cover the full parser. For more information about format parsers, see the section "Formats". - -===Spaces=== - -There may be any number of space symbols between syntactical constructions (including the beginning and end of a query). Space symbols include the space, tab, line break, CR, and form feed. - -===Comments=== - -SQL-style and C-style comments are supported. -SQL-style comments: from %%--%% to the end of the line. The space after %%--%% can be omitted. -C-style comments: from %%/*%% to %%*/%%. These comments can be multiline. Spaces are not required here, either. - -===Keywords=== - -Keywords (such as SELECT) are not case-sensitive. Everything else (column names, functions, and so on), in contrast to standard SQL, is case-sensitive. Keywords are not reserved (they are just parsed as keywords in the corresponding context). - -===Identifiers=== - -Identifiers (column names, functions, and data types) can be quoted or non-quoted. -Non-quoted identifiers start with a Latin letter or underscore, and continue with a Latin letter, underscore, or number. In other words, they must match the regex %%^[a-zA-Z_][0-9a-zA-Z_]*$%%. Examples: %%x%%, %%_1%%, %%X_y__Z123_%%. -Quoted identifiers are placed in reversed quotation marks %%`id`%% (the same as in MySQL), and can indicate any set of bytes (non-empty). In addition, symbols (for example, the reverse quotation mark) inside this type of identifier can be backslash-escaped. Escaping rules are the same as for string literals (see below). -We recommend using identifiers that do not need to be quoted. - -===Literals=== - -There are numeric literals, string literals, and compound literals. - -

    Numeric literals

- -The system attempts to parse a numeric literal: -- first as a 64-bit signed number, using the 'strtoll' function. -- if unsuccessful, as a 64-bit unsigned number, using the 'strtoull' function. -- if unsuccessful, as a floating-point number using the 'strtod' function. -- otherwise, an error is returned. - -The corresponding value will have the smallest type that the value fits in. -For example, 1 is parsed as UInt8, but 256 is parsed as UInt16. For more information, see "Data types". - -Examples: %%1%%, %%18446744073709551615%%, %%0xDEADBEEF%%, %%01%%, %%0.1%%, %%1e100%%, %%-1e-100%%, %%inf%%, %%nan%%. - -
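One way to check how a literal is typed is the toTypeName function (a sketch; the resulting types follow the rules above):

%%
SELECT
    toTypeName(1),     -- UInt8
    toTypeName(256),   -- UInt16
    toTypeName(-1),    -- Int8
    toTypeName(0.1)    -- Float64
%%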

    String literals

- -Only string literals in single quotes are supported. The enclosed characters can be backslash-escaped. The following escape sequences have special meanings: %%\b%%, %%\f%%, %%\r%%, %%\n%%, %%\t%%, %%\0%%, %%\a%%, %%\v%%, %%\xHH%%. In all other cases, escape sequences like %%\c%%, where c is any character, are transformed to c. This means that the sequences %%\'%% and %%\\%% can be used. The value will have the String type. - -The minimum set of characters that must be escaped in a string literal is %%'%% and %%\%%. - -
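A short sketch of the escaping rules described above:

%%
SELECT
    'It\'s escaped',      -- escaped single quote
    'backslash: \\',      -- escaped backslash
    'tab:\there',         -- \t becomes a tab character
    '\x41'                -- \xHH escape; yields the character 'A'
%%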

    Compound literals

    - -Constructions are supported for arrays: %%[1, 2, 3]%% and tuples: %%(1, 'Hello, world!', 2)%%. -Actually, these are not literals, but expressions with the array creation operator and the tuple creation operator, respectively. For more information, see the section "Operators2". -An array must consist of at least one item, and a tuple must have at least two items. -Tuples have a special purpose for use in the IN clause of a SELECT query. Tuples can be obtained as the result of a query, but they can't be saved to a database (with the exception of Memory-type tables). - -===Functions=== - -Functions are written like an identifier with a list of arguments (possibly empty) in brackets. In contrast to standard SQL, the brackets are required, even for an empty arguments list. Example: %%now()%%. -There are regular and aggregate functions (see the section "Aggregate functions"). Some aggregate functions can contain two lists of arguments in brackets. Example: %%quantile(0.9)(x)%%. These aggregate functions are called "parametric" functions, and the arguments in the first list are called "parameters". The syntax of aggregate functions without parameters is the same as for regular functions. - -===Operators=== - -Operators are converted to their corresponding functions during query parsing, taking their priority and associativity into account. -For example, the expression %%1 + 2 * 3 + 4%% is transformed to %%plus(plus(1, multiply(2, 3)), 4)%%. -For more information, see the section "Operators2" below. - -===Data types and database table engines=== - -Data types and table engines in the CREATE query are written the same way as identifiers or functions. In other words, they may or may not contain an arguments list in brackets. For more information, see the sections "Data types," "Table engines," and "CREATE". - -===Synonyms=== - -In the SELECT query, expressions can specify synonyms using the AS keyword. Any expression is placed to the left of AS. The identifier name for the synonym is placed to the right of AS. As opposed to standard SQL, synonyms are not only declared on the top level of expressions: - -%%SELECT (1 AS n) + 2, n%% - -In contrast to standard SQL, synonyms can be used in all parts of a query, not just SELECT. - -===Asterisk=== - -In a SELECT query, an asterisk can replace the expression. For more information, see the section "SELECT". - -===Expressions=== - -An expression is a function, identifier, literal, application of an operator, expression in brackets, subquery, or asterisk. It can also contain a synonym. -A list of expressions is one or more expressions separated by commas. -Functions and operators, in turn, can have expressions as arguments. - - -==Queries== - - -===CREATE DATABASE=== - -%%CREATE DATABASE [IF NOT EXISTS] db_name%% - -- Creates the 'db_name' database. A database is just a directory for tables. -If "IF NOT EXISTS" is included, the query won't return an error if the database already exists. - -===CREATE TABLE=== - -The CREATE TABLE query can have several forms. - -%%CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db.]name -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... -) ENGINE = engine%% - -Creates a table named 'name' in the 'db' database or the current database if 'db' is not set, with the structure specified in brackets and the 'engine' engine. The structure of the table is a list of column descriptions. 
If indexes are supported by the engine, they are indicated as parameters for the table engine. - -A column description is %%name type%% in the simplest case. For example: %%RegionID UInt32%%. -Expressions can also be defined for default values (see below). - -%%CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db.]name AS [db2.]name2 [ENGINE = engine]%% - -Creates a table with the same structure as another table. You can specify a different engine for the table. If the engine is not specified, the same engine will be used as for the 'db2.name2' table. - -%%CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db.]name ENGINE = engine AS SELECT ...%% - -Creates a table with a structure like the result of the SELECT query, with the 'engine' engine, and fills it with data from SELECT. - -In all cases, if IF NOT EXISTS is specified, the query won't return an error if the table already exists. In this case, the query won't do anything. - -
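A sketch of the first form, using the old-style MergeTree engine parameters (date column, primary key tuple, index granularity); the database, table, and column names are hypothetical:

%%
CREATE TABLE IF NOT EXISTS test.hits_example
(
    EventDate Date,
    CounterID UInt32,
    UserID UInt64,
    URL String
) ENGINE = MergeTree(EventDate, (CounterID, EventDate), 8192)
%%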

    Default values

    - -The column description can specify an expression for a default value, in one of the following ways: -%%DEFAULT expr%%, %%MATERIALIZED expr%%, %%ALIAS expr%%. -Example: %%URLDomain String DEFAULT domain(URL)%%. - -If an expression for the default value is not defined, the default values will be set to zeros for numbers, empty strings for strings, empty arrays for arrays, and 0000-00-00 for dates or 0000-00-00 00:00:00 for dates with time. NULLs are not supported. - -If the default expression is defined, the column type is optional. If there isn't an explicitly defined type, the default expression type is used. Example: %%EventDate DEFAULT toDate(EventTime)%% - the 'Date' type will be used for the 'EventDate' column. - -If the data type and default expression are defined explicitly, this expression will be cast to the specified type using type casting functions. Example: %%Hits UInt32 DEFAULT 0%% means the same thing as %%Hits UInt32 DEFAULT toUInt32(0)%%. - -Default expressions may be defined as an arbitrary expression from table constants and columns. When creating and changing the table structure, it checks that expressions don't contain loops. For INSERT, it checks that expressions are resolvable - that all columns they can be calculated from have been passed. - -%%DEFAULT expr%% - -Normal default value. If the INSERT query doesn't specify the corresponding column, it will be filled in by computing the corresponding expression. - -%%MATERIALIZED expr%% - -Materialized expression. Such a column can't be specified for INSERT, because it is always calculated. -For an INSERT without a list of columns, these columns are not considered. -In addition, this column is not substituted when using an asterisk in a SELECT query. This is to preserve the invariant that the dump obtained using SELECT * can be inserted back into the table using INSERT without specifying the list of columns. - -%%ALIAS expr%% - -Synonym. Such a column isn't stored in the table at all. -Its values can't be inserted in a table, and it is not substituted when using an asterisk in a SELECT query. -It can be used in SELECTs if the alias is expanded during query parsing. - -When using the ALTER query to add new columns, old data for these columns is not written. Instead, when reading old data that does not have values for the new columns, expressions are computed on the fly by default. However, if running the expressions requires different columns that are not indicated in the query, these columns will additionally be read, but only for the blocks of data that need it. - -If you add a new column to a table but later change its default expression, the values used for old data will change (for data where values were not stored on the disk). Note that when running background merges, data for columns that are missing in one of the merging parts is written to the merged part. - -It is not possible to set default values for elements in nested data structures. - - -
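A sketch that combines the three kinds of default expressions in one hypothetical table definition:

%%
CREATE TABLE test.events_example
(
    EventTime DateTime,
    EventDate Date DEFAULT toDate(EventTime),      -- filled in when INSERT omits the column
    URL String,
    URLDomain String MATERIALIZED domain(URL),     -- always calculated; not returned by SELECT *
    Hits UInt32 DEFAULT 0,
    HitsDoubled UInt64 ALIAS Hits * 2              -- not stored; expanded during query parsing
) ENGINE = Memory
%%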

    Temporary tables

    - -In all cases, if TEMPORARY is specified, a temporary table will be created. Temporary tables have the following characteristics: -- Temporary tables disappear when the session ends, including if the connection is lost. -- A temporary table is created with the Memory engine. The other table engines are not supported. -- The DB can't be specified for a temporary table. It is created outside of databases. -- If a temporary table has the same name as another one and a query specifies the table name without specifying the DB, the temporary table will be used. -- For distributed query processing, temporary tables used in a query are passed to remote servers. - -In most cases, temporary tables are not created manually, but when using external data for a query, or for distributed (GLOBAL) IN. For more information, see the appropriate sections. - -===CREATE VIEW=== - -%%CREATE [MATERIALIZED] VIEW [IF NOT EXISTS] [db.]name [ENGINE = engine] [POPULATE] AS SELECT ...%% - -Creates a view. There are two types of views: normal and MATERIALIZED. - -Normal views don't store any data, but just perform a read from another table. In other words, a normal view is nothing more than a saved query. When reading from a view, this saved query is used as a subquery in the FROM clause. - -As an example, assume you've created a view: -%%CREATE VIEW view AS SELECT ...%% -and written a query: -%%SELECT a, b, c FROM view%% -This query is fully equivalent to using the subquery: -%%SELECT a, b, c FROM (SELECT ...)%% - - -Materialized views store data transformed by the corresponding SELECT query. - -When creating a materialized view, you can specify ENGINE - the table engine for storing data. By default, it uses the same engine as for the table that the SELECT query is made from. - -A materialized view is arranged as follows: when inserting data to the table specified in SELECT, part of the inserted data is converted by this SELECT query, and the result is inserted in the view. - -If you specify POPULATE, the existing table data is inserted in the view when creating it, as if making a CREATE TABLE ... AS SELECT ... query. Otherwise, the query contains only the data inserted in the table after creating the view. We don't recommend using POPULATE, since data inserted in the table during the view creation will not be inserted in it. - -The SELECT query can contain DISTINCT, GROUP BY, ORDER BY, LIMIT ... Note that the corresponding conversions are performed independently on each block of inserted data. For example, if GROUP BY is set, data is aggregated during insertion, but only within a single packet of inserted data. The data won't be further aggregated. The exception is when using an ENGINE that independently performs data aggregation, such as SummingMergeTree. - -The execution of ALTER queries on materialized views has not been fully developed, so they might be inconvenient. - -Views look the same as normal tables. For example, they are listed in the result of the SHOW TABLES query. - -There isn't a separate query for deleting views. To delete a view, use DROP TABLE. - -===ATTACH=== - -The query is exactly the same as CREATE, except -- The word ATTACH is used instead of CREATE. -- The query doesn't create data on the disk, but assumes that data is already in the appropriate places, and just adds information about the table to the server. -After executing an ATTACH query, the server will know about the existence of the table. - -This query is used when starting the server. 
The server stores table metadata as files with ATTACH queries, which it simply runs at launch (with the exception of system tables, which are explicitly created on the server). - - -===DROP=== - -This query has two types: DROP DATABASE and DROP TABLE. - -%%DROP DATABASE [IF EXISTS] db%% - -Deletes all tables inside the 'db' database, then deletes the 'db' database itself. -If IF EXISTS is specified, it doesn't return an error if the database doesn't exist. - -%%DROP TABLE [IF EXISTS] [db.]name%% - -Deletes the table. -If IF EXISTS is specified, it doesn't return an error if the table doesn't exist or the database doesn't exist. - - -===DETACH=== - -%%DETACH TABLE [IF EXISTS] [db.]name%% - -Deletes information about the table from the server. The server stops knowing about the table's existence. This does not delete the table's data or metadata. On the next server launch, the server will read the metadata and find out about the table again. Similarly, a "detached" table can be re-attached using the ATTACH query (with the exception of system tables, which do not have metadata stored for them). - -There is no DETACH DATABASE query. - - -===RENAME=== - -%%RENAME TABLE [db11.]name11 TO [db12.]name12, [db21.]name21 TO [db22.]name22, ...%% - -Renames one or more tables. All tables are renamed under global locking. Renaming tables is a light operation. If you indicated another database after TO, the table will be moved to this database. However, the directories with databases must reside in the same file system (otherwise, an error is returned). - - -===ALTER=== - -The ALTER query is only supported for *MergeTree type tables, as well as for Merge and Distributed types. The query has several variations. - -

    Column manipulations

    - -%%ALTER TABLE [db].name ADD|DROP|MODIFY COLUMN ...%% - -Lets you change the table structure. In the query, specify a list of one or more comma-separated actions. Each action is an operation on a column. - -The following actions are supported: - -%%ADD COLUMN name [type] [default_expr] [AFTER name_after]%% - -Adds a new column to the table with the specified name, type, and default expression (see the section "Default expressions"). If you specify 'AFTER name_after' (the name of another column), the column is added after the specified one in the list of table columns. Otherwise, the column is added to the end of the table. Note that there is no way to add a column to the beginning of a table. For a chain of actions, 'name_after' can be the name of a column that is added in one of the previous actions. - -Adding a column just changes the table structure, without performing any actions with data. The data doesn't appear on the disk after ALTER. If the data is missing for a column when reading from the table, it is filled in with default values (by performing the default expression if there is one, or using zeros or empty strings). The column appears on the disk after merging data parts (see MergeTree). - -This approach allows us to complete the ALTER query instantly, without increasing the volume of old data. - -%%DROP COLUMN name%% - -Deletes the column with the name 'name'. - -Deletes data from the file system. Since this deletes entire files, the query is completed almost instantly. - -%%MODIFY COLUMN name [type] [default_expr]%% - -Changes the 'name' column's type to 'type' and/or the default expression to 'default_expr'. When changing the type, values are converted as if the 'toType' function were applied to them. - -If only the default expression is changed, the query doesn't do anything complex, and is completed almost instantly. - -Changing the column type is the only complex action - it changes the contents of files with data. For large tables, this may take a long time. - -There are several stages of execution: -- Preparing temporary (new) files with modified data. -- Renaming old files. -- Renaming the temporary (new) files to the old names. -- Deleting the old files. - -Only the first stage takes time. If there is a failure at this stage, the data is not changed. -If there is a failure during one of the successive stages, data can be restored manually. The exception is if the old files were deleted from the file system but the data for the new files did not get written to the disk and was lost. - -There is no support for changing the column type in arrays and nested data structures. - -The ALTER query lets you create and delete separate elements (columns) in nested data structures, but not whole nested data structures. To add a nested data structure, you can add columns with a name like 'name.nested_name' and the type 'Array(T)'. A nested data structure is equivalent to multiple array columns with a name that has the same prefix before the dot. - -There is no support for deleting of columns in the primary key or the sampling key (columns that are in the ENGINE expression). Changing the type of columns in the primary key is allowed only if such change doesn't entail changing the actual data (e.g. adding the value to an Enum or changing the type from DateTime to UInt32 is allowed). 
- -If the ALTER query is not sufficient for making the table changes you need, you can create a new table, copy the data to it using the INSERT SELECT query, then switch the tables using the RENAME query and delete the old table. - -The ALTER query blocks all reads and writes for the table. In other words, if a long SELECT is running at the time of the ALTER query, the ALTER query will wait for the SELECT to complete. At the same time, all new queries to the same table will wait while this ALTER is running. - -For tables that don't store data themselves (Merge and Distributed), ALTER just changes the table structure, and does not change the structure of subordinate tables. For example, when running ALTER for a Distributed table, you will also need to run ALTER for the tables on all remote servers. - -The ALTER query for changing columns is replicated. The instructions are saved in ZooKeeper, then each replica applies them. All ALTER queries are run in the same order. The query waits for the appropriate actions to be completed on the other replicas. However, a query to change columns in a replicated table can be interrupted, and all actions will be performed asynchronously. - - -
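A sketch of the column manipulations described above, on a hypothetical table:

%%
ALTER TABLE test.hits_example
    ADD COLUMN Referer String AFTER URL,     -- data appears on disk only after parts are merged
    MODIFY COLUMN URL String DEFAULT ''      -- only the default expression changes: near-instant

ALTER TABLE test.hits_example DROP COLUMN Referer
%%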

    Manipulations with partitions and parts

    - -Only works for tables in the MergeTree family. The following operations are available: - -%%DETACH PARTITION%% - Move a partition to the 'detached' directory and forget it. -%%DROP PARTITION%% - Delete a partition. -%%ATTACH PART|PARTITION%% - Add a new part or partition from the 'detached' directory to the table. -%%FREEZE PARTITION%% - Create a backup of a partition. -%%FETCH PARTITION%% - Download a partition from another server. - -Each type of query is covered separately below. - -A partition in a table is data for a single calendar month. This is determined by the values of the date key specified in the table engine parameters. Each month's data is stored separately in order to simplify manipulations with this data. - -A "part" in the table is part of the data from a single partition, sorted by the primary key. - -You can use the system.parts table to view the set of table parts and partitions: - -%%SELECT * FROM system.parts WHERE active%% - -active - Only count active parts. Inactive parts are, for example, source parts remaining after merging to a larger part - these parts are deleted approximately 10 minutes after merging. - -Another way to view a set of parts and partitions is to go into the directory with table data. -The directory with data is -/var/lib/clickhouse/data/database/table/, -where /var/lib/clickhouse/ is the path to ClickHouse data, 'database' is the database name, and 'table' is the table name. Example: - -%% -$ ls -l /var/lib/clickhouse/data/test/visits/ -total 48 -drwxrwxrwx 2 clickhouse clickhouse 20480 may 13 02:58 20140317_20140323_2_2_0 -drwxrwxrwx 2 clickhouse clickhouse 20480 may 13 02:58 20140317_20140323_4_4_0 -drwxrwxrwx 2 clickhouse clickhouse 4096 may 13 02:55 detached --rw-rw-rw- 1 clickhouse clickhouse 2 may 13 02:58 increment.txt -%% - -Here 20140317_20140323_2_2_0 and 20140317_20140323_4_4_0 are directories of parts. - -Let's look at the name of the first part: 20140317_20140323_2_2_0. -20140317 - minimum date of part data -20140323 - maximum date of part data -2 - minimum number of the data block -2 - maximum number of the data block -0 - part level - depth of the merge tree that formed it - -Each part corresponds to a single partition and contains data for a single month. -201403 - The partition name. A partition is a set of parts for a single month. - -On an operating server, you can't manually change the set of parts or their data on the file system, since the server won't know about it. For non-replicated tables, you can do this when the server is stopped, but we don't recommended it. For replicated tables, the set of parts can't be changed in any case. - -The 'detached' directory contains parts that are not used by the server - detached from the table using the ALTER ... DETACH query. Parts that are damaged are also moved to this directory, instead of deleting them. You can add, delete, or modify the data in the 'detached' directory at any time - the server won't know about this until you make the ALTER TABLE ... ATTACH query. - - -%%ALTER TABLE [db.]table DETACH PARTITION 'name'%% - -Move all data for partitions named 'name' to the 'detached' directory and forget about them. -The partition name is specified in YYYYMM format. It can be indicated in single quotes or without them. - -After the query is executed, you can do whatever you want with the data in the 'detached' directory — delete it from the file system, or just leave it. 
- -The query is replicated - data will be moved to the 'detached' directory and forgotten on all replicas. The query can only be sent to a leader replica. To find out if a replica is a leader, perform SELECT to the 'system.replicas' system table. Alternatively, it is easier to make a query on all replicas, and all except one will throw an exception. - - -%%ALTER TABLE [db.]table DROP PARTITION 'name'%% - -Similar to the DETACH operation. Deletes data from the table. Data parts will be tagged as inactive and will be completely deleted in approximately 10 minutes. The query is replicated - data will be deleted on all replicas. - - -%%ALTER TABLE [db.]table ATTACH PARTITION|PART 'name'%% - -Adds data to the table from the 'detached' directory. - -It is possible to add data for an entire partition or a separate part. For a part, specify the full name of the part in single quotes. - -The query is replicated. Each replica checks whether there is data in the 'detached' directory. If there is data, it checks the integrity, verifies that it matches the data on the server that initiated the query, and then adds it if everything is correct. If not, it downloads data from the query requestor replica, or from another replica where the data has already been added. - -So you can put data in the 'detached' directory on one replica, and use the ALTER ... ATTACH query to add it to the table on all replicas. - - -%%ALTER TABLE [db.]table FREEZE PARTITION 'name'%% - -Creates a local backup of one or multiple partitions. The name can be the full name of the partition (for example, 201403), or its prefix (for example, 2014) - then the backup will be created for all the corresponding partitions. - -The query does the following: for a data snapshot at the time of execution, it creates hardlinks to table data in the directory /var/lib/clickhouse/shadow/N/... -/var/lib/clickhouse/ is the working ClickHouse directory from the config. -N is the incremental number of the backup. -The same structure of directories is created inside the backup as inside /var/lib/clickhouse/. -It also performs 'chmod' for all files, forbidding writes to them. - -The backup is created almost instantly (but first it waits for current queries to the corresponding table to finish running). At first, the backup doesn't take any space on the disk. As the system works, the backup can take disk space, as data is modified. If the backup is made for old enough data, it won't take space on the disk. - -After creating the backup, data from /var/lib/clickhouse/shadow/ can be copied to the remote server and then deleted on the local server. The entire backup process is performed without stopping the server. - -The ALTER ... FREEZE PARTITION query is not replicated. A local backup is only created on the local server. - -As an alternative, you can manually copy data from the /var/lib/clickhouse/data/database/table directory. But if you do this while the server is running, race conditions are possible when copying directories with files being added or changed, and the backup may be inconsistent. You can do this if the server isn't running - then the resulting data will be the same as after the ALTER TABLE t FREEZE PARTITION query. - -ALTER TABLE ... FREEZE PARTITION only copies data, not table metadata. To make a backup of table metadata, copy the file /var/lib/clickhouse/metadata/database/table.sql - -To restore from a backup: -- Use the CREATE query to create the table if it doesn't exist. 
The query can be taken from an .sql file (replace ATTACH in it with CREATE). -- Copy data from the data/database/table/ directory inside the backup to the /var/lib/clickhouse/data/database/table/detached/ directory. -- Run ALTER TABLE ... ATTACH PARTITION YYYYMM queries where YYYYMM is the month, for every month. - -In this way, data from the backup will be added to the table. -Restoring from a backup doesn't require stopping the server. - -Backups and replication - -Replication provides protection from device failures. If all data disappeared on one of your replicas, follow the instructions in the "Restoration after failure" section to restore it. - -For protection from device failures, you must use replication. For more information about replication, see the section "Data replication". - -Backups protect against human error (accidentally deleting data, deleting the wrong data or in the wrong cluster, or corrupting data). For high-volume databases, it can be difficult to copy backups to remote servers. In such cases, to protect from human error, you can keep a backup on the same server (it will reside in /var/lib/clickhouse/shadow/). - - -%%ALTER TABLE [db.]table FETCH PARTITION 'name' FROM 'path-in-zookeeper'%% - -This query only works for replicatable tables. - -It downloads the specified partition from the shard that has its ZooKeeper path specified in the FROM clause, then puts it in the 'detached' directory for the specified table. - -Although the query is called ALTER TABLE, it does not change the table structure, and does not immediately change the data available in the table. - -Data is placed in the 'detached' directory. You can use the ALTER TABLE ... ATTACH query to attach the data. - -The path to ZooKeeper is specified in the FROM clause. For example, %%/clickhouse/tables/01-01/visits%%. -Before downloading, the system checks that the partition exists and the table structure matches. The most appropriate replica is selected automatically from the healthy replicas. - -The ALTER ... FETCH PARTITION query is not replicated. The partition will be downloaded to the 'detached' directory only on the local server. Note that if after this you use the ALTER TABLE ... ATTACH query to add data to the table, the data will be added on all replicas (on one of the replicas it will be added from the 'detached' directory, and on the rest it will be loaded from neighboring replicas). - - -
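-
-As a sketch of the typical FETCH workflow (the table name 'visits' is illustrative; the ZooKeeper path is the example path given above):
-
-%%ALTER TABLE visits FETCH PARTITION '201403' FROM '/clickhouse/tables/01-01/visits'
-ALTER TABLE visits ATTACH PARTITION '201403'%%
-
-The first query only downloads the partition into the 'detached' directory on the local server; the second query attaches it, and for replicated tables the attached data is then propagated to the other replicas.
-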

    Synchronicity of ALTER queries

    - -For non-replicatable tables, all ALTER queries are performed synchronously. For replicatable tables, the query just adds instructions for the appropriate actions to ZooKeeper, and the actions themselves are performed as soon as possible. However, the query can wait for these actions to be completed on all the replicas. - -For ALTER ... ATTACH|DETACH|DROP queries, you can use the 'replication_alter_partitions_sync' setting to set up waiting. -Possible values: 0 - do not wait, 1 - wait for own completion (default), 2 - wait for all. - - - -===SHOW DATABASES=== - -%%SHOW DATABASES [INTO OUTFILE filename] [FORMAT format]%% - -Prints a list of all databases. -This query is identical to the query SELECT name FROM system.databases [INTO OUTFILE filename] [FORMAT format] -See the section "Formats". - - -===SHOW TABLES=== - -%%SHOW TABLES [FROM db] [LIKE 'pattern'] [INTO OUTFILE filename] [FORMAT format]%% - -Outputs a list of -- tables from the current database, or from the 'db' database if "FROM db" is specified. -- all tables, or tables whose name matches the pattern, if "LIKE 'pattern'" is specified. - -The query is identical to the query SELECT name FROM system.tables -WHERE database = 'db' [AND name LIKE 'pattern'] [INTO OUTFILE filename] [FORMAT format] -See the section "LIKE operator". - - -===SHOW PROCESSLIST=== - -%%SHOW PROCESSLIST [INTO OUTFILE filename] [FORMAT format]%% - -Outputs a list of queries currently being processed, other than SHOW PROCESSLIST queries. - -Prints a table containing the columns: - -user is the user who made the query. Keep in mind that for distributed processing, queries are sent to remote servers under the 'default' user. SHOW PROCESSLIST shows the username for a specific query, not for a query that this query initiated. - -address is the name of the host that the query was sent from. For distributed processing, on remote servers, this is the name of the query requestor host. To track where a distributed query was originally made from, look at SHOW PROCESSLIST on the query requestor server. - -elapsed - The execution time, in seconds. Queries are output in order of decreasing execution time. - -rows_read, bytes_read - How many rows and bytes of uncompressed data were read when processing the query. For distributed processing, data is totaled from all the remote servers. This is the data used for restrictions and quotas. - -memory_usage - Current RAM usage in bytes. See the setting 'max_memory_usage'. - -query - The query itself. In INSERT queries, the data for insertion is not output. - -query_id - The query identifier. Non-empty only if it was explicitly defined by the user. For distributed processing, the query ID is not passed to remote servers. - -This query is exactly the same as: SELECT * FROM system.processes [INTO OUTFILE filename] [FORMAT format]. - -Tip (execute in the console): -%%watch -n1 "clickhouse-client --query='SHOW PROCESSLIST'"%% - - -===SHOW CREATE TABLE=== - -%%SHOW CREATE TABLE [db.]table [INTO OUTFILE filename] [FORMAT format]%% - -Returns a single String-type 'statement' column, which contains a single value - the CREATE query used for creating the specified table. - - -===DESCRIBE TABLE=== - -%%DESC|DESCRIBE TABLE [db.]table [INTO OUTFILE filename] [FORMAT format]%% - -Returns two String-type columns: 'name' and 'type', which indicate the names and types of columns in the specified table. - -Nested data structures are output in "expanded" format. Each column is shown separately, with the name after a dot. 
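-
-For example, a minimal sketch showing how a nested structure is expanded in the output (the table here is hypothetical):
-
-%%CREATE TABLE test.events (EventDate Date, Goals Nested(ID UInt32, Price Int64)) ENGINE = Memory
-DESC TABLE test.events%%
-
-The nested structure is listed as the separate columns 'Goals.ID' of type Array(UInt32) and 'Goals.Price' of type Array(Int64).
-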
- - -===EXISTS=== - -%%EXISTS TABLE [db.]name [INTO OUTFILE filename] [FORMAT format]%% - -Returns a single UInt8-type column, which contains the single value 0 if the table or database doesn't exist, or 1 if the table exists in the specified database. - - -===USE=== - -%%USE db%% - -Lets you set the current database for the session. -The current database is used for searching for tables if the database is not explicitly defined in the query with a dot before the table name. -This query can't be made when using the HTTP protocol, since there is no concept of a session. - - -===SET=== - -%%SET param = value%% - -Lets you set the 'param' setting to 'value'. You can also make all the settings from the specified settings profile in a single query. To do this, specify 'profile' as the setting name. For more information, see the section "Settings". The setting is made for the session, or for the server (globally) if GLOBAL is specified. -When making a global setting, the setting is not applied to sessions already running, including the current session. It will only be used for new sessions. - -When the server is restarted, global settings made using SET are lost. -To make settings that persist after a server restart, you can only use the server's config file. - - -===OPTIMIZE=== - -%%OPTIMIZE TABLE [db.]name [PARTITION partition] [FINAL]%% - -Asks the table engine to do something for optimization. -Supported only by *MergeTree engines, in which this query initializes a non-scheduled merge of data parts. -If PARTITION is specified, then only specified partition will be optimized. -If FINAL is specified, then optimization will be performed even if data inside the partition already optimized (i. e. all data is in single part). - - -===INSERT=== - -This query has several variations. - -%%INSERT INTO [db.]table [(c1, c2, c3)] VALUES (v11, v12, v13), (v21, v22, v23), ...%% - -Inserts rows with the listed values in the 'table' table. This query is exactly the same as: - -%%INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ...%% - -%%INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format ...%% - -Inserts data in any specified format. -The data itself comes after 'format', after all space symbols up to the first line break if there is one and including it, or after all space symbols if there isn't a line break. We recommend writing data starting from the next line (this is important if the data starts with space characters). - -Example: - -%%INSERT INTO t FORMAT TabSeparated -11 Hello, world! -22 Qwerty -%% - -For more information about data formats, see the section "Formats". The "Interfaces" section describes how to insert data separately from the query when using the command-line client or the HTTP interface. - -The query may optionally specify a list of columns for insertion. In this case, the default values are written to the other columns. -Default values are calculated from DEFAULT expressions specified in table definitions, or, if the DEFAULT is not explicitly defined, zeros and empty strings are used. If the 'strict_insert_default' setting is set to 1, all the columns that do not have explicit DEFAULTS must be specified in the query. - -%%INSERT INTO [db.]table [(c1, c2, c3)] SELECT ...%% - -Inserts the result of the SELECT query into a table. -The names and data types of the SELECT result must exactly match the table structure that data is inserted into, or the specified list of columns. -To change column names, use synonyms (AS) in the SELECT query. 
-To change data types, use type conversion functions (see the section "Functions"). - -None of the data formats allows using expressions as values. -In other words, you can't write INSERT INTO t VALUES (now(), 1 + 1, DEFAULT). - -There is no support for other data part modification queries: -UPDATE, DELETE, REPLACE, MERGE, UPSERT, INSERT UPDATE. -However, you can delete old data using ALTER TABLE ... DROP PARTITION. - - -===SELECT=== - -His Highness, the SELECT query. - -%%SELECT [DISTINCT] expr_list - [FROM [db.]table | (subquery) | table_function] [FINAL] - [SAMPLE sample_coeff] - [ARRAY JOIN ...] - [GLOBAL] ANY|ALL INNER|LEFT JOIN (subquery)|table USING columns_list - [PREWHERE expr] - [WHERE expr] - [GROUP BY expr_list] [WITH TOTALS] - [HAVING expr] - [ORDER BY expr_list] - [LIMIT [n, ]m] - [UNION ALL ...] - [INTO OUTFILE filename] - [FORMAT format]%% - -All the clauses are optional, except for the required list of expressions immediately after SELECT. -The clauses below are described in almost the same order as in the query execution conveyor. - -If the query omits the DISTINCT, GROUP BY, and ORDER BY clauses and the IN and JOIN subqueries, the query will be completely stream processed, using O(1) amount of RAM. -Otherwise, the query may consume too much RAM, if appropriate restrictions are not defined (max_memory_usage, max_rows_to_group_by, max_rows_to_sort, max_rows_in_distinct, max_bytes_in_distinct, max_rows_in_set, max_bytes_in_set, max_rows_in_join, max_bytes_in_join, max_bytes_before_external_sort, max_bytes_before_external_group_by). For more information, see the section "Settings". It is possible to use external sorting (saving temporary tables to a disk) and external aggregation. Merge join is not implemented. - -

    FROM clause

    - -If the FROM clause is omitted, data will be read from the 'system.one' table. -The 'system.one' table contains exactly one row (this table fulfills the same purpose as the DUAL table found in other DBMSs). - -The FROM clause specifies the table to read data from, or a subquery, or a table function; ARRAY JOIN and the regular JOIN may also be included (see below). - -Instead of a table, the SELECT subquery may be specified in brackets. In this case, the subquery processing pipeline will be built into the processing pipeline of an external query. -In contrast to standard SQL, a synonym does not need to be specified after a subquery. For compatibility, it is possible to write 'AS name' after a subquery, but the specified name isn't used anywhere. - -A table function may be specified instead of a table. For more information, see the section "Table functions". - -To execute a query, all the columns listed in the query are extracted from the appropriate table. Any columns not needed for the external query are thrown out of the subqueries. -If a query does not list any columns (for example, SELECT count() FROM t), some column is extracted from the table anyway (the smallest one is preferred), in order to calculate the number of rows. - -The FINAL modifier can be used only for a SELECT from a CollapsingMergeTree table. When you specify FINAL, data is selected fully "collapsed". Keep in mind that using FINAL leads to a selection that includes columns related to the primary key, in addition to the columns specified in the SELECT. Additionally, the query will be executed in a single stream, and data will be merged during query execution. This means that when using FINAL, the query is processed more slowly. In most cases, you should avoid using FINAL. For more information, see the section "CollapsingMergeTree engine". - -
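-
-A minimal sketch of using FINAL (assuming 'test.visits_collapsing' is a CollapsingMergeTree table; the name is illustrative):
-
-%%SELECT * FROM test.visits_collapsing FINAL WHERE CounterID = 34%%
-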

    SAMPLE clause

- -The SAMPLE clause allows for approximate query processing. -Approximate query processing is only supported by MergeTree* type tables, and only if the sampling expression was specified during table creation (see the section "MergeTree engine"). - -SAMPLE has the format %%SAMPLE k%%, where 'k' is a decimal number from 0 to 1, or %%SAMPLE n%%, where 'n' is a sufficiently large integer. - -In the first case, the query will be executed on a 'k' fraction of the data. For example, %%SAMPLE 0.1%% runs the query on 10% of the data. -In the second case, the query will be executed on a sample of no more than 'n' rows. For example, %%SAMPLE 10000000%% runs the query on a maximum of 10,000,000 rows. - -Example: - -%%SELECT - Title, - count() * 10 AS PageViews -FROM hits_distributed -SAMPLE 0.1 -WHERE - CounterID = 34 - AND toDate(EventDate) >= toDate('2013-01-29') - AND toDate(EventDate) <= toDate('2013-02-04') - AND NOT DontCountHits - AND NOT Refresh - AND Title != '' -GROUP BY Title -ORDER BY PageViews DESC LIMIT 1000%% - -In this example, the query is executed on a sample of 0.1 (10%) of the data. Values of aggregate functions are not corrected automatically, so to get an approximate result, the value 'count()' is manually multiplied by 10. - -When using something like %%SAMPLE 10000000%%, there isn't any information about which relative share of the data was processed or what the aggregate functions should be multiplied by, so this way of writing SAMPLE is not always appropriate. - -A sample with a relative coefficient is "consistent": if we look at all possible data that could be in the table, a sample (when using a single sampling expression specified during table creation) with the same coefficient always selects the same subset of possible data. In other words, a sample from different tables on different servers at different times is made the same way. - -For example, a sample of user IDs takes rows with the same subset of all the possible user IDs from different tables. This allows using the sample in subqueries in the IN clause, as well as for manually correlating results of different queries with samples. - 
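-
-For example, a sketch of using a consistent sample inside an IN subquery (assuming the table's sampling expression is based on UserID, as in the user-ID example above; the condition values are illustrative):
-
-%%SELECT count() FROM hits_distributed SAMPLE 0.1 WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM hits_distributed SAMPLE 0.1 WHERE CounterID = 34)%%
-
-Because a sample with the same coefficient selects the same subset of users, the outer and inner samples match each other.
-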

    ARRAY JOIN clause

    - -Allows executing JOIN with an array or nested data structure. The intent is similar to the 'arrayJoin' function, but its functionality is broader. - -ARRAY JOIN is essentially INNER JOIN with an array. Example: - -%% -:) CREATE TABLE arrays_test (s String, arr Array(UInt8)) ENGINE = Memory - -CREATE TABLE arrays_test -( - s String, - arr Array(UInt8) -) ENGINE = Memory - -Ok. - -0 rows in set. Elapsed: 0.001 sec. - -:) INSERT INTO arrays_test VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []) - -INSERT INTO arrays_test VALUES - -Ok. - -3 rows in set. Elapsed: 0.001 sec. - -:) SELECT * FROM arrays_test - -SELECT * -FROM arrays_test - -┌─s───────┬─arr─────┐ -│ Hello │ [1,2] │ -│ World │ [3,4,5] │ -│ Goodbye │ [] │ -└─────────┴─────────┘ - -3 rows in set. Elapsed: 0.001 sec. - -:) SELECT s, arr FROM arrays_test ARRAY JOIN arr - -SELECT s, arr -FROM arrays_test -ARRAY JOIN arr - -┌─s─────┬─arr─┐ -│ Hello │ 1 │ -│ Hello │ 2 │ -│ World │ 3 │ -│ World │ 4 │ -│ World │ 5 │ -└───────┴─────┘ - -5 rows in set. Elapsed: 0.001 sec. -%% - -An alias can be specified for an array in the ARRAY JOIN clause. In this case, an array item can be accessed by this alias, but the array itself by the original name. Example: - -%% -:) SELECT s, arr, a FROM arrays_test ARRAY JOIN arr AS a - -SELECT s, arr, a -FROM arrays_test -ARRAY JOIN arr AS a - -┌─s─────┬─arr─────┬─a─┐ -│ Hello │ [1,2] │ 1 │ -│ Hello │ [1,2] │ 2 │ -│ World │ [3,4,5] │ 3 │ -│ World │ [3,4,5] │ 4 │ -│ World │ [3,4,5] │ 5 │ -└───────┴─────────┴───┘ - -5 rows in set. Elapsed: 0.001 sec. -%% - -Multiple arrays of the same size can be comma-separated in the ARRAY JOIN clause. In this case, JOIN is performed with them simultaneously (the direct sum, not the direct product). -Example: - -%% -:) SELECT s, arr, a, num, mapped FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS mapped - -SELECT s, arr, a, num, mapped -FROM arrays_test -ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(lambda(tuple(x), plus(x, 1)), arr) AS mapped - -┌─s─────┬─arr─────┬─a─┬─num─┬─mapped─┐ -│ Hello │ [1,2] │ 1 │ 1 │ 2 │ -│ Hello │ [1,2] │ 2 │ 2 │ 3 │ -│ World │ [3,4,5] │ 3 │ 1 │ 4 │ -│ World │ [3,4,5] │ 4 │ 2 │ 5 │ -│ World │ [3,4,5] │ 5 │ 3 │ 6 │ -└───────┴─────────┴───┴─────┴────────┘ - -5 rows in set. Elapsed: 0.002 sec. - -:) SELECT s, arr, a, num, arrayEnumerate(arr) FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num - -SELECT s, arr, a, num, arrayEnumerate(arr) -FROM arrays_test -ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num - -┌─s─────┬─arr─────┬─a─┬─num─┬─arrayEnumerate(arr)─┐ -│ Hello │ [1,2] │ 1 │ 1 │ [1,2] │ -│ Hello │ [1,2] │ 2 │ 2 │ [1,2] │ -│ World │ [3,4,5] │ 3 │ 1 │ [1,2,3] │ -│ World │ [3,4,5] │ 4 │ 2 │ [1,2,3] │ -│ World │ [3,4,5] │ 5 │ 3 │ [1,2,3] │ -└───────┴─────────┴───┴─────┴─────────────────────┘ - -5 rows in set. Elapsed: 0.002 sec. -%% - -ARRAY JOIN also works with nested data structures. Example: - -%% -:) CREATE TABLE nested_test (s String, nest Nested(x UInt8, y UInt32)) ENGINE = Memory - -CREATE TABLE nested_test -( - s String, - nest Nested( - x UInt8, - y UInt32) -) ENGINE = Memory - -Ok. - -0 rows in set. Elapsed: 0.006 sec. - -:) INSERT INTO nested_test VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [], []) - -INSERT INTO nested_test VALUES - -Ok. - -3 rows in set. Elapsed: 0.001 sec. 
- -:) SELECT * FROM nested_test - -SELECT * -FROM nested_test - -┌─s───────┬─nest.x──┬─nest.y─────┐ -│ Hello │ [1,2] │ [10,20] │ -│ World │ [3,4,5] │ [30,40,50] │ -│ Goodbye │ [] │ [] │ -└─────────┴─────────┴────────────┘ - -3 rows in set. Elapsed: 0.001 sec. - -:) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest - -SELECT s, `nest.x`, `nest.y` -FROM nested_test -ARRAY JOIN nest - -┌─s─────┬─nest.x─┬─nest.y─┐ -│ Hello │ 1 │ 10 │ -│ Hello │ 2 │ 20 │ -│ World │ 3 │ 30 │ -│ World │ 4 │ 40 │ -│ World │ 5 │ 50 │ -└───────┴────────┴────────┘ - -5 rows in set. Elapsed: 0.001 sec. -%% - -When specifying names of nested data structures in ARRAY JOIN, the meaning is the same as ARRAY JOIN with all the array elements that it consists of. Example: - -%% -:) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest.x, nest.y - -SELECT s, `nest.x`, `nest.y` -FROM nested_test -ARRAY JOIN `nest.x`, `nest.y` - -┌─s─────┬─nest.x─┬─nest.y─┐ -│ Hello │ 1 │ 10 │ -│ Hello │ 2 │ 20 │ -│ World │ 3 │ 30 │ -│ World │ 4 │ 40 │ -│ World │ 5 │ 50 │ -└───────┴────────┴────────┘ - -5 rows in set. Elapsed: 0.001 sec. -%% - -This variation also makes sense: - -%% -:) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest.x - -SELECT s, `nest.x`, `nest.y` -FROM nested_test -ARRAY JOIN `nest.x` - -┌─s─────┬─nest.x─┬─nest.y─────┐ -│ Hello │ 1 │ [10,20] │ -│ Hello │ 2 │ [10,20] │ -│ World │ 3 │ [30,40,50] │ -│ World │ 4 │ [30,40,50] │ -│ World │ 5 │ [30,40,50] │ -└───────┴────────┴────────────┘ - -5 rows in set. Elapsed: 0.001 sec. -%% - -An alias may be used for a nested data structure, in order to select either the JOIN result or the source array. Example: - -%% -:) SELECT s, n.x, n.y, nest.x, nest.y FROM nested_test ARRAY JOIN nest AS n - -SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y` -FROM nested_test -ARRAY JOIN nest AS n - -┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┐ -│ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ -│ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ -│ World │ 3 │ 30 │ [3,4,5] │ [30,40,50] │ -│ World │ 4 │ 40 │ [3,4,5] │ [30,40,50] │ -│ World │ 5 │ 50 │ [3,4,5] │ [30,40,50] │ -└───────┴─────┴─────┴─────────┴────────────┘ - -5 rows in set. Elapsed: 0.001 sec. -%% - -Example of using the arrayEnumerate function: - -%% -:) SELECT s, n.x, n.y, nest.x, nest.y, num FROM nested_test ARRAY JOIN nest AS n, arrayEnumerate(nest.x) AS num - -SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`, num -FROM nested_test -ARRAY JOIN nest AS n, arrayEnumerate(`nest.x`) AS num - -┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┬─num─┐ -│ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ 1 │ -│ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ 2 │ -│ World │ 3 │ 30 │ [3,4,5] │ [30,40,50] │ 1 │ -│ World │ 4 │ 40 │ [3,4,5] │ [30,40,50] │ 2 │ -│ World │ 5 │ 50 │ [3,4,5] │ [30,40,50] │ 3 │ -└───────┴─────┴─────┴─────────┴────────────┴─────┘ - -5 rows in set. Elapsed: 0.002 sec. -%% - -The query can only specify a single ARRAY JOIN clause. - -The corresponding conversion can be performed before the WHERE/PREWHERE clause (if its result is needed in this clause), or after completing WHERE/PREWHERE (to reduce the volume of calculations). - -

    JOIN clause

    - -The normal JOIN, which is not related to ARRAY JOIN described above. - -%% -[GLOBAL] ANY|ALL INNER|LEFT [OUTER] JOIN (subquery)|table USING columns_list -%% - -Performs joins with data from the subquery. At the beginning of query execution, the subquery specified after JOIN is run, and its result is saved in memory. Then it is read from the "left" table specified in the FROM clause, and while it is being read, for each of the read rows from the "left" table, rows are selected from the subquery results table (the "right" table) that meet the condition for matching the values of the columns specified in USING. - -The table name can be specified instead of a subquery. This is equivalent to the 'SELECT * FROM table' subquery, except in a special case when the table has the Join engine - an array prepared for joining. - -All columns that are not needed for the JOIN are deleted from the subquery. - -There are several types of JOINs: - -INNER or LEFT - the type: -If INNER is specified, the result will contain only those rows that have a matching row in the right table. -If LEFT is specified, any rows in the left table that don't have matching rows in the right table will be assigned the default value - zeros or empty rows. LEFT OUTER may be written instead of LEFT; the word OUTER does not affect anything. - -ANY or ALL - strictness: -If ANY is specified and there are multiple matching rows in the right table, only the first one will be joined. -If ALL is specified and there are multiple matching rows in the right table, the data will be multiplied by the number of these rows. - -Using ALL corresponds to the normal JOIN semantic from standard SQL. -Using ANY is optimal. If the right table has only one matching row, the results of ANY and ALL are the same. You must specify either ANY or ALL (neither of them is selected by default). - -GLOBAL - distribution: - -When using a normal %%JOIN%%, the query is sent to remote servers. Subqueries are run on each of them in order to make the right table, and the join is performed with this table. In other words, the right table is formed on each server separately. - -When using %%GLOBAL ... JOIN%%, first the requestor server runs a subquery to calculate the right table. This temporary table is passed to each remote server, and queries are run on them using the temporary data that was transmitted. - -Be careful when using GLOBAL JOINs. For more information, see the section "Distributed subqueries" below. - -Any combination of JOINs is possible. For example, %%GLOBAL ANY LEFT OUTER JOIN%%. - -When running JOINs, there is no optimization of the order of execution in relation to other stages of the query. The join (a search in the right table) is run before filtering in WHERE and before aggregation. In order to explicitly set the order of execution, we recommend running a JOIN subquery with a subquery. 
- -Example: -%% -SELECT - CounterID, - hits, - visits -FROM -( - SELECT - CounterID, - count() AS hits - FROM test.hits - GROUP BY CounterID -) ANY LEFT JOIN -( - SELECT - CounterID, - sum(Sign) AS visits - FROM test.visits - GROUP BY CounterID -) USING CounterID -ORDER BY hits DESC -LIMIT 10 - -┌─CounterID─┬───hits─┬─visits─┐ -│ 1143050 │ 523264 │ 13665 │ -│ 731962 │ 475698 │ 102716 │ -│ 722545 │ 337212 │ 108187 │ -│ 722889 │ 252197 │ 10547 │ -│ 2237260 │ 196036 │ 9522 │ -│ 23057320 │ 147211 │ 7689 │ -│ 722818 │ 90109 │ 17847 │ -│ 48221 │ 85379 │ 4652 │ -│ 19762435 │ 77807 │ 7026 │ -│ 722884 │ 77492 │ 11056 │ -└───────────┴────────┴────────┘ -%% - -Subqueries don't allow you to set names or use them for referencing a column from a specific subquery. -The columns specified in USING must have the same names in both subqueries, and the other columns must be named differently. You can use aliases to change the names of columns in subqueries (the example uses the aliases 'hits' and 'visits'). - -The USING clause specifies one or more columns to join, which establishes the equality of these columns. The list of columns is set without brackets. More complex join conditions are not supported. - -The right table (the subquery result) resides in RAM. If there isn't enough memory, you can't run a JOIN. - -Only one JOIN can be specified in a query (on a single level). To run multiple JOINs, you can put them in subqueries. - -Each time a query is run with the same JOIN, the subquery is run again - the result is not cached. To avoid this, use the special 'Join' table engine, which is a prepared array for joining that is always in RAM. For more information, see the section "Table engines, Join". - -In some cases, it is more efficient to use IN instead of JOIN. Among the various types of JOINs, the most efficient is ANY LEFT JOIN, then ANY INNER JOIN. The least efficient are ALL LEFT JOIN and ALL INNER JOIN. - -If you need a JOIN for joining with dimension tables (these are relatively small tables that contain dimension properties, such as names for advertising campaigns), a JOIN might not be very convenient due to the bulky syntax and the fact that the right table is re-accessed for every query. For such cases, there is an "external dictionaries" feature that you should use instead of JOIN. For more information, see the section "External dictionaries". - - -
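-
-As an illustrative sketch of the dictionary approach (this assumes an external dictionary named 'campaigns' with a String attribute 'name' has been configured, and that a 'CampaignID' column exists; none of these names come from the examples above):
-
-%%SELECT dictGetString('campaigns', 'name', toUInt64(CampaignID)) AS campaign, count() AS hits FROM test.hits GROUP BY campaign ORDER BY hits DESC LIMIT 10%%
-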

    WHERE clause

- -If there is a WHERE clause, it must contain an expression of the UInt8 type. This is usually an expression with comparison and logical operators. -This expression is used for filtering data before all other transformations. - -If the table engine supports indexes, the expression is evaluated to determine whether indexes can be used. - 
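-
-A minimal sketch (the condition values are illustrative):
-
-%%SELECT count() FROM test.hits WHERE CounterID = 34 AND EventDate >= toDate('2013-01-29')%%
-
-If the table is a MergeTree table whose primary key starts with CounterID, both conditions may be usable with the index.
-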

    PREWHERE clause

- -This clause has the same meaning as the WHERE clause. The difference is in which data is read from the table. When using PREWHERE, first only the columns necessary for evaluating the PREWHERE expression are read. Then the other columns needed for running the query are read, but only from those blocks where the PREWHERE expression is true. - -It makes sense to use PREWHERE for filtering conditions that can't use an index, that involve only a minority of the columns in the query, but that filter the data strongly. This reduces the volume of data to read. - -For example, it is useful to write PREWHERE for queries that extract a large number of columns but filter on only a few of them. - -PREWHERE is only supported by *MergeTree tables. - -A query may specify both PREWHERE and WHERE. In this case, PREWHERE precedes WHERE. - -Keep in mind that it does not make much sense to put only indexed columns in PREWHERE, because when an index is used, only the data blocks that match the index are read anyway. - -If the 'optimize_move_to_prewhere' setting is set to 1 and PREWHERE is omitted, the system uses heuristics to automatically move parts of expressions from WHERE to PREWHERE. - - 
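-
-A minimal sketch (the column names are illustrative): the query extracts all columns but filters on only one, so that column is a good candidate for PREWHERE:
-
-%%SELECT * FROM hits PREWHERE RegionID = 213 WHERE EventDate >= toDate('2013-01-29')%%
-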

    GROUP BY clause

    - -This is one of the most important parts of a column-oriented DBMS. - -If there is a GROUP BY clause, it must contain a list of expressions. Each expression will be referred to here as a "key". -All the expressions in the SELECT, HAVING, and ORDER BY clauses must be calculated from keys or from aggregate functions. In other words, each column selected from the table must be used either in keys or inside aggregate functions. - -If a query contains only table columns inside aggregate functions, the GROUP BY clause can be omitted, and aggregation by an empty set of keys is assumed. - -Example: - -%%SELECT - count(), - median(FetchTiming > 60 ? 60 : FetchTiming), - count() - sum(Refresh) -FROM hits%% - -However, in contrast to standard SQL, if the table doesn't have any rows (either there aren't any at all, or there aren't any after using WHERE to filter), an empty result is returned, and not the result from one of the rows containing the initial values of aggregate functions. - -As opposed to MySQL (and conforming to standard SQL), you can't get some value of some column that is not in a key or aggregate function (except constant expressions). To work around this, you can use the 'any' aggregate function (get the first encountered value) or 'min/max'. - -Example: - -%%SELECT - domainWithoutWWW(URL) AS domain, - count(), - any(Title) AS title -- we take the first page title for each domain -FROM hits -GROUP BY domain%% - -For every different key value encountered, GROUP BY calculates a set of aggregate function values. - -GROUP BY is not supported for array columns. - -A constant can't be specified as arguments for aggregate functions. Example: sum(1). Instead of this, you can get rid of the constant. Example: count(). - - -
    WITH TOTALS modifier
    - -If the WITH TOTALS modifier is specified, another row will be calculated. This row will have key columns containing default values (zeros or empty lines), and columns of aggregate functions with the values calculated across all the rows (the "total" values). - -This extra row is output in JSON*, TabSeparated*, and Pretty* formats, separately from the other rows. In the other formats, this row is not output. - -In JSON* formats, this row is output as a separate 'totals' field. In TabSeparated formats, the row comes after the main result, preceded by an empty row (after the other data). In Pretty formats, the row is output as a separate table after the main result. - -WITH TOTALS can be run in different ways when HAVING is present. The behavior depends on the 'totals_mode' setting. -By default, totals_mode = 'before_having'. In this case, 'totals' is calculated across all rows, including the ones that don't pass through HAVING and 'max_rows_to_group_by'. - -The other alternatives include only the rows that pass through HAVING in 'totals', and behave differently with the setting 'max_rows_to_group_by' and 'group_by_overflow_mode = 'any''. - -after_having_exclusive - Don't include rows that didn't pass through 'max_rows_to_group_by'. In other words, 'totals' will have less than or the same number of rows as it would if 'max_rows_to_group_by' were omitted. - -after_having_inclusive - Include all the rows that didn't pass through 'max_rows_to_group_by' in 'totals'. In other words, 'totals' will have more than or the same number of rows as it would if 'max_rows_to_group_by' were omitted. - -after_having_auto - Count the number of rows that passed through HAVING. If it is more than a certain amount (by default, 50%), include all the rows that didn't pass through 'max_rows_to_group_by' in 'totals'. Otherwise, do not include them. - -totals_auto_threshold - By default, 0.5 is the coefficient for after_having_auto. - -If 'max_rows_to_group_by' and 'group_by_overflow_mode = 'any'' are not used, all variations of 'after_having' are the same, and you can use any of them (for example, 'after_having_auto'). - -You can use WITH TOTALS in subqueries, including subqueries in the JOIN clause. In this case, the respective total values are combined. - - -
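-
-For example, a minimal sketch (using the 'hits' table from the GROUP BY examples above); note that the totals row is calculated across all groups, not only the ten rows that pass LIMIT:
-
-%%SELECT domainWithoutWWW(URL) AS domain, count() AS pageviews FROM hits GROUP BY domain WITH TOTALS ORDER BY pageviews DESC LIMIT 10%%
-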
    external memory GROUP BY
- -It is possible to turn on spilling temporary data to disk to limit memory consumption during the execution of GROUP BY. The value of the %%max_bytes_before_external_group_by%% setting determines the maximum memory consumption before temporary data is dumped to the file system. If it is 0 (the default value), the feature is turned off. - -When using %%max_bytes_before_external_group_by%%, it is advisable to set %%max_memory_usage%% to a value approximately twice as large. The reason is that aggregation is executed in two stages: (1) reading and generating intermediate data, and (2) merging the intermediate data. Spilling data to the file system can only be performed during stage 1. If no spilling took place, stage 2 could consume up to the same amount of memory as stage 1. - -For example: if %%max_memory_usage%% is equal to 10000000000 and you want to use external aggregation, it makes sense to set %%max_bytes_before_external_group_by%% to 10000000000 and %%max_memory_usage%% to 20000000000. If dumping data to the file system happened at least once during the execution, maximum memory consumption will be just a little higher than %%max_bytes_before_external_group_by%%. - -During distributed query execution, external aggregation is performed on the remote servers. If you want the memory consumption on the originating server to be small, set %%distributed_aggregation_memory_efficient%% to 1. If %%distributed_aggregation_memory_efficient%% is turned on, then during merging of the dumped data, and also during merging of the query results from the remote servers, total memory consumption is no more than (1/256 * the number of threads) of the total amount of memory. - -If external aggregation is turned on and total memory consumption was less than %%max_bytes_before_external_group_by%% (meaning that no spilling took place), query performance is the same as when external aggregation is turned off. If some data was dumped, execution time will be several times longer (approximately 3x). - -If you have an ORDER BY clause with a small LIMIT after a GROUP BY, then ORDER BY will not consume a significant amount of memory. But if no LIMIT is provided, don't forget to turn on external sorting (%%max_bytes_before_external_sort%%). - - 
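-
-For example, using the values from the paragraph above, set for the session before running the heavy query:
-
-%%SET max_bytes_before_external_group_by = 10000000000
-SET max_memory_usage = 20000000000%%
-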
    LIMIT N BY modifier
- -LIMIT %%N%% BY %%COLUMNS%% allows you to select the top %%N%% rows for each group of %%COLUMNS%%. %%LIMIT N BY%% is not related to the %%LIMIT%% clause. The key for %%LIMIT N BY%% can contain an arbitrary number of columns or expressions. - -Example: - -%%SELECT - domainWithoutWWW(URL) AS domain, - domainWithoutWWW(REFERRER_URL) AS referrer, - device_type, - count() cnt -FROM hits -GROUP BY domain, referrer, device_type -ORDER BY cnt DESC -LIMIT 5 BY domain, device_type -LIMIT 100 -%% - -This selects the top 5 referrers for each domain and device type pair, with a total of 100 rows. - - 

    HAVING clause

    - -Allows filtering the result received after GROUP BY, similar to the WHERE clause. -WHERE and HAVING differ in that WHERE is performed before aggregation (GROUP BY), while HAVING is performed after it. If aggregation is not performed, HAVING can't be used. - - -

    ORDER BY clause

    - -The ORDER BY clause contains a list of expressions, which can each be assigned DESC or ASC (the sorting direction). If the direction is not specified, ASC is assumed. ASC is sorted in ascending order, and DESC in descending order. The sorting direction applies to a single expression, not to the entire list. Example: %%ORDER BY Visits DESC, SearchPhrase%% - -For sorting by String values, you can specify collation (comparison). Example: %%ORDER BY SearchPhrase COLLATE 'tr'%% - for sorting by keyword in ascending order, using the Turkish alphabet, case insensitive, assuming that strings are UTF-8 encoded. COLLATE can be specified or not for each expression in ORDER BY independently. If ASC or DESC is specified, COLLATE is specified after it. When using COLLATE, sorting is always case-insensitive. - -We only recommend using COLLATE for final sorting of a small number of rows, since sorting with COLLATE is less efficient than normal sorting by bytes. - -Rows that have identical values for the list of sorting expressions are output in an arbitrary order, which can also be nondeterministic (different each time). -If the ORDER BY clause is omitted, the order of the rows is also undefined, and may be nondeterministic as well. - -When floating point numbers are sorted, NaNs are separate from the other values. Regardless of the sorting order, NaNs come at the end. In other words, for ascending sorting they are placed as if they are larger than all the other numbers, while for descending sorting they are placed as if they are smaller than the rest. - -Less RAM is used if a small enough LIMIT is specified in addition to ORDER BY. Otherwise, the amount of memory spent is proportional to the volume of data for sorting. For distributed query processing, if GROUP BY is omitted, sorting is partially done on remote servers, and the results are merged on the requestor server. This means that for distributed sorting, the volume of data to sort can be greater than the amount of memory on a single server. - -If there is not enough RAM, it is possible to perform sorting in external memory (creating temporary files on a disk). Use the setting %%max_bytes_before_external_sort%% for this purpose. If it is set to 0 (the default), external sorting is disabled. If it is enabled, when the volume of data to sort reaches the specified number of bytes, the collected data is sorted and dumped into a temporary file. After all data is read, all the sorted files are merged and the results are output. Files are written to the /var/lib/clickhouse/tmp/ directory in the config (by default, but you can use the 'tmp_path' parameter to change this setting). - -Running a query may use more memory than 'max_bytes_before_external_sort'. For this reason, this setting must have a value significantly smaller than 'max_memory_usage'. As an example, if your server has 128 GB of RAM and you need to run a single query, set 'max_memory_usage' to 100 GB, and 'max_bytes_before_external_sort' to 80 GB. - -External sorting works much less effectively than sorting in RAM. - -

    SELECT clause

- -The expressions specified in the SELECT clause are analyzed after the calculations for all the clauses listed above are completed. -More specifically, if there are aggregate functions, the expressions above the aggregate functions are analyzed. The aggregate functions and everything below them are calculated during aggregation (GROUP BY). These expressions work as if they were applied to separate rows in the result. - 

    DISTINCT clause

- -If DISTINCT is specified, only a single row will remain out of each set of fully matching rows in the result. -The result will be the same as if GROUP BY were specified across all the fields specified in SELECT without aggregate functions. But there are several differences from GROUP BY: -- DISTINCT can be applied together with GROUP BY. -- When ORDER BY is omitted and LIMIT is defined, the query stops running immediately after the required number of different rows has been read. In this case, using DISTINCT is much more efficient. -- Data blocks are output as they are processed, without waiting for the entire query to finish running. - -DISTINCT is not supported if SELECT has at least one array column. - 

    LIMIT clause

    - -LIMIT m allows you to select the first 'm' rows from the result. -LIMIT n, m allows you to select the first 'm' rows from the result after skipping the first 'n' rows. - -'n' and 'm' must be non-negative integers. - -If there isn't an ORDER BY clause that explicitly sorts results, the result may be arbitrary and nondeterministic. - - -

    UNION ALL clause

    - -You can use UNION ALL to combine any number of queries. Example: - -%% -SELECT CounterID, 1 AS table, toInt64(count()) AS c - FROM test.hits - GROUP BY CounterID - -UNION ALL - -SELECT CounterID, 2 AS table, sum(Sign) AS c - FROM test.visits - GROUP BY CounterID - HAVING c > 0 -%% - -Only UNION ALL is supported. The regular UNION (UNION DISTINCT) is not supported. If you need UNION DISTINCT, you can write SELECT DISTINCT from a subquery containing UNION ALL. - -Queries that are parts of UNION ALL can be run simultaneously, and their results can be mixed together. - -The structure of results (the number and type of columns) must match for the queries, but the column names can differ. In this case, the column names for the final result will be taken from the first query. - -Queries that are parts of UNION ALL can't be enclosed in brackets. ORDER BY and LIMIT are applied to separate queries, not to the final result. If you need to apply a conversion to the final result, you can put all the queries with UNION ALL in a subquery in the FROM clause. - - -
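-
-For example, a sketch of applying ORDER BY and LIMIT to the combined result by wrapping the UNION ALL example above in a subquery in the FROM clause:
-
-%%SELECT CounterID, table, c
-FROM
-(
-    SELECT CounterID, 1 AS table, toInt64(count()) AS c FROM test.hits GROUP BY CounterID
-    UNION ALL
-    SELECT CounterID, 2 AS table, sum(Sign) AS c FROM test.visits GROUP BY CounterID
-)
-ORDER BY c DESC
-LIMIT 10%%
-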

    INTO OUTFILE clause

- -Add the %%INTO OUTFILE filename%% clause (where filename is a string literal) to redirect query output to the file 'filename'. -In contrast to MySQL, the file is created on the client host. The query will fail if a file with the same name already exists. -INTO OUTFILE is available in the command-line client and clickhouse-local (a query sent via the HTTP interface will fail). - -The default output format is TabSeparated (the same as in the batch mode of the command-line client). - 
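-
-A minimal sketch (the file name is illustrative):
-
-%%SELECT UserID, count() AS c FROM test.hits GROUP BY UserID ORDER BY c DESC LIMIT 100 INTO OUTFILE 'top_users.tsv' FORMAT TabSeparated%%
-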

    FORMAT clause

    - -Specify 'FORMAT format' to get data in any specified format. -You can use this for convenience, or for creating dumps. For more information, see the section "Formats". -If the FORMAT clause is omitted, the default format is used, which depends on both the settings and the interface used for accessing the DB. For the HTTP interface and the command-line client in batch mode, the default format is TabSeparated. For the command-line client in interactive mode, the default format is PrettyCompact (it has attractive and compact tables). - -When using the command-line client, data is passed to the client in an internal efficient format. The client independently interprets the FORMAT clause of the query and formats the data itself (thus relieving the network and the server from the load). - - -

    IN operators

    - -The %%IN%%, %%NOT IN%%, %%GLOBAL IN%%, and %%GLOBAL NOT IN%% operators are covered separately, since their functionality is quite rich. - -The left side of the operator is either a single column or a tuple. - -Examples: - -%%SELECT UserID IN (123, 456) FROM ...%% -%%SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ...%% - -If the left side is a single column that is in the index, and the right side is a set of constants, the system uses the index for processing the query. - -Don't list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section "External data for query processing"), then use a subquery. - -The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (shown in the examples above), or the name of a database table or SELECT subquery in brackets. - -If the right side of the operator is the name of a table (for example, %%UserID IN users%%), this is equivalent to the subquery %%UserID IN (SELECT * FROM users)%%. Use this when working with external data that is sent along with the query. For example, the query can be sent together with a set of user IDs loaded to the 'users' temporary table, which should be filtered. - -If the right side of the operator is a table name that has the Set engine (a prepared data set that is always in RAM), the data set will not be created over again for each query. - -The subquery may specify more than one column for filtering tuples. -Example: -%%SELECT (CounterID, UserID) IN (SELECT CounterID, UserID FROM ...) FROM ...%% - -The columns to the left and right of the %%IN%% operator should have the same type. - -The IN operator and subquery may occur in any part of the query, including in aggregate functions and lambda functions. -Example: - -%%SELECT - EventDate, - avg(UserID IN - ( - SELECT UserID - FROM test.hits - WHERE EventDate = toDate('2014-03-17') - )) AS ratio -FROM test.hits -GROUP BY EventDate -ORDER BY EventDate ASC - -┌──EventDate─┬────ratio─┐ -│ 2014-03-17 │ 1 │ -│ 2014-03-18 │ 0.807696 │ -│ 2014-03-19 │ 0.755406 │ -│ 2014-03-20 │ 0.723218 │ -│ 2014-03-21 │ 0.697021 │ -│ 2014-03-22 │ 0.647851 │ -│ 2014-03-23 │ 0.648416 │ -└────────────┴──────────┘ -%% -- for each day after March 17th, count the percentage of pageviews made by users who visited the site on March 17th. - -A subquery in the IN clause is always run just one time on a single server. There are no dependent subqueries. - - -

    Distributed subqueries

    - -There are two versions of INs with subqueries (and for JOINs): the regular %%IN%% / %%JOIN%%, and %%GLOBAL IN%% / %%GLOBAL JOIN%%. They differ in how they are run for distributed query processing. - -When using the regular %%IN%%, the query is sent to remote servers, and each of them runs the subqueries in the IN or JOIN clause. - -When using %%GLOBAL IN%% / %%GLOBAL JOIN%%, first all the subqueries for %%GLOBAL IN%% / %%GLOBAL JOIN%% are run, and the results are collected in temporary tables. Then the temporary tables are sent to each remote server, where the queries are run using this temporary data. - -For a non-distributed query, use the regular %%IN%% / %%JOIN%%. - - -Be careful when using subqueries in the %%IN%% / %%JOIN%% clauses for distributed query processing. - -Let's look at some examples. Assume that each server in the cluster has a normal local_table. Each server also has a distributed_table table with the Distributed type, which looks at all the servers in the cluster. - -For a query to the distributed_table, the query will be sent to all the remote servers and run on them using the local_table. - -For example, the query -%%SELECT uniq(UserID) FROM distributed_table%% -will be sent to all the remote servers as -%%SELECT uniq(UserID) FROM local_table%% -and run on each of them in parallel, until it reaches the stage where intermediate results can be combined. Then the intermediate results will be returned to the requestor server and merged on it, and the final result will be sent to the client. - -Now let's examine a query with IN: -%%SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34)%% -- calculates the overlap in the audiences of two websites. - -This query will be sent to all the remote servers as -%%SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34)%% -In other words, the data set in the %%IN%% clause will be collected on each server independently, only across the data that is stored locally on each of the servers. - -This will work correctly and optimally if you are prepared for this case and have spread data across the cluster servers such that the data for a single UserID resides entirely on a single server. In this case, all the necessary data will be available locally on each server. Otherwise, the result will be inaccurate. We refer to this variation of the query as "local IN". - -To correct how the query works when data is spread randomly across the cluster servers, you could specify distributed_table inside a subquery. The query would look like this: -%%SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34)%% - -This query will be sent to all remote servers as -%%SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34)%% -Each of the remote servers will start running the subquery. Since the subquery uses a distributed table, each remote server will re-send the subquery to every remote server, as -%%SELECT UserID FROM local_table WHERE CounterID = 34%% -For example, if you have a cluster of 100 servers, executing the entire query will require 10,000 elementary requests, which is generally considered unacceptable. - -In such cases, you should always use %%GLOBAL IN%% instead of %%IN%%. 
Let's look at how it works for the query -%%SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID GLOBAL IN (SELECT UserID FROM distributed_table WHERE CounterID = 34)%% - -The requestor server will execute the subquery -%%SELECT UserID FROM distributed_table WHERE CounterID = 34%% -and the result will be put in a temporary table in RAM. Then a query will be sent to each remote server as -%%SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID GLOBAL IN _data1%% -and the temporary table '_data1' will be sent to every remote server together with the query (the name of the temporary table is implementation-defined). - -This is more efficient than using the normal IN. However, keep the following points in mind: - -1. When creating a temporary table, data is not made unique. To reduce the volume of data transmitted over the network, specify %%DISTINCT%% in the subquery. (You don't need to do this for a normal IN.) -2. The temporary table will be sent to all the remote servers. Transmission does not account for network topology. For example, if 10 remote servers reside in a datacenter that is very remote in relation to the requestor server, the data will be sent 10 times over the channel to the remote datacenter. Try to avoid large data sets when using %%GLOBAL IN%%. -3. When transmitting data to remote servers, restrictions on network bandwidth are not configurable. You might overload the network. -4. Try to distribute data across servers so that you don't need to use %%GLOBAL IN%% on a regular basis. -5. If you need to use %%GLOBAL IN%% often, plan the ClickHouse cluster so that each datacenter has at least one replica of each shard and a fast network between the replicas - then a query can be processed by transferring data only within a single datacenter. - -It also makes sense to specify a local table in the %%GLOBAL IN%% clause, in case this local table is only available on the requestor server and you want to use data from it on remote servers. - - 
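-
-For example, applying point 1 to the query above - the only change from the original example is the added DISTINCT:
-
-%%SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID GLOBAL IN (SELECT DISTINCT UserID FROM distributed_table WHERE CounterID = 34)%%
-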

    Extreme values

    - -In addition to results, you can also get minimum and maximum values for the results columns. To do this, set the 'extremes' setting to '1'. Minimums and maximums are calculated for numeric types, dates, and dates with times. For other columns, the default values are output. - -An extra two rows are calculated - the minimums and maximums, respectively. These extra two rows are output in JSON*, TabSeparated*, and Pretty* formats, separate from the other rows. They are not output for other formats. - -In JSON* formats, the extreme values are output in a separate 'extremes' field. In TabSeparated formats, the row comes after the main result, and after 'totals' if present. It is preceded by an empty row (after the other data). In Pretty formats, the row is output as a separate table after the main result, and after 'totals' if present. - -Extreme values are calculated for rows that have passed through LIMIT. However, when using 'LIMIT offset, size', the rows before 'offset' are included in 'extremes'. In stream requests, the result may also include a small number of rows that passed through LIMIT. - - -

    Notes

- -The GROUP BY and ORDER BY clauses do not support positional arguments. This contradicts MySQL, but conforms to standard SQL. -For example, 'GROUP BY 1, 2' will be interpreted as grouping by constants (i.e. aggregation of all rows into one). - -You can use synonyms (AS aliases) in any part of a query. - -You can put an asterisk in any part of a query instead of an expression. When the query is analyzed, the asterisk is expanded to a list of all table columns (excluding the MATERIALIZED and ALIAS columns). There are only a few cases when using an asterisk is justified: -- When creating a table dump. -- For tables containing just a few columns, such as system tables. -- For getting information about what columns are in a table. In this case, set 'LIMIT 1'. But it is better to use the DESC TABLE query. -- When there is strong filtering on a small number of columns using PREWHERE. -- In subqueries (since columns that aren't needed for the external query are excluded from subqueries). -In all other cases, we don't recommend using the asterisk, since it only gives you the drawbacks of a columnar DBMS instead of the advantages. - -===KILL QUERY=== -%%KILL QUERY WHERE <where expression to SELECT FROM system.processes query> [SYNC|ASYNC|TEST] [FORMAT format]%% -Attempts to terminate currently executing queries. -The queries to terminate are selected from the %%system.processes%% table according to the expression after the WHERE keyword. - -Examples: -%%KILL QUERY WHERE query_id='2-857d-4a57-9ee0-327da5d60a90'%% -Terminates all queries with the specified %%query_id%%. - -%%KILL QUERY WHERE user='username' SYNC%% -Synchronously terminates all queries of the user %%username%%. - -Readonly users can only kill their own queries. - 
    -
    -

    External data for query processing

    -
    -
    - -ClickHouse allows sending a server the data that is needed for processing a query, together with a SELECT query. This data is put in a temporary table (see the section "Temporary tables") and can be used in the query (for example, in IN operators). - -For example, if you have a text file with important user identifiers, you can upload it to the server along with a query that uses filtration by this list. - -If you need to run more than one query with a large volume of external data, don't use this feature. It is better to upload the data to the DB ahead of time. - -External data can be uploaded using the command-line client (in non-interactive mode), or using the HTTP interface. - -In the command-line client, you can specify a parameters section in the format - -%%--external --file=... [--name=...] [--format=...] [--types=...|--structure=...]%% - -You may have multiple sections like this, for the number of tables being transmitted. - ---external - Marks the beginning of the section. ---file - Path to the file with the table dump, or %%-%%, which refers to stdin. -Only a single table can be retrieved from stdin. - -The following parameters are optional: ---name - Name of the table. If omitted, %%_data%% is used. ---format - Data format in the file. If omitted, %%TabSeparated%% is used. - -One of the following parameters is required: ---types - A comma-separated list of column types. For example, %%UInt64,String%%. Columns will be named %%_1%%, %%_2%%, ... ---structure - Table structure, in the format %%UserID UInt64, URL String%%. Defines the column names and types. - -The files specified in %%file%% will be parsed by the format specified in %%format%%, using the data types specified in %%types%% or %%structure%%. The table will be uploaded to the server and accessible there as a temporary table with the name %%name%%. - -Examples: - -%%echo -ne "1\n2\n3\n" | clickhouse-client --query="SELECT count() FROM test.visits WHERE TraficSourceID IN _data" --external --file=- --types=Int8 -849897 -%% - -%%cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, count() AS c FROM passwd GROUP BY shell ORDER BY c DESC" --external --file=- --name=passwd --structure='login String, unused String, uid UInt16, gid UInt16, comment String, home String, shell String' -/bin/sh 20 -/bin/false 5 -/bin/bash 4 -/usr/sbin/nologin 1 -/bin/sync 1 -%% - -When using the HTTP interface, external data is passed in the multipart/form-data format. Each table is transmitted as a separate file. The table name is taken from the file name. The 'query_string' passes the parameters 'name_format', 'name_types', and 'name_structure', where name is the name of the table that these parameters correspond to. The meaning of the parameters is the same as when using the command-line client. - -Example: - -
%%
cat /etc/passwd | sed 's/:/\t/g' > passwd.tsv

curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+count()+AS+c+FROM+passwd+GROUP+BY+shell+ORDER+BY+c+DESC&passwd_structure=login+String,+unused+String,+uid+UInt16,+gid+UInt16,+comment+String,+home+String,+shell+String'
/bin/sh 20
/bin/false      5
/bin/bash       4
/usr/sbin/nologin       1
/bin/sync       1
%%
    - -For distributed query processing, the temporary tables are sent to all the remote servers. - -
    -
    -

    Table engines

    -
    -
    - -The table engine (type of table) determines: -- How and where data is stored - where to write it to, and where to read it from. -- Which queries are supported, and how. -- Concurrent data access. -- Use of indexes, if present. -- Whether multithreaded request execution is possible. -- Data replication. -- When reading data, the engine is only required to extract the necessary set of columns. However, in some cases, the query may be partially processed inside the table engine. - -Note that for most serious tasks, you should use engines from the MergeTree family. - - -==TinyLog== - -The simplest table engine, which stores data on a disk. -Each column is stored in a separate compressed file. -When writing, data is appended to the end of files. -Concurrent data access is not restricted in any way: -- If you are simultaneously reading from a table and writing to it in a different query, the read operation will complete with an error. -- If you are writing to a table in multiple queries simultaneously, the data will be broken. -The typical way to use this table is write-once: first just write the data one time, then read it as many times as needed. -Queries are executed in a single stream. In other words, this engine is intended for relatively small tables (recommended up to 1,000,000 rows). -It makes sense to use this table engine if you have many small tables, since it is simpler than the Log engine (fewer files need to be opened). -The situation when you have a large number of small tables guarantees poor productivity, but may already be used when working with another DBMS, and you may find it easier to switch to using TinyLog types of tables. -Indexes are not supported. - -In Yandex.Metrica, TinyLog tables are used for intermediary data that is processed in small batches. - - -==Log== - -Log differs from TinyLog in that a small file of "marks" resides with the column files. These marks are written on every data block and contain offsets - where to start reading the file in order to skip the specified number of rows. This makes it possible to read table data in multiple threads. For concurrent data access, the read operations can be performed simultaneously, while write operations block reads and each other. -The Log engine does not support indexes. Similarly, if writing to a table failed, the table is broken, and reading from it returns an error. The Log engine is appropriate for temporary data, write-once tables, and for testing or demonstration purposes. - - -==Memory== - -The Memory engine stores data in RAM, in uncompressed form. Data is stored in exactly the same form as it is received when read. In other words, reading from this table is completely free. -Concurrent data access is synchronized. Locks are short: read and write operations don't block each other. -Indexes are not supported. Reading is parallelized. -Maximal productivity (over 10 GB/sec) is reached on simple queries, because there is no reading from the disk, decompressing, or deserializing data. (We should note that in many cases, the productivity of the MergeTree engine is almost as high.) -When restarting a server, data disappears from the table and the table becomes empty. -Normally, using this table engine is not justified. However, it can be used for tests, and for tasks where maximum speed is required on a relatively small number of rows (up to approximately 100,000,000). 
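As a minimal sketch (the table and column names are hypothetical), a Memory table is created with just the engine name and no parameters; TinyLog and Log tables are declared the same way:

%%CREATE TABLE scratch (id UInt64, value String) ENGINE = Memory%%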
- -The Memory engine is used by the system for temporary tables with external query data (see the section "External data for processing a query"), and for implementing GLOBAL IN (see the section "IN operators"). - - -==Merge== - -The Merge engine (not to be confused with MergeTree) does not store data itself, but allows reading from any number of other tables simultaneously. -Reading is automatically parallelized. Writing to a table is not supported. When reading, the indexes of tables that are actually being read are used, if they exist. -The Merge engine accepts parameters: the database name and a regular expression for tables. Example: - -%%Merge(hits, '^WatchLog')%% - -- Data will be read from the tables in the 'hits' database with names that match the regex '^WatchLog'. - -Instead of the database name, you can use a constant expression that returns a string. For example, %%currentDatabase()%%. - -Regular expressions are re2 (similar to PCRE), case-sensitive. See the notes about escaping symbols in regular expressions in the "match" section. - -When selecting tables to read, the Merge table itself will not be selected, even if it matches the regex. This is to avoid loops. -It is possible to create two Merge tables that will endlessly try to read each others' data. But don't do this. - -The typical way to use the Merge engine is for working with a large number of TinyLog tables as if with a single table. - -===Virtual columns=== - -Virtual columns are columns that are provided by the table engine, regardless of the table definition. In other words, these columns are not specified in CREATE TABLE, but they are accessible for SELECT. - -Virtual columns differ from normal columns in the following ways: -- They are not specified in table definitions. -- Data can't be added to them with INSERT. -- When using INSERT without specifying the list of columns, virtual columns are ignored. -- They are not selected when using the asterisk (SELECT *). -- Virtual columns are not shown in SHOW CREATE TABLE and DESC TABLE queries. - -A Merge table contains the virtual column _table of the String type. (If the table already has a '_table' column, the virtual column is named '_table1', and if it already has '_table1', it is named '_table2', and so on.) It contains the name of the table that data was read from. - -If the WHERE or PREWHERE clause contains conditions for the '_table' column that do not depend on other table columns (as one of the conjunction elements, or as an entire expression), these conditions are used as an index. The conditions are performed on a data set of table names to read data from, and the read operation will be performed from only those tables that the condition was triggered on. - - -==Distributed== - -The Distributed engine does not store data itself, but allows distributed query processing on multiple servers. -Reading is automatically parallelized. During a read, the table indexes on remote servers are used, if there are any. -The Distributed engine accepts parameters: the cluster name in the server's config file, the name of a remote database, the name of a remote table, and (optionally) a sharding key. -Example: - -%%Distributed(logs, default, hits[, sharding_key])%% - -- Data will be read from all servers in the 'logs' cluster, from the 'default.hits' table located on every server in the cluster. -Data is not only read, but is partially processed on the remote servers (to the extent that this is possible). 
-For example, for a query with GROUP BY, data will be aggregated on remote servers, and the intermediate states of aggregate functions will be sent to the requestor server. Then data will be further aggregated. - -Instead of the database name, you can use a constant expression that returns a string. For example, %%currentDatabase()%%. - -logs - The cluster name in the server's config file. - -Clusters are set like this: - -%% -<remote_servers> - <logs> - <shard> - <!-- Optional. Shard weight when writing data. By default, 1. --> - <weight>1</weight> - <!-- Optional. Whether to write data to just one of the replicas. By default, false - write data to all of the replicas. --> - <internal_replication>false</internal_replication> - <replica> - <host>example01-01-1</host> - <port>9000</port> - </replica> - <replica> - <host>example01-01-2</host> - <port>9000</port> - </replica> - </shard> - <shard> - <weight>2</weight> - <internal_replication>false</internal_replication> - <replica> - <host>example01-02-1</host> - <port>9000</port> - </replica> - <replica> - <host>example01-02-2</host> - <port>9000</port> - </replica> - </shard> - </logs> -</remote_servers> -%% - -Here a cluster is defined with the name 'logs' that consists of two shards, each of which contains two replicas. Shards refer to the servers that contain different parts of the data (in order to read all the data, you must access all the shards). -Replicas are duplicating servers (in order to read all the data, you can access the data on any one of the replicas). - -For each server, there are several parameters: mandatory: 'host', 'port', and optional: 'user', 'password'. -host - address of remote server. May be specified as domain name or IPv4 or IPv6 address. If you specify domain, server will perform DNS lookup at startup, and result will be cached till server shutdown. If DNS request is failed, server won't start. If you are changing DNS records, restart the server for new records to take effect. -port - TCP-port for interserver communication (tcp_port in configuration file, usually 9000). Don't get confused with http_port. -user - user name to connect to remote server. By default user is 'default'. This user must have access rights to connect to remote server. Access rights are managed in users.xml configuration file. For additional info, consider "Access rights" section. -password - password to log in to remote server, in plaintext. Default is empty string. - -When specifying replicas, one of the available replicas will be selected for each of the shards when reading. You can configure the algorithm for load balancing (the preference for which replica to access) - see the 'load_balancing' setting. -If the connection with the server is not established, there will be an attempt to connect with a short timeout. If the connection failed, the next replica will be selected, and so on for all the replicas. If the connection attempt failed for all the replicas, the attempt will be repeated the same way, several times. -This works in favor of resiliency, but does not provide complete fault tolerance: a remote server might accept the connection, but might not work, or work poorly. - -You can specify just one of the shards (in this case, query processing should be called remote, rather than distributed) or up to any number of shards. In each shard, you can specify from one to any number of replicas. You can specify a different number of replicas for each shard. - -You can specify as many clusters as you wish in the configuration. 
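For the 'logs' cluster defined above, a Distributed table over the 'default.hits' tables might be declared as follows (a sketch: 'hits_all' is a hypothetical name, and rand() is just one possible sharding key, as discussed below):

%%CREATE TABLE hits_all AS default.hits ENGINE = Distributed(logs, default, hits, rand())%%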
To view your clusters, use the 'system.clusters' table.

The Distributed engine allows working with a cluster like a local server. However, the cluster is inextensible: you must write its configuration in the server config file (even better, for all the cluster's servers).

There is no support for Distributed tables that look at other Distributed tables (except in cases when a Distributed table only has one shard). As an alternative, make the Distributed table look at the "final" tables.

The Distributed engine requires writing clusters to the config file. Clusters from the config file are updated on the fly, without a server restart. If you need to send a query to an unknown set of shards and replicas each time, you don't need to create a Distributed table - use the 'remote' table function instead. See the section "Table functions".

There are two methods for writing data to a cluster:

First, you can define which servers to write which data to, and perform the write directly on each shard. In other words, perform INSERT in the tables that the Distributed table "looks at".
This is the most flexible solution - you can use any sharding scheme, which could be non-trivial due to the requirements of the subject area.
This is also the most optimal solution, since data can be written to different shards completely independently.

Second, you can perform INSERT in a Distributed table. In this case, the table will distribute the inserted data across servers itself.
In order to write to a Distributed table, it must have a sharding key set (the last parameter). In addition, if there is only one shard, the write operation works without specifying the sharding key, since it doesn't have any meaning in this case.

Each shard can have a weight defined in the config file. By default, the weight is equal to one. Data is distributed across shards in an amount proportional to the shard weight. For example, if there are two shards and the first has a weight of 9 while the second has a weight of 10, the first will be sent 9 / 19 of the rows, and the second will be sent 10 / 19.

Each shard can have the 'internal_replication' parameter defined in the config file.

If this parameter is set to 'true', the write operation selects the first healthy replica and writes data to it. Use this alternative if the Distributed table "looks at" replicated tables. In other words, if the table where data will be written is going to replicate it itself.

If it is set to 'false' (the default), data is written to all replicas. In essence, this means that the Distributed table replicates data itself. This is worse than using replicated tables, because the consistency of replicas is not checked, and over time they will contain slightly different data.

To select the shard that a row of data is sent to, the sharding expression is evaluated, and its remainder is taken from dividing it by the total weight of the shards. The row is sent to the shard that corresponds to the half-interval of remainders from 'prev_weights' to 'prev_weights + weight', where 'prev_weights' is the total weight of the shards with smaller numbers, and 'weight' is the weight of this shard. For example, if there are two shards, and the first has a weight of 9 while the second has a weight of 10, the row will be sent to the first shard for remainders in the range [0, 9), and to the second for remainders in the range [9, 19).
- -The sharding expression can be any expression from constants and table columns that returns an integer. For example, you can use the expression 'rand()' for random distribution of data, or 'UserID' for distribution by the remainder from dividing the user's ID (then the data of a single user will reside on a single shard, which simplifies running IN and JOIN by users). If one of the columns is not distributed evenly enough, you can wrap it in a hash function: intHash64(UserID). - -A simple remainder from division is a limited solution for sharding and isn't always appropriate. It works for medium and large volumes of data (dozens of servers), but not for very large volumes of data (hundreds of servers or more). In the latter case, use the sharding scheme required by the subject area, rather than using entries in Distributed tables. - -SELECT queries are sent to all the shards, and work regardless of how data is distributed across the shards (they can be distributed completely randomly). When you add a new shard, you don't have to transfer the old data to it. You can write new data with a heavier weight - the data will be distributed slightly unevenly, but queries will work correctly and efficiently. - -You should be concerned about the sharding scheme in the following cases: -- Queries are used that require joining data (IN or JOIN) by a specific key. If data is sharded by this key, you can use local IN or JOIN instead of GLOBAL IN or GLOBAL JOIN, which is much more efficient. -- A large number of servers is used (hundreds or more) with a large number of small queries (queries of individual clients - websites, advertisers, or partners). In order for the small queries to not affect the entire cluster, it makes sense to locate data for a single client on a single shard. Alternatively, as we've done in Yandex.Metrica, you can set up bi-level sharding: divide the entire cluster into "layers", where a layer may consist of multiple shards. Data for a single client is located on a single layer, but shards can be added to a layer as necessary, and data is randomly distributed within them. Distributed tables are created for each layer, and a single shared distributed table is created for global queries. - -Data is written asynchronously. For an INSERT to a Distributed table, the data block is just written to the local file system. The data is sent to the remote servers in the background as soon as possible. You should check whether data is sent successfully by checking the list of files (data waiting to be sent) in the table directory: -/var/lib/clickhouse/data/database/table/. - -If the server ceased to exist or had a rough restart (for example, after a device failure) after an INSERT to a Distributed table, the inserted data might be lost. If a damaged data part is detected in the table directory, it is transferred to the 'broken' subdirectory and no longer used. - - -==MergeTree== - -The MergeTree engine supports an index by primary key and by date, and provides the possibility to update data in real time. -This is the most advanced table engine in ClickHouse. Don't confuse it with the Merge engine. - -The engine accepts parameters: the name of a Date type column containing the date, a sampling expression (optional), a tuple that defines the table's primary key, and the index granularity. 
-Example: - -Example without sampling support: -%%MergeTree(EventDate, (CounterID, EventDate), 8192)%% - -Example with sampling support: -%%MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID)), 8192)%% - -A MergeTree type table must have a separate column containing the date. In this example, it is the 'EventDate' column. The type of the date column must be 'Date' (not 'DateTime'). - -The primary key may be a tuple from any expressions (usually this is just a tuple of columns), or a single expression. - -The sampling expression (optional) can be any expression. It must also be present in the primary key. The example uses a hash of user IDs to pseudo-randomly disperse data in the table for each CounterID and EventDate. In other words, when using the SAMPLE clause in a query, you get an evenly pseudo-random sample of data for a subset of users. - -The table is implemented as a set of parts. Each part is sorted by the primary key. In addition, each part has the minimum and maximum date assigned. When inserting in the table, a new sorted part is created. The merge process is periodically initiated in the background. When merging, several parts are selected, usually the smallest ones, and then merged into one large sorted part. - -In other words, incremental sorting occurs when inserting to the table. Merging is implemented so that the table always consists of a small number of sorted parts, and the merge itself doesn't do too much work. - -During insertion, data belonging to different months is separated into different parts. The parts that correspond to different months are never combined. The purpose of this is to provide local data modification (for ease in backups). - -Parts are combined up to a certain size threshold, so there aren't any merges that are too long. - -For each part, an index file is also written. The index file contains the primary key value for every 'index_granularity' row in the table. In other words, this is an abbreviated index of sorted data. - -For columns, "marks" are also written to each 'index_granularity' row so that data can be read in a specific range. - -When reading from a table, the SELECT query is analyzed for whether indexes can be used. An index can be used if the WHERE or PREWHERE clause has an expression (as one of the conjunction elements, or entirely) that represents an equality or inequality comparison operation, or if it has IN above columns that are in the primary key or date, or Boolean operators over them. - -Thus, it is possible to quickly run queries on one or many ranges of the primary key. In the example given, queries will work quickly for a specific counter, for a specific counter and range of dates, for a specific counter and date, for multiple counters and a range of dates, and so on. - -%%SELECT count() FROM table WHERE EventDate = toDate(now()) AND CounterID = 34%% -%%SELECT count() FROM table WHERE EventDate = toDate(now()) AND (CounterID = 34 OR CounterID = 42)%% -%%SELECT count() FROM table WHERE ((EventDate >= toDate('2014-01-01') AND EventDate <= toDate('2014-01-31')) OR EventDate = toDate('2014-05-01')) AND CounterID IN (101500, 731962, 160656) AND (CounterID = 101500 OR EventDate != toDate('2014-05-01'))%% - -All of these cases will use the index by date and by primary key. The index is used even for complex expressions. Reading from the table is organized so that using the index can't be slower than a full scan. 
- -In this example, the index can't be used: -%%SELECT count() FROM table WHERE CounterID = 34 OR URL LIKE '%upyachka%'%% - -The index by date only allows reading those parts that contain dates from the desired range. However, a data part may contain data for many dates (up to an entire month), while within a single part the data is ordered by the primary key, which might not contain the date as the first column. Because of this, using a query with only a date condition that does not specify the primary key prefix will cause more data to be read than for a single date. - -For concurrent table access, we use multi-versioning. In other words, when a table is simultaneously read and updated, data is read from a set of parts that is current at the time of the query. There are no lengthy locks. Inserts do not get in the way of read operations. - -Reading from a table is automatically parallelized. - -The OPTIMIZE query is supported, which calls an extra merge step. - -You can use a single large table and continually add data to it in small chunks - this is what MergeTree is intended for. - -Data replication is possible for all types of tables in the MergeTree family (see the section "Data replication"). - - -==CollapsingMergeTree== - -This engine differs from MergeTree in that it allows automatic deletion, or "collapsing" certain pairs of rows when merging. - -Yandex.Metrica has normal logs (such as hit logs) and change logs. Change logs are used for incrementally calculating statistics on data that is constantly changing. Examples are the log of session changes, or logs of changes to user histories. Sessions are constantly changing in Yandex.Metrica. For example, the number of hits per session increases. We refer to changes in any object as a pair (?old values, ?new values). Old values may be missing if the object was created. New values may be missing if the object was deleted. If the object was changed, but existed previously and was not deleted, both values are present. In the change log, one or two entries are made for each change. Each entry contains all the attributes that the object has, plus a special attribute for differentiating between the old and new values. When objects change, only the new entries are added to the change log, and the existing ones are not touched. - -The change log makes it possible to incrementally calculate almost any statistics. To do this, we need to consider "new" rows with a plus sign, and "old" rows with a minus sign. In other words, incremental calculation is possible for all statistics whose algebraic structure contains an operation for taking the inverse of an element. This is true of most statistics. We can also calculate "idempotent" statistics, such as the number of unique visitors, since the unique visitors are not deleted when making changes to sessions. - -This is the main concept that allows Yandex.Metrica to work in real time. - -CollapsingMergeTree accepts an additional parameter - the name of an Int8-type column that contains the row's "sign". Example: - -%%CollapsingMergeTree(EventDate, (CounterID, EventDate, intHash32(UniqID), VisitID), 8192, Sign)%% - -Here, 'Sign' is a column containing -1 for "old" values and 1 for "new" values. - -When merging, each group of consecutive identical primary key values (columns for sorting data) is reduced to no more than one row with the column value 'sign_column = -1' (the "negative row") and no more than one row with the column value 'sign_column = 1' (the "positive row"). 
In other words, entries from the change log are collapsed. - -If the number of positive and negative rows matches, the first negative row and the last positive row are written. -If there is one more positive row than negative rows, only the last positive row is written. -If there is one more negative row than positive rows, only the first negative row is written. -Otherwise, there will be a logical error and none of the rows will be written. (A logical error can occur if the same section of the log was accidentally inserted more than once. The error is just recorded in the server log, and the merge continues.) - -Thus, collapsing should not change the results of calculating statistics. -Changes are gradually collapsed so that in the end only the last value of almost every object is left. -Compared to MergeTree, the CollapsingMergeTree engine allows a multifold reduction of data volume. - -There are several ways to get completely "collapsed" data from a CollapsingMergeTree table: -1. Write a query with GROUP BY and aggregate functions that accounts for the sign. For example, to calculate quantity, write 'sum(Sign)' instead of 'count()'. To calculate the sum of something, write 'sum(Sign * x)' instead of 'sum(x)', and so on, and also add 'HAVING sum(Sign) > 0'. Not all amounts can be calculated this way. For example, the aggregate functions 'min' and 'max' can't be rewritten. -2. If you must extract data without aggregation (for example, to check whether rows are present whose newest values match certain conditions), you can use the FINAL modifier for the FROM clause. This approach is significantly less efficient. - - -==SummingMergeTree== - -This engine differs from MergeTree in that it totals data while merging. - -%%SummingMergeTree(EventDate, (OrderID, EventDate, BannerID, ...), 8192)%% - -The columns to total are implicit. When merging, all rows with the same primary key value (in the example, OrderId, EventDate, BannerID, ...) have their values totaled in numeric columns that are not part of the primary key. - -%%SummingMergeTree(EventDate, (OrderID, EventDate, BannerID, ...), 8192, (Shows, Clicks, Cost, ...))%% - -The columns to total are set explicitly (the last parameter - Shows, Clicks, Cost, ...). When merging, all rows with the same primary key value have their values totaled in the specified columns. The specified columns also must be numeric and must not be part of the primary key. - -If the values were null in all of these columns, the row is deleted. (The exception is cases when the data part would not have any rows left in it.) - -For the other rows that are not part of the primary key, the first value that occurs is selected when merging. - -Summation is not performed for a read operation. If it is necessary, write the appropriate GROUP BY. - -In addition, a table can have nested data structures that are processed in a special way. -If the name of a nested table ends in 'Map' and it contains at least two columns that meet the following criteria: -- for the first table, numeric ((U)IntN, Date, DateTime), we'll refer to it as 'key' -- for other tables, arithmetic ((U)IntN, Float32/64), we'll refer to it as '(values...)' -then this nested table is interpreted as a mapping of key => (values...), and when merging its rows, the elements of two data sets are merged by 'key' with a summation of the corresponding (values...). 
-Examples: - -%% -[(1, 100)] + [(2, 150)] -> [(1, 100), (2, 150)] -[(1, 100)] + [(1, 150)] -> [(1, 250)] -[(1, 100)] + [(1, 150), (2, 150)] -> [(1, 250), (2, 150)] -[(1, 100), (2, 150)] + [(1, -100)] -> [(2, 150)] -%% - -For nested data structures, you don't need to specify the columns as a list of columns for totaling. - -This table engine is not particularly useful. Remember that when saving just pre-aggregated data, you lose some of the system's advantages. - - -==AggregatingMergeTree== - -This engine differs from MergeTree in that the merge combines the states of aggregate functions stored in the table for rows with the same primary key value. - -In order for this to work, it uses the AggregateFunction data type and the -State and -Merge modifiers for aggregate functions. Let's examine it more closely. - -There is an AggregateFunction data type, which is a parametric data type. As parameters, the name of the aggregate function is passed, then the types of its arguments. -Examples: - -%%CREATE TABLE t -( - column1 AggregateFunction(uniq, UInt64), - column2 AggregateFunction(anyIf, String, UInt8), - column3 AggregateFunction(quantiles(0.5, 0.9), UInt64) -) ENGINE = ... -%% - -This type of column stores the state of an aggregate function. - -To get this type of value, use aggregate functions with the 'State' suffix. -Example: uniqState(UserID), quantilesState(0.5, 0.9)(SendTiming) - in contrast to the corresponding 'uniq' and 'quantiles' functions, these functions return the state, rather than the prepared value. In other words, they return an AggregateFunction type value. - -An AggregateFunction type value can't be output in Pretty formats. In other formats, these types of values are output as implementation-specific binary data. The AggregateFunction type values are not intended for output or saving in a dump. - -The only useful thing you can do with AggregateFunction type values is combine the states and get a result, which essentially means to finish aggregation. Aggregate functions with the 'Merge' suffix are used for this purpose. -Example: uniqMerge(UserIDState), where UserIDState has the AggregateFunction type. - -In other words, an aggregate function with the 'Merge' suffix takes a set of states, combines them, and returns the result. -As an example, these two queries return the same result: - -%%SELECT uniq(UserID) FROM table%% - -%%SELECT uniqMerge(state) FROM (SELECT uniqState(UserID) AS state FROM table GROUP BY RegionID)%% - -There is an AggregatingMergeTree engine. Its job during a merge is to combine the states of aggregate functions from different table rows with the same primary key value. - -You can't use a normal INSERT to insert a row in a table containing AggregateFunction columns, because you can't explicitly define the AggregateFunction value. Instead, use INSERT SELECT with '-State' aggregate functions for inserting data. - -With SELECT from an AggregatingMergeTree table, use GROUP BY and aggregate functions with the '-Merge' modifier in order to complete data aggregation. - -You can use AggregatingMergeTree tables for incremental data aggregation, including for aggregated materialized views. 
Example:

Creating a materialized AggregatingMergeTree view that tracks the 'test.visits' table:

%%
CREATE MATERIALIZED VIEW test.basic
ENGINE = AggregatingMergeTree(StartDate, (CounterID, StartDate), 8192)
AS SELECT
    CounterID,
    StartDate,
    sumState(Sign) AS Visits,
    uniqState(UserID) AS Users
FROM test.visits
GROUP BY CounterID, StartDate;
%%

Inserting data in the 'test.visits' table. Data will also be inserted in the view, where it will be aggregated:

%%
INSERT INTO test.visits ...
%%

Performing SELECT from the view using GROUP BY to finish data aggregation:

%%
SELECT
    StartDate,
    sumMerge(Visits) AS Visits,
    uniqMerge(Users) AS Users
FROM test.basic
GROUP BY StartDate
ORDER BY StartDate;
%%

You can create a materialized view like this and assign a normal view to it that finishes data aggregation.

Note that in most cases, using AggregatingMergeTree is not justified, since queries can be run efficiently enough on non-aggregated data.


==ReplacingMergeTree==

This engine differs from MergeTree in that it can deduplicate data by primary key while merging.

For ReplacingMergeTree, the last parameter is the optional name of a 'version' column. When merging, out of all rows with the same primary key, only one row is kept: the last row if a version column was not specified, or the row with the maximum version value if it was.

The version column must have a type from the UInt family, or Date, or DateTime.

%%ReplacingMergeTree(EventDate, (OrderID, EventDate, BannerID, ...), 8192, ver)%%

Please note that data is deduplicated only during merges. Merges are processed in the background. The exact time of a merge is unspecified, and you cannot rely on it. Some of the data may remain unmerged. Although you can trigger an extra merge with the OPTIMIZE query, it is not recommended, since OPTIMIZE will read and write a vast amount of data.

This table engine is suitable for removing duplicate data in the background to save space, but it does not guarantee deduplication.

It was developed for the needs of a department other than Yandex.Metrica.


==Null==

When writing to a Null table, data is ignored. When reading from a Null table, the response is empty.

However, you can create a materialized view on a Null table, so the data written to the table will end up in the view.


==View==

Used for implementing views (for more information, see the CREATE VIEW query). It does not store data, but only stores the specified SELECT query. When reading from a table, it runs this query (and deletes all unnecessary columns from the query).


==MaterializedView==

Used for implementing materialized views (for more information, see CREATE MATERIALIZED VIEW). For storing data, it uses a different engine that was specified when creating the view. When reading from a table, it just uses this engine.


==Set==

A data set that is always in RAM. It is intended for use on the right side of the IN operator (see the section "IN operators").

You can use INSERT to insert data in the table. New elements will be added to the data set, while duplicates will be ignored.
But you can't perform SELECT from the table. The only way to retrieve data is by using it in the right half of the IN operator.

Data is always located in RAM. For INSERT, the blocks of inserted data are also written to the directory of tables on the disk. When starting the server, this data is loaded to RAM. In other words, after restarting, the data remains in place.
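As a minimal sketch of using the Set engine (the table names and values are hypothetical), you insert keys into the table and then reference it on the right side of IN:

%%CREATE TABLE good_users (UserID UInt64) ENGINE = Set%%

%%INSERT INTO good_users VALUES (101), (102), (103)%%

%%SELECT count() FROM hits WHERE UserID IN good_users%%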
- -For a rough server restart, the block of data on the disk might be lost or damaged. In the latter case, you may need to manually delete the file with damaged data. - - -==Join== - -A prepared data structure for JOIN that is always located in RAM. - -%%Join(ANY|ALL, LEFT|INNER, k1[, k2, ...])%% - -Engine parameters: ANY|ALL - strictness, and LEFT|INNER - the type. These parameters are set without quotes and must match the JOIN that the table will be used for. k1, k2, ... are the key columns from the USING clause that the join will be made on. - -The table can't be used for GLOBAL JOINs. - -You can use INSERT to add data to the table, similar to the Set engine. For ANY, data for duplicated keys will be ignored. For ALL, it will be counted. You can't perform SELECT directly from the table. The only way to retrieve data is to use it as the "right-hand" table for JOIN. - -Storing data on the disk is the same as for the Set engine. - - -==Buffer== - -Buffers the data to write in RAM, periodically flushing it to another table. During the read operation, data is read from the buffer and the other table simultaneously. - -%%Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes)%% - -Engine parameters: -database, table - The table to flush data to. Instead of the database name, you can use a constant expression that returns a string. -num_layers - The level of parallelism. Physically, the table will be represented as 'num_layers' of independent buffers. The recommended value is 16. -min_time, max_time, min_rows, max_rows, min_bytes, and max_bytes are conditions for flushing data from the buffer. - -Data is flushed from the buffer and written to the destination table if all the 'min' conditions or at least one 'max' condition are met. -min_time, max_time - Condition for the time in seconds from the moment of the first write to the buffer. -min_rows, max_rows - Condition for the number of rows in the buffer. -min_bytes, max_bytes - Condition for the number of bytes in the buffer. - -During the write operation, data is inserted to a 'num_layers' number of random buffers. Or, if the data part to insert is large enough (greater than 'max_rows' or 'max_bytes'), it is written directly to the destination table, omitting the buffer. - -The conditions for flushing the data are calculated separately for each of the 'num_layers' buffers. For example, if num_layers = 16 and max_bytes = 100000000, the maximum RAM consumption is 1.6 GB. - -Example: - -%%CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10, 100, 10000, 1000000, 10000000, 100000000)%% - -Creating a 'merge.hits_buffer' table with the same structure as 'merge.hits' and using the Buffer engine. When writing to this table, data is buffered in RAM and later written to the 'merge.hits' table. 16 buffers are created. The data in each of them is flushed if either 100 seconds have passed, or one million rows have been written, or 100 MB of data have been written; or if simultaneously 10 seconds have passed and 10,000 rows and 10 MB of data have been written. For example, if just one row has been written, after 100 seconds it will be flushed, no matter what. But if many rows have been written, the data will be flushed sooner. - -When the server is stopped, with DROP TABLE or DETACH TABLE, buffer data is also flushed to the destination table. - -You can set empty strings in single quotation marks for the database and table name. This indicates the absence of a destination table. 
In this case, when the data flush conditions are reached, the buffer is simply cleared. This may be useful for keeping a window of data in memory. - -When reading from a Buffer table, data is processed both from the buffer and from the destination table (if there is one). -Note that the Buffer tables does not support an index. In other words, data in the buffer is fully scanned, which might be slow for large buffers. (For data in a subordinate table, the index it supports will be used.) - -If the set of columns in the Buffer table doesn't match the set of columns in a subordinate table, a subset of columns that exist in both tables is inserted. - -If the types don't match for one of the columns in the Buffer table and a subordinate table, an error message is entered in the server log and the buffer is cleared. -The same thing happens if the subordinate table doesn't exist when the buffer is flushed. - -If you need to run ALTER for a subordinate table and the Buffer table, we recommend first deleting the Buffer table, running ALTER for the subordinate table, then creating the Buffer table again. - -If the server is restarted abnormally, the data in the buffer is lost. - -PREWHERE, FINAL and SAMPLE do not work correctly for Buffer tables. These conditions are passed to the destination table, but are not used for processing data in the buffer. Because of this, we recommend only using the Buffer table for writing, while reading from the destination table. - -When adding data to a Buffer, one of the buffers is locked. This causes delays if a read operation is simultaneously being performed from the table. - -Data that is inserted to a Buffer table may end up in the subordinate table in a different order and in different blocks. Because of this, a Buffer table is difficult to use for writing to a CollapsingMergeTree correctly. To avoid problems, you can set 'num_layers' to 1. - -If the destination table is replicated, some expected characteristics of replicated tables are lost when writing to a Buffer table. The random changes to the order of rows and sizes of data parts cause data deduplication to quit working, which means it is not possible to have a reliable 'exactly once' write to replicated tables. - -Due to these disadvantages, we can only recommend using a Buffer table in rare cases. - -A Buffer table is used when too many INSERTs are received from a large number of servers over a unit of time and data can't be buffered before insertion, which means the INSERTs can't run fast enough. - -Note that it doesn't make sense to insert data one row at a time, even for Buffer tables. This will only produce a speed of a few thousand rows per second, while inserting larger blocks of data can produce over a million rows per second (see the section "Performance"). - - -==Data replication== - -===ReplicatedMergeTree=== -===ReplicatedCollapsingMergeTree=== -===ReplicatedAggregatingMergeTree=== -===ReplicatedSummingMergeTree=== - -Replication is only supported for tables in the MergeTree family. Replication works at the level of an individual table, not the entire server. A server can store both replicated and non-replicated tables at the same time. - -INSERT and ALTER are replicated (for more information, see ALTER). Compressed data is replicated, not query texts. -The CREATE, DROP, ATTACH, DETACH, and RENAME queries are not replicated. In other words, they belong to a single server. The CREATE TABLE query creates a new replicatable table on the server where the query is run. 
If this table already exists on other servers, it adds a new replica. The DROP TABLE query deletes the replica located on the server where the query is run. The RENAME query renames the table on one of the replicas. In other words, replicated tables can have different names on different replicas. - -Replication is not related to sharding in any way. Replication works independently on each shard. - -Replication is an optional feature. To use replication, set the addresses of the ZooKeeper cluster in the config file. Example: - -%% -<zookeeper> - <node index="1"> - <host>example1</host> - <port>2181</port> - </node> - <node index="2"> - <host>example2</host> - <port>2181</port> - </node> - <node index="3"> - <host>example3</host> - <port>2181</port> - </node> -</zookeeper> -%% - -Use ZooKeeper version 3.4.5 or later. For example, the version in the Ubuntu Precise package is too old. - -You can specify any existing ZooKeeper cluster - the system will use a directory on it for its own data (the directory is specified when creating a replicatable table). - -If ZooKeeper isn't set in the config file, you can't create replicated tables, and any existing replicated tables will be read-only. - -ZooKeeper isn't used for SELECT queries. In other words, replication doesn't affect the productivity of SELECT queries - they work just as fast as for non-replicated tables. - -For each INSERT query (more precisely, for each inserted block of data; the INSERT query contains a single block, or per block for every max_insert_block_size = 1048576 rows), approximately ten entries are made in ZooKeeper in several transactions. This leads to slightly longer latencies for INSERT compared to non-replicated tables. But if you follow the recommendations to insert data in batches of no more than one INSERT per second, it doesn't create any problems. The entire ClickHouse cluster used for coordinating one ZooKeeper cluster has a total of several hundred INSERTs per second. The throughput on data inserts (the number of rows per second) is just as high as for non-replicated data. - -For very large clusters, you can use different ZooKeeper clusters for different shards. However, this hasn't proven necessary on the Yandex.Metrica cluster (approximately 300 servers). - -Replication is asynchronous and multi-master. INSERT queries (as well as ALTER) can be sent to any available server. Data is inserted on this server, then sent to the other servers. Because it is asynchronous, recently inserted data appears on the other replicas with some latency. If a part of the replicas is not available, the data on them is written when they become available. If a replica is available, the latency is the amount of time it takes to transfer the block of compressed data over the network. - -There are no quorum writes. You can't write data with confirmation that it was received by more than one replica. If you write a batch of data to one replica and the server with this data ceases to exist before the data has time to get to the other replicas, this data will be lost. - -Each block of data is written atomically. The INSERT query is divided into blocks up to max_insert_block_size = 1048576 rows. In other words, if the INSERT query has less than 1048576 rows, it is made atomically. - -Blocks of data are duplicated. For multiple writes of the same data block (data blocks of the same size containing the same rows in the same order), the block is only written once. 
The reason for this is in case of network failures when the client application doesn't know if the data was written to the DB, so the INSERT query can simply be repeated. It doesn't matter which replica INSERTs were sent to with identical data - INSERTs are idempotent. This only works for the last 100 blocks inserted in a table. - -During replication, only the source data to insert is transferred over the network. Further data transformation (merging) is coordinated and performed on all the replicas in the same way. This minimizes network usage, which means that replication works well when replicas reside in different datacenters. (Note that duplicating data in different datacenters is the main goal of replication.) - -You can have any number of replicas of the same data. Yandex.Metrica uses double replication in production. Each server uses RAID-5 or RAID-6, and RAID-10 in some cases. This is a relatively reliable and convenient solution. - -The system monitors data synchronicity on replicas and is able to recover after a failure. Failover is automatic (for small differences in data) or semi-automatic (when data differs too much, which may indicate a configuration error). - - -===Creating replicated tables=== - -The 'Replicated' prefix is added to the table engine name. For example, ReplicatedMergeTree. - -Two parameters are also added in the beginning of the parameters list - the path to the table in ZooKeeper, and the replica name in ZooKeeper. - -Example: -ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/hits', '{replica}', EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID), EventTime), 8192) - -As the example shows, these parameters can contain substitutions in curly brackets. The substituted values are taken from the 'macros' section of the config file. Example: - -%% -<macros> - <layer>05</layer> - <shard>02</shard> - <replica>example05-02-1.yandex.ru</replica> -</macros> -%% - -The path to the table in ZooKeeper should be unique for each replicated table. Tables on different shards should have different paths. -In this case, the path consists of the following parts: - -%%/clickhouse/tables/%% is the common prefix. We recommend using exactly this one. - -%%{layer}-{shard}%% is the shard identifier. In this example it consists of two parts, since the Yandex.Metrica cluster uses bi-level sharding. For most tasks, you can leave just the {shard} substitution, which will be expanded to the shard identifier. - -%%hits%% is the name of the node for the table in ZooKeeper. It is a good idea to make it the same as the table name. It is defined explicitly, because in contrast to the table name, it doesn't change after a RENAME query. - -The replica name identifies different replicas of the same table. You can use the server name for this, as in the example. The name only needs to be unique within each shard. - -You can define everything explicitly instead of using substitutions. This might be convenient for testing and for configuring small clusters, but it is inconvenient when working with large clusters. - -Run CREATE TABLE on each replica. This query creates a new replicated table, or adds a new replica to an existing one. - -If you add a new replica after the table already contains some data on other replicas, the data will be copied from the other replicas to the new one after running the query. In other words, the new replica syncs itself with the others. - -To delete a replica, run DROP TABLE. 
However, only one replica is deleted - the one that resides on the server where you run the query. - - -===Recovery after failures=== - -If ZooKeeper is unavailable when a server starts, replicated tables switch to read-only mode. The system periodically attempts to connect to ZooKeeper. - -If ZooKeeper is unavailable during an INSERT, or an error occurs when interacting with ZooKeeper, an exception is thrown. - -After connecting to ZooKeeper, the system checks whether the set of data in the local file system matches the expected set of data (ZooKeeper stores this information). If there are minor inconsistencies, the system resolves them by syncing data with the replicas. - -If the system detects broken data parts (with the wrong size of files) or unrecognized parts (parts written to the file system but not recorded in ZooKeeper), it moves them to the 'detached' subdirectory (they are not deleted). Any missing parts are copied from the replicas. - -Note that ClickHouse does not perform any destructive actions such as automatically deleting a large amount of data. - -When the server starts (or establishes a new session with ZooKeeper), it only checks the quantity and sizes of all files. If the file sizes match but bytes have been changed somewhere in the middle, this is not detected immediately, but only when attempting to read the data for a SELECT query. The query throws an exception about a non-matching checksum or size of a compressed block. In this case, data parts are added to the verification queue and copied from the replicas if necessary. - -If the local set of data differs too much from the expected one, a safety mechanism is triggered. The server enters this in the log and refuses to launch. The reason for this is that this case may indicate a configuration error, such as if a replica on a shard was accidentally configured like a replica on a different shard. However, the thresholds for this mechanism are set fairly low, and this situation might occur during normal failure recovery. In this case, data is restored semi-automatically - by "pushing a button". - -To start recovery, create the node /path_to_table/replica_name/flags/force_restore_data in ZooKeeper with any content or run command to recover all replicated tables: -%%sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data%% -Then launch the server. On start, the server deletes these flags and starts recovery. - - -===Recovery after complete data loss=== - -If all data and metadata disappeared from one of the servers, follow these steps for recovery: - -1. Install ClickHouse on the server. Define substitutions correctly in the config file that contains the shard identifier and replicas, if you use them. - -2. If you had unreplicated tables that must be manually duplicated on the servers, copy their data from a replica (in the directory /var/lib/clickhouse/data/db_name/table_name/). - -3. Copy table definitions located in %%/var/lib/clickhouse/metadata/%% from a replica. If a shard or replica identifier is defined explicitly in the table definitions, correct it so that it corresponds to this replica. (Alternatively, launch the server and make all the ATTACH TABLE queries that should have been in the .sql files in %%/var/lib/clickhouse/metadata/%%.) - -4. 
Create the /path_to_table/replica_name/flags/force_restore_data node in ZooKeeper with any content or run command to recover all replicated tables: -%%sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data%% -Then launch the server (restart it if it is already running). Data will be downloaded from replicas. - -An alternative recovery option is to delete information about the lost replica from ZooKeeper ( /path_to_table/replica_name), then create the replica again as described in "Creating replicated tables". - -There is no restriction on network bandwidth during recovery. Keep this in mind if you are restoring many replicas at once. - - -===Converting from MergeTree to ReplicatedMergeTree=== - -From here on, we use "MergeTree" to refer to all the table engines in the MergeTree family, including ReplicatedMergeTree. - -If you had a MergeTree table that was manually replicated, you can convert it to a replicatable table. You might need to do this if you have already collected a large amount of data in a MergeTree table and now you want to enable replication. - -If the data differs on various replicas, first sync it, or delete this data on all the replicas except one. - -Rename the existing MergeTree table, then create a ReplicatedMergeTree table with the old name. -Move the data from the old table to the 'detached' subdirectory inside the directory with the new table data (/var/lib/clickhouse/data/db_name/table_name/). -Then run ALTER TABLE ATTACH PART on one of the replicas to add these data parts to the working set. - -If exactly the same parts exist on the other replicas, they are added to the working set on them. If not, the parts are downloaded from the replica that has them. - - -===Converting from ReplicatedMergeTree to MergeTree=== - -Create a MergeTree table with a different name. Move all the data from the directory with the ReplicatedMergeTree table data to the new table's data directory. Then delete the ReplicatedMergeTree table and restart the server. - -If you want to get rid of a ReplicatedMergeTree table without launching the server: -- Delete the corresponding .sql file in the metadata directory (%%/var/lib/clickhouse/metadata/%%). -- Delete the corresponding path in ZooKeeper (/path_to_table/replica_name). -After this, you can launch the server, create a MergeTree table, move the data to its directory, and then restart the server. - - -===Recovery when metadata in the ZooKeeper cluster is lost or damaged=== - -If you lost ZooKeeper, you can save data by moving it to an unreplicated table as described above. - - -
-
-System tables
-
    - -System tables are used for implementing part of the system's functionality, and for providing access to information about how the system is working. -You can't delete a system table (but you can perform DETACH). -System tables don't have files with data on the disk or files with metadata. The server creates all the system tables when it starts. -System tables are read-only. -System tables are located in the 'system' database. - -==system.one== - -This table contains a single row with a single 'dummy' UInt8 column containing the value 0. -This table is used if a SELECT query doesn't specify the FROM clause. -This is similar to the DUAL table found in other DBMSs. - -==system.numbers== - -This table contains a single UInt64 column named 'number' that contains almost all the natural numbers starting from zero. -You can use this table for tests, or if you need to do a brute force search. -Reads from this table are not parallelized. - -==system.numbers_mt== - -The same as 'system.numbers' but reads are parallelized. The numbers can be returned in any order. -Used for tests. - -==system.tables== - -This table contains the String columns 'database', 'name', and 'engine' and DateTime column metadata_modification_time. -Each table that the server knows about is entered in the 'system.tables' table. -There is an issue: table engines are specified without parameters. -This system table is used for implementing SHOW TABLES queries. - -==system.databases== - -This table contains a single String column called 'name' - the name of a database. -Each database that the server knows about has a corresponding entry in the table. -This system table is used for implementing the SHOW DATABASES query. - -==system.processes== - -This system table is used for implementing the SHOW PROCESSLIST query. -Columns: -%% -user String - Name of the user who made the request. For distributed query processing, this is the user who helped the requestor server send the query to this server, not the user who made the distributed request on the requestor server. - -address String - The IP address the request was made from. The same for distributed processing. - -elapsed Float64 - The time in seconds since request execution started. - -rows_read UInt64 - The number of rows read from the table. For distributed processing, on the requestor server, this is the total for all remote servers. - -bytes_read UInt64 - The number of uncompressed bytes read from the table. For distributed processing, on the requestor server, this is the total for all remote servers. - -total_rows_approx UInt64 - The approximation of the total number of rows that should be read. For distributed processing, on the requestor server, this is the total for all remote servers. It can be updated during request processing, when new sources to process become known. - -memory_usage UInt64 - How much memory the request uses. It might not include some types of dedicated memory. - -query String - The query text. For INSERT, it doesn't include the data to insert. - -query_id String - Query ID, if defined. -%% - -==system.events== - -Contains information about the number of events that have occurred in the system. This is used for profiling and monitoring purposes. -Example: The number of processed SELECT queries. -Columns: 'event String' - the event name, and 'value UInt64' - the quantity. - -==system.metrics== -==system.asynchronous_metrics== -Like system.events, but show info about currently executing events or consuming resources. 
-Example: The number of currently executing SELECT queries; memory consumption of the system. - -==system.clusters== - -Contains information about clusters available in the config file and the servers in them. -Columns: - -%% -cluster String - Cluster name. -shard_num UInt32 - Number of a shard in the cluster, starting from 1. -shard_weight UInt32 - Relative weight of a shard when writing data. -replica_num UInt32 - Number of a replica in the shard, starting from 1. -host_name String - Host name as specified in the config. -host_address String - Host's IP address obtained from DNS. -port UInt16 - The port used to access the server. -user String - The username to use for connecting to the server. -%% - -==system.columns== - -Contains information about the columns in all tables. -You can use this table to get information similar to DESCRIBE TABLE, but for multiple tables at once. - -%% -database String - Name of the database the table is located in. -table String - Table name. -name String - Column name. -type String - Column type. -default_type String - Expression type (DEFAULT, MATERIALIZED, ALIAS) for the default value, or an empty string if it is not defined. -default_expression String - Expression for the default value, or an empty string if it is not defined. -%% - -==system.dictionaries== - -Contains information about external dictionaries. -Columns: - -%% -name String - Dictionary name. -type String - Dictionary type: Flat, Hashed, Cache. -origin String - Path to the config file where the dictionary is described. -attribute.names Array(String) - Array of attribute names provided by the dictionary. -attribute.types Array(String) - Corresponding array of attribute types provided by the dictionary. -has_hierarchy UInt8 - Whether the dictionary is hierarchical. -bytes_allocated UInt64 - The amount of RAM used by the dictionary. -hit_rate Float64 - For cache dictionaries, the percent of usage for which the value was in the cache. -element_count UInt64 - The number of items stored in the dictionary. -load_factor Float64 - The filled percentage of the dictionary (for a hashed dictionary, it is the filled percentage of the hash table). -creation_time DateTime - Time spent for the creation or last successful reload of the dictionary. -last_exception String - Text of an error that occurred when creating or reloading the dictionary, if the dictionary couldn't be created. -source String - Text describing the data source for the dictionary. -%% - -Note that the amount of memory used by the dictionary is not proportional to the number of items stored in it. So for flat and cached dictionaries, all the memory cells are pre-assigned, regardless of how full the dictionary actually is. - - -==system.functions== - -Contains information about normal and aggregate functions. -Columns: - -%% -name String - Function name. -is_aggregate UInt8 - Whether it is an aggregate function. -%% - -==system.merges== - -Contains information about merges currently in process for tables in the MergeTree family. -Columns: - -%% -database String - Name of the database the table is located in. -table String - Name of the table. -elapsed Float64 - Time in seconds since the merge started. -progress Float64 - Percent of progress made, from 0 to 1. -num_parts UInt64 - Number of parts to merge. -result_part_name String - Name of the part that will be formed as the result of the merge. -total_size_bytes_compressed UInt64 - Total size of compressed data in the parts being merged. 
-total_size_marks UInt64 - Total number of marks in the parts being merged. -bytes_read_uncompressed UInt64 - Amount of bytes read, decompressed. -rows_read UInt64 - Number of rows read. -bytes_written_uncompressed UInt64 - Amount of bytes written, uncompressed. -rows_written UInt64 - Number of rows written. -%% - -==system.parts== - -Contains information about parts of a table in the MergeTree family. -Columns: - -%% -database String - Name of the database where the table that this part belongs to is located. -table String - Name of the table that this part belongs to. -engine String - Name of the table engine, without parameters. -partition String - Name of the partition, in the format YYYYMM. -name String - Name of the part. -replicated UInt8 - Whether the part belongs to replicated data. -active UInt8 - Whether the part is used in a table, or is no longer needed and will be deleted soon. Inactive parts remain after merging. -marks UInt64 - Number of marks - multiply by the index granularity (usually 8192) to get the approximate number of rows in the part. -bytes UInt64 - Number of bytes when compressed. -modification_time DateTime - Time the directory with the part was modified. Usually corresponds to the part's creation time. -remove_time DateTime - For inactive parts only - the time when the part became inactive. -refcount UInt32 - The number of places where the part is used. A value greater than 2 indicates that this part participates in queries or merges. -%% - -==system.replicas== - -Contains information and status for replicated tables residing on the local server. This table can be used for monitoring. The table contains a row for every Replicated* table. - -Example: - -%% -SELECT * -FROM system.replicas -WHERE table = 'visits' -FORMAT Vertical - -Row 1: -────── -database: merge -table: visits -engine: ReplicatedCollapsingMergeTree -is_leader: 1 -is_readonly: 0 -is_session_expired: 0 -future_parts: 1 -parts_to_check: 0 -zookeeper_path: /clickhouse/tables/01-06/visits -replica_name: example01-06-1.yandex.ru -replica_path: /clickhouse/tables/01-06/visits/replicas/example01-06-1.yandex.ru -columns_version: 9 -queue_size: 1 -inserts_in_queue: 0 -merges_in_queue: 1 -log_max_index: 596273 -log_pointer: 596274 -total_replicas: 2 -active_replicas: 2 -%% - -Columns: - -%% -database: Database name. -table: Table name. -engine: Table engine name. - -is_leader: Whether the replica is the leader. -Only one replica can be the leader at a time. The leader is responsible for selecting background merges to perform. -Note that writes can be performed to any replica that is available and has a session in ZK, regardless of whether it is a leader. - -is_readonly: Whether the replica is in read-only mode. -This mode is turned on if the config doesn't have sections with ZK, if an unknown error occurred when reinitializing sessions in ZK, and during session reinitialization in ZK. - -is_session_expired: Whether the session with ZK has expired. -Basically the same as 'is_readonly'. - -future_parts: The number of data parts that will appear as the result of INSERTs or merges that haven't been done yet. - -parts_to_check: The number of data parts in the queue for verification. -A part is put in the verification queue if there is suspicion that it might be damaged. - -zookeeper_path: Path to table data in ZK. -replica_name: Replica name in ZK. Different replicas of the same table have different names. -replica_path: Path to replica data in ZK. 
The same as concatenating 'zookeeper_path/replicas/replica_path'. - -columns_version: Version number of the table structure. Indicates how many times ALTER was performed. If replicas have different versions, it means some replicas haven't made all of the ALTERs yet. - -queue_size: Size of the queue for operations waiting to be performed. Operations include inserting blocks of data, merges, and certain other actions. It usually coincides with 'future_parts'. - -inserts_in_queue: Number of inserts of blocks of data that need to be made. Insertions are usually replicated fairly quickly. If this number is large, it means something is wrong. - -merges_in_queue: The number of merges waiting to be made. Sometimes merges are lengthy, so this value may be greater than one for a long time. - -The next 4 columns have a non-zero value only where there is an active session with ZK. - -log_max_index: Maximum entry number in the log of general activity. -log_pointer: Maximum entry number from the log of general activity that the replica copied to its queue for execution, plus one. -If log_pointer is much smaller than log_max_index, something is wrong. - -total_replicas: The total number of known replicas of this table. -active_replicas: The number of replicas of this table that have a session in ZK (i.e., the number of functioning replicas). -%% - -If you request all the columns, the table may work a bit slowly, since several reads from ZK are made for each row. -If you don't request the last 4 columns (log_max_index, log_pointer, total_replicas, active_replicas), the table works quickly. - -For example, you can check that everything is working correctly like this: - -%% -SELECT - database, - table, - is_leader, - is_readonly, - is_session_expired, - future_parts, - parts_to_check, - columns_version, - queue_size, - inserts_in_queue, - merges_in_queue, - log_max_index, - log_pointer, - total_replicas, - active_replicas -FROM system.replicas -WHERE - is_readonly - OR is_session_expired - OR future_parts > 20 - OR parts_to_check > 10 - OR queue_size > 20 - OR inserts_in_queue > 10 - OR log_max_index - log_pointer > 10 - OR total_replicas < 2 - OR active_replicas < total_replicas -%% - -If this query doesn't return anything, it means that everything is fine. - -==system.settings== - -Contains information about settings that are currently in use (i.e. used for executing the query you are using to read from the system.settings table). - -Columns: - -%% -name String - Setting name. -value String - Setting value. -changed UInt8 - Whether the setting was explicitly defined in the config or explicitly changed. -%% - -Example: - -%% -SELECT * -FROM system.settings -WHERE changed - -┌─name───────────────────┬─value───────┬─changed─┐ -│ max_threads │ 8 │ 1 │ -│ use_uncompressed_cache │ 0 │ 1 │ -│ load_balancing │ random │ 1 │ -│ max_memory_usage │ 10000000000 │ 1 │ -└────────────────────────┴─────────────┴─────────┘ -%% - - -==system.zookeeper== - -Allows reading data from the ZooKeeper cluster defined in the config. -The query must have a 'path' equality condition in the WHERE clause. This is the path in ZooKeeper for the children that you want to get data for. - -Query SELECT * FROM system.zookeeper WHERE path = '/clickhouse' outputs data for all children on the /clickhouse node. -To output data for all root nodes, write path = '/'. -If the path specified in 'path' doesn't exist, an exception will be thrown. - -Columns: - -%% -name String - Name of the node. -path String - Path to the node. 
-value String - Value of the node. -dataLength Int32 - Size of the value. -numChildren Int32 - Number of children. -czxid Int64 - ID of the transaction that created the node. -mzxid Int64 - ID of the transaction that last changed the node. -pzxid Int64 - ID of the transaction that last added or removed children. -ctime DateTime - Time of node creation. -mtime DateTime - Time of the last node modification. -version Int32 - Node version - the number of times the node was changed. -cversion Int32 - Number of added or removed children. -aversion Int32 - Number of changes to ACL. -ephemeralOwner Int64 - For ephemeral nodes, the ID of the session that owns this node. -%% - -Example: - -%% -SELECT * -FROM system.zookeeper -WHERE path = '/clickhouse/tables/01-08/visits/replicas' -FORMAT Vertical - -Row 1: -────── -name: example01-08-1.yandex.ru -value: -czxid: 932998691229 -mzxid: 932998691229 -ctime: 2015-03-27 16:49:51 -mtime: 2015-03-27 16:49:51 -version: 0 -cversion: 47 -aversion: 0 -ephemeralOwner: 0 -dataLength: 0 -numChildren: 7 -pzxid: 987021031383 -path: /clickhouse/tables/01-08/visits/replicas - -Row 2: -────── -name: example01-08-2.yandex.ru -value: -czxid: 933002738135 -mzxid: 933002738135 -ctime: 2015-03-27 16:57:01 -mtime: 2015-03-27 16:57:01 -version: 0 -cversion: 37 -aversion: 0 -ephemeralOwner: 0 -dataLength: 0 -numChildren: 7 -pzxid: 987021252247 -path: /clickhouse/tables/01-08/visits/replicas -%% - - - -
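-As a small usage sketch of the tables described above (only the documented columns are used; the LIMIT is arbitrary), the approximate compressed size of each table can be read from system.parts:
-
-%%
-SELECT database, table, sum(bytes) AS size_compressed
-FROM system.parts
-WHERE active
-GROUP BY database, table
-ORDER BY size_compressed DESC
-LIMIT 10
-%%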
-
-Table functions
-
    - -Table functions can be specified in the FROM clause instead of the database and table names. -Table functions can only be used if 'readonly' is not set. -Table functions aren't related to other functions. - -==merge== - -%%merge(db_name, 'tables_regexp')%% creates a temporary Merge table. For more information, see the section "Table engines, Merge". -The table structure is taken from the first table encountered that matches the regular expression. - -==remote== - -%%remote('addresses_expr', db, table[, 'user'[, 'password']])%% -or %%remote('addresses_expr', db.table[, 'user'[, 'password']])%% -- Allows accessing a remote server without creating a Distributed table. - -%%addresses_expr%% - An expression that generates addresses of remote servers. - -This may be just one server address. The server address is host:port, or just the host. The host can be specified as the server name, or as the IPv4 or IPv6 address. An IPv6 address is specified in square brackets. The port is the TCP port on the remote server. If the port is omitted, it uses %%tcp_port%% from the server's config file (by default, 9000). - -Note: As an exception, when specifying an IPv6 address, the port is required. - -Examples: -%% -example01-01-1 -example01-01-1:9000 -localhost -127.0.0.1 -[::]:9000 -[2a02:6b8:0:1111::11]:9000%% - -Multiple addresses can be comma-separated. In this case, the query goes to all the specified addresses (like to shards with different data) and uses distributed processing. - -Example: -%%example01-01-1,example01-02-1%% - -Part of the expression can be specified in curly brackets. The previous example can be written as follows: -%%example01-0{1,2}-1%% - -Curly brackets can contain a range of numbers separated by two dots (non-negative integers). In this case, the range is expanded to a set of values that generate shard addresses. If the first number starts with zero, the values are formed with the same zero alignment. -The previous example can be written as follows: -%%example01-{01..02}-1%% - -If you have multiple pairs of curly brackets, it generates the direct product of the corresponding sets. - -Addresses and fragments in curly brackets can be separated by the pipe (|) symbol. In this case, the corresponding sets of addresses are interpreted as replicas, and the query will be sent to the first healthy replica. The replicas are evaluated in the order currently set in the 'load_balancing' setting. -Example: - -%%example01-{01..02}-{1|2}%% - -This example specifies two shards that each have two replicas. - -The number of addresses generated is limited by a constant. Right now this is 1000 addresses. - -Using the 'remote' table function is less optimal than creating a Distributed table, because in this case, the server connection is re-established for every request. In addition, if host names are set, the names are resolved, and errors are not counted when working with various replicas. When processing a large number of queries, always create the Distributed table ahead of time, and don't use the 'remote' table function. - -The 'remote' table function can be useful in the following cases: -- Accessing a specific server for data comparison, debugging, and testing. -- Queries between various ClickHouse clusters for research purposes. -- Infrequent distributed requests that are made manually. -- Distributed requests where the set of servers is re-defined each time. - -The username can be omitted. In this case, the 'default' username is used. -The password can be omitted. 
In this case, an empty password is used. - -
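-A brief, hedged example of the 'remote' table function (the host pattern, database, and table are illustrative): the query below is sent to two shards, each with two replicas, and the partial results are merged on the requestor server.
-
-%%
-SELECT count() FROM remote('example01-0{1,2}-{1|2}', merge, hits)
-%%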
-
-Formats
-
    - -The format determines how data is given (written by server as output) to you after SELECTs, and how it is accepted (read by server as input) for INSERTs. - - - -==Native== - -The most efficient format. Data is written and read by blocks in binary format. For each block, the number of rows, number of columns, column names and types, and parts of columns in this block are recorded one after another. In other words, this format is "columnar" - it doesn't convert columns to rows. This is the format used in the native interface for interaction between servers, for using the command-line client, and for C++ clients. - -You can use this format to quickly generate dumps that can only be read by the ClickHouse DBMS. It doesn't make sense to work with this format yourself. - - -==TabSeparated== - -In TabSeparated format, data is written by row. Each row contains values separated by tabs. Each value is follow by a tab, except the last value in the row, which is followed by a line break. Strictly Unix line breaks are assumed everywhere. The last row also must contain a line break at the end. Values are written in text format, without enclosing quotation marks, and with special characters escaped. - -Numbers are written in decimal form. Numbers may contain an extra "+" symbol at the beginning (but it is not recorded during an output). Non-negative numbers can't contain the negative sign. When parsing, it is allowed to parse an empty string as a zero, or (for signed types) a string consisting of just a minus sign as a zero. Numbers that do not fit into the corresponding data type may be parsed as a different number, without an error message. - -Floating-point numbers are formatted in decimal form. The dot is used as the decimal separator. Exponential entries are supported, as are 'inf', '+inf', '-inf', and 'nan'. An entry of floating-point numbers may begin or end with a decimal point. -During formatting, accuracy may be lost on floating-point numbers. -During parsing, a result is not necessarily the nearest machine-representable number. - -Dates are formatted in YYYY-MM-DD format and parsed in the same format, but with any characters as separators. -DateTimes are formatted in the format YYYY-MM-DD hh:mm:ss and parsed in the same format, but with any characters as separators. -This all occurs in the system time zone at the time the client or server starts (depending on which one formats data). For DateTimes, daylight saving time is not specified. So if a dump has times during daylight saving time, the dump does not unequivocally match the data, and parsing will select one of the two times. -During a parsing operation, incorrect dates and dates with times can be parsed with natural overflow or as null dates and times, without an error message. - -As an exception, parsing DateTime is also supported in Unix timestamp format, if it consists of exactly 10 decimal digits. The result is not time zone-dependent. The formats YYYY-MM-DD hh:mm:ss and NNNNNNNNNN are differentiated automatically. - -Strings are parsed and formatted with backslash-escaped special characters. The following escape sequences are used while formatting: %%\b%%, %%\f%%, %%\r,%% %%\n%%, %%\t%%, %%\0%%, %%\'%%, and %%\\%%. For parsing, also supported %%\a%%, %%\v%% and \xHH (hex escape sequence) and any sequences of the type \c where c is any character (these sequences are converted to c). This means that parsing supports formats where a line break can be written as %%\n%% or as %%\%% and a line break. 
For example, the string 'Hello world' with a line break between the words instead of a space can be retrieved in any of the following variations: - -%%Hello\nworld%% - -%%Hello\ -world%% - -The second variant is supported because MySQL uses it when writing tab-separated dumps. - -Only a small set of symbols are escaped. You can easily stumble onto a string value that your terminal will ruin in output. - -Minimum set of symbols that you must escape in TabSeparated format is tab, newline (LF) and backslash. - -Arrays are formatted as a list of comma-separated values in square brackets. Number items in the array are formatted as normally, but dates, dates with times, and strings are formatted in single quotes with the same escaping rules as above. - -The TabSeparated format is convenient for processing data using custom programs and scripts. It is used by default in the HTTP interface, and in the command-line client's batch mode. This format also allows transferring data between different DBMSs. For example, you can get a dump from MySQL and upload it to ClickHouse, or vice versa. - -The TabSeparated format supports outputting total values (when using WITH TOTALS) and extreme values (when 'extremes' is set to 1). In these cases, the total values and extremes are output after the main data. The main result, total values, and extremes are separated from each other by an empty line. Example: - -%%SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT TabSeparated%% - -%% -2014-03-17 1406958 -2014-03-18 1383658 -2014-03-19 1405797 -2014-03-20 1353623 -2014-03-21 1245779 -2014-03-22 1031592 -2014-03-23 1046491 - -0000-00-00 8873898 - -2014-03-17 1031592 -2014-03-23 1406958 -%% - -It's also available as %%TSV%%. - - -==TabSeparatedWithNames== - -Differs from the TabSeparated format in that the column names are output in the first row. -For parsing, the first row is completely ignored. You can't use column names to determine their position or to check their correctness. -(Support for using header while parsing could be added in future.) - -It's also available as %%TSVWithNames%%. - - -==TabSeparatedWithNamesAndTypes== - -Differs from the TabSeparated format in that the column names are output to the first row, while the column types are in the second row. -For parsing, the first and second rows are completely ignored. - -It's also available as %%TSVWithNamesAndTypes%%. - -==TabSeparatedRaw== - -Differs from the TabSeparated format in that the rows are formatted without escaping. -This format is only appropriate for outputting a query result, but not for parsing data to insert into a table. - -It's also available as %%TSVRaw%%. - - -==BlockTabSeparated== - -Data is not written by row, but by column and block. -Each block consists of parts of columns, each of which is written on a separate line. -The values are tab-separated. The last value in a column part is followed by a line break instead of a tab. -Blocks are separated by a double line break. -The rest of the rules are the same as in the TabSeparated format. -This format is only appropriate for outputting a query result, not for parsing. - - -==CSV== - -Comma separated values (RFC). - -String values are output in double quotes. Double quote inside a string is output as two consecutive double quotes. That's all escaping rules. Date and DateTime values are output in double quotes. Numbers are output without quotes. Fields are delimited by commas. Rows are delimited by unix newlines (LF). 
Arrays are output in following way: first, array are serialized to String (as in TabSeparated or Values formats), and then the String value are output in double quotes. Tuples are narrowed and serialized as separate columns. - -During parsing, values could be enclosed or not enclosed in quotes. Supported both single and double quotes. In particular, Strings could be represented without quotes - in that case, they are parsed up to comma or newline (CR or LF). Contrary to RFC, in case of parsing strings without quotes, leading and trailing spaces and tabs are ignored. As line delimiter, both Unix (LF), Windows (CR LF) or Mac OS Classic (LF CR) variants are supported. - -CSV format supports output of totals and extremes similar to TabSeparated format. - - -==CSVWithNames== - -Also contains header, similar to TabSeparatedWithNames. - - -==RowBinary== - -Writes data by row in binary format. Rows and values are listed consecutively, without separators. -This format is less efficient than the Native format, since it is row-based. - -Numbers is written in little endian, fixed width. For example, UInt64 takes 8 bytes. -DateTime is written as UInt32 with unix timestamp value. -Date is written as UInt16 with number of days since 1970-01-01 in value. -String is written as length in varint (unsigned LEB128) format and then bytes of string. -FixedString is written as just its bytes. -Array is written as length in varint (unsigned LEB128) format and then all elements, contiguously. - - -==Pretty== - -Writes data as Unicode-art tables, also using ANSI-escape sequences for setting colors in the terminal. -A full grid of the table is drawn, and each row occupies two lines in the terminal. Each result block is output as a separate table. This is necessary so that blocks can be output without buffering results (buffering would be necessary in order to pre-calculate the visible width of all the values). -To avoid dumping too much data to the terminal, only the first 10,000 rows are printed. If the number of rows is greater than or equal to 10,000, the message "Showed first 10,000" is printed. -This format is only appropriate for outputting a query result, not for parsing. - -The Pretty format supports outputting total values (when using WITH TOTALS) and extremes (when 'extremes' is set to 1). In these cases, total values and extreme values are output after the main data, in separate tables. Example (shown for the PrettyCompact format): - -%%SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT PrettyCompact%% - -%% -┌──EventDate─┬───────c─┐ -│ 2014-03-17 │ 1406958 │ -│ 2014-03-18 │ 1383658 │ -│ 2014-03-19 │ 1405797 │ -│ 2014-03-20 │ 1353623 │ -│ 2014-03-21 │ 1245779 │ -│ 2014-03-22 │ 1031592 │ -│ 2014-03-23 │ 1046491 │ -└────────────┴─────────┘ - -Totals: -┌──EventDate─┬───────c─┐ -│ 0000-00-00 │ 8873898 │ -└────────────┴─────────┘ - -Extremes: -┌──EventDate─┬───────c─┐ -│ 2014-03-17 │ 1031592 │ -│ 2014-03-23 │ 1406958 │ -└────────────┴─────────┘ -%% - -==PrettyCompact== - -Differs from Pretty in that the grid is drawn between rows and the result is more compact. This format is used by default in the command-line client in interactive mode. - - -==PrettyCompactMonoBlock== - -Differs from PrettyCompact in that up to 10,000 rows are buffered, then output as a single table, not by blocks. - - -==PrettySpace== - -Differs from PrettyCompact in that whitespace (space characters) is used instead of the grid. 
- - -==PrettyNoEscapes== - -Differs from Pretty in that ANSI-escape sequences aren't used. This is necessary for displaying this format in a browser, as well as for using the 'watch' command-line utility. -Example: - -%%watch -n1 "clickhouse-client --query='SELECT * FROM system.events FORMAT PrettyCompactNoEscapes'"%% - -You can use the HTTP interface for displaying in the browser. - - -==PrettyCompactNoEscapes== - -The same. - - -==PrettySpaceNoEscapes== - -The same. - - -==Vertical== - -Prints each value on a separate line with the column name specified. This format is convenient for printing just one or a few rows, if each row consists of a large number of columns. -This format is only appropriate for outputting a query result, not for parsing. - - -==Values== - -Prints every row in parentheses. Rows are separated by commas. There is no comma after the last row. The values inside the parentheses are also comma-separated. Numbers are output in decimal format without quotes. Arrays are output in square brackets. Strings, dates, and dates with times are output in quotes. Escaping rules and parsing are same as in the TabSeparated format. During formatting, extra spaces aren't inserted, but during parsing, they are allowed and skipped (except for spaces inside array values, which are not allowed). - -Minimum set of symbols that you must escape in Values format is single quote and backslash. - -This is the format that is used in INSERT INTO t VALUES ... -But you can also use it for query result. - - -==JSON== - -Outputs data in JSON format. Besides data tables, it also outputs column names and types, along with some additional information - the total number of output rows, and the number of rows that could have been output if there weren't a LIMIT. Example: - -%%SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTALS ORDER BY c DESC LIMIT 5 FORMAT JSON%% - -%% -{ - "meta": - [ - { - "name": "SearchPhrase", - "type": "String" - }, - { - "name": "c", - "type": "UInt64" - } - ], - - "data": - [ - { - "SearchPhrase": "", - "c": "8267016" - }, - { - "SearchPhrase": "bath interiors", - "c": "2166" - }, - { - "SearchPhrase": "yandex", - "c": "1655" - }, - { - "SearchPhrase": "spring 2014 fashion", - "c": "1549" - }, - { - "SearchPhrase": "freeform photo", - "c": "1480" - } - ], - - "totals": - { - "SearchPhrase": "", - "c": "8873898" - }, - - "extremes": - { - "min": - { - "SearchPhrase": "", - "c": "1480" - }, - "max": - { - "SearchPhrase": "", - "c": "8267016" - } - }, - - "rows": 5, - - "rows_before_limit_at_least": 141137 -} -%% - -JSON is compatible with JavaScript. For this purpose, certain symbols are additionally escaped: the forward slash %%/%% is escaped as %%\/%%; alternative line breaks %%U+2028%% and %%U+2029%%, which don't work in some browsers, are escaped as \uXXXX-sequences. ASCII control characters are escaped: backspace, form feed, line feed, carriage return, and horizontal tab as %%\b%%, %%\f%%, %%\n%%, %%\r%%, and %%\t%% respectively, along with the rest of the bytes from the range 00-1F using \uXXXX-sequences. Invalid UTF-8 sequences are changed to the replacement character %%�%% and, thus, the output text will consist of valid UTF-8 sequences. UInt64 and Int64 numbers are output in double quotes for compatibility with JavaScript. - -%%rows%% - The total number of output rows. -%%rows_before_limit_at_least%% - The minimal number of rows there would have been without a LIMIT. Output only if the query contains LIMIT. 
-If the query contains GROUP BY, %%rows_before_limit_at_least%% is the exact number of rows there would have been without a LIMIT. - -%%totals%% - Total values (when using %%WITH TOTALS%%). -%%extremes%% - Extreme values (when %%extremes%% is set to 1). - -This format is only appropriate for outputting a query result, not for parsing. -See JSONEachRow format for INSERT queries. - - -==JSONCompact== - -Differs from JSON only in that data rows are output in arrays, not in objects. Example: - -%% -{ - "meta": - [ - { - "name": "SearchPhrase", - "type": "String" - }, - { - "name": "c", - "type": "UInt64" - } - ], - - "data": - [ - ["", "8267016"], - ["bath interiors", "2166"], - ["yandex", "1655"], - ["spring 2014 fashion", "1549"], - ["freeform photo", "1480"] - ], - - "totals": ["","8873898"], - - "extremes": - { - "min": ["","1480"], - "max": ["","8267016"] - }, - - "rows": 5, - - "rows_before_limit_at_least": 141137 -} -%% - -This format is only appropriate for outputting a query result, not for parsing. -See JSONEachRow format for INSERT queries. - - -==JSONEachRow== - -If put in SELECT query, displays data in newline delimited JSON (JSON objects separated by \\n character) format. -If put in INSERT query, expects this kind of data as input. - -%% -{"SearchPhrase":"","count()":"8267016"} -{"SearchPhrase":"bathroom interior","count()":"2166"} -{"SearchPhrase":"yandex","count()":"1655"} -{"SearchPhrase":"spring 2014 fashion","count()":"1549"} -{"SearchPhrase":"free-form photo","count()":"1480"} -{"SearchPhrase":"Angelina Jolie","count()":"1245"} -{"SearchPhrase":"omsk","count()":"1112"} -{"SearchPhrase":"photos of dog breeds","count()":"1091"} -{"SearchPhrase":"curtain design","count()":"1064"} -{"SearchPhrase":"baku","count()":"1000"} -%% - -Unlike JSON format, there are no replacements of invalid UTF-8 sequences. There can be arbitrary amount of bytes in a line. -This is done in order to avoid data loss during formatting. Values are displayed analogous to JSON format. - -In INSERT queries JSON data can be supplied with arbitrary order of columns (JSON key-value pairs). It is also possible to omit values in which case the default value of the column is inserted. N.B. when using JSONEachRow format, complex default values are not supported, so when omitting a column its value will be zeros or empty string depending on its type. - -Space characters between JSON objects are skipped. Between objects there can be a comma which is ignored. Newline character is not a mandatory separator for objects. - - -==TSKV== - -Similar to TabSeparated, but displays data in %%name=value%% format. Names are displayed just as in TabSeparated. Additionally, a %%=%% symbol is displayed. - -%% -SearchPhrase= count()=8267016 -SearchPhrase=bathroom interior count()=2166 -SearchPhrase=yandex count()=1655 -SearchPhrase=spring 2014 fashion count()=1549 -SearchPhrase=free-form photo count()=1480 -SearchPhrase=Angelina Jolie count()=1245 -SearchPhrase=omsk count()=1112 -SearchPhrase=photos of dog breeds count()=1091 -SearchPhrase=curtain design count()=1064 -SearchPhrase=baku count()=1000 -%% - -In case of many small columns this format is obviously not effective and there usually is no reason to use it. This format is supported because it is used for some cases in Yandex. - -Format is supported both for input and output. In INSERT queries data can be supplied with arbitrary order of columns. It is also possible to omit values in which case the default value of the column is inserted. N.B. 
when using TSKV format, complex default values are not supported, so when omitting a column its value will be zeros or empty string depending on its type. - - -==XML== - -XML format is supported only for displaying data, not for INSERTS. Example: - -%% -<?xml version='1.0' encoding='UTF-8' ?> -<result> - <meta> - <columns> - <column> - <name>SearchPhrase</name> - <type>String</type> - </column> - <column> - <name>count()</name> - <type>UInt64</type> - </column> - </columns> - </meta> - <data> - <row> - <SearchPhrase></SearchPhrase> - <field>8267016</field> - </row> - <row> - <SearchPhrase>bathroom interior</SearchPhrase> - <field>2166</field> - </row> - <row> - <SearchPhrase>yandex> - <field>1655</field> - </row> - <row> - <SearchPhrase>spring 2014 fashion</SearchPhrase> - <field>1549</field> - </row> - <row> - <SearchPhrase>free-form photo</SearchPhrase> - <field>1480</field> - </row> - <row> - <SearchPhrase>Angelina Jolie</SearchPhrase> - <field>1245</field> - </row> - <row> - <SearchPhrase>omsk</SearchPhrase> - <field>1112</field> - </row> - <row> - <SearchPhrase>photos of dog breeds</SearchPhrase> - <field>1091</field> - </row> - <row> - <SearchPhrase>curtain design</SearchPhrase> - <field>1064</field> - </row> - <row> - <SearchPhrase>baku</SearchPhrase> - <field>1000</field> - </row> - </data> - <rows>10</rows> - <rows_before_limit_at_least>141137</rows_before_limit_at_least> -</result> -%% - -If name of a column contains some unacceptable character, field is used as a name. In other aspects XML uses JSON structure. -As in case of JSON, invalid UTF-8 sequences are replaced by replacement character � so displayed text will only contain valid UTF-8 sequences. - -In string values %%<%% and %%&%% are displayed as %%&lt;%% and %%&amp;%%. - -Arrays are displayed as %%<array><elem>Hello</elem><elem>World</elem>...</array>%%, -and tuples as %%<tuple><elem>Hello</elem><elem>World</elem>...</tuple>%%. - - -==Null== - -Nothing is output. However, the query is processed, and when using the command-line client, data is transmitted to the client. This is used for tests, including productivity testing. Obviously, this format is only appropriate for outputting a query result, not for parsing. - - -
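-A short, hedged usage sketch to tie the formats together (the table test.hits_copy and the file data.tsv are hypothetical): the format is specified per query, both when loading data and when reading results.
-
-%%
-clickhouse-client --query="INSERT INTO test.hits_copy FORMAT TabSeparated" < data.tsv
-clickhouse-client --query="SELECT SearchPhrase, count() FROM test.hits_copy GROUP BY SearchPhrase FORMAT JSONEachRow"
-%%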
-
-Data types
-
    - -==UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64== - -Fixed-length integers, with or without a sign. - - -==Float32, Float64== - -Floating-point numbers are just like 'float' and 'double' in the C language. -In contrast to standard SQL, floating-point numbers support 'inf', '-inf', and even 'nan's. -See the notes on sorting nans in "ORDER BY clause". -We do not recommend storing floating-point numbers in tables. - - -==String== - -Strings of an arbitrary length. The length is not limited. The value can contain an arbitrary set of bytes, including null bytes. -The String type replaces the types VARCHAR, BLOB, CLOB, and others from other DBMSs. - -===Encodings=== - -ClickHouse doesn't have the concept of encodings. Strings can contain an arbitrary set of bytes, which are stored and output as-is. -If you need to store texts, we recommend using UTF-8 encoding. At the very least, if your terminal uses UTF-8 (as recommended), you can read and write your values without making conversions. -Similarly, certain functions for working with strings have separate variations that work under the assumption that the string contains a set of bytes representing a UTF-8 encoded text. -For example, the 'length' function calculates the string length in bytes, while the 'lengthUTF8' function calculates the string length in Unicode code points, assuming that the value is UTF-8 encoded. - - -==FixedString(N)== - -A fixed-length string of N bytes (not characters or code points). N must be a strictly positive natural number. -When server reads a string (as an input passed in INSERT query, for example) that contains fewer bytes, the string is padded to N bytes by appending null bytes at the right. -When server reads a string that contains more bytes, an error message is returned. -When server writes a string (as an output of SELECT query, for example), null bytes are not trimmed off of the end of the string, but are output. -Note that this behavior differs from MySQL behavior for the CHAR type (where strings are padded with spaces, and the spaces are removed for output). - -Fewer functions can work with the FixedString(N) type than with String, so it is less convenient to use. - - -==Date== - -A date. Stored in two bytes as the number of days since 1970-01-01 (unsigned). Allows storing values from just after the beginning of the Unix Epoch to the upper threshold defined by a constant at the compilation stage (currently, this is until the year 2038, but it may be expanded to 2106). -The minimum value is output as 0000-00-00. - -The date is stored without the time zone. - - -==DateTime== - -Date with time. Stored in four bytes as a Unix timestamp (unsigned). Allows storing values in the same range as for the Date type. The minimal value is output as 0000-00-00 00:00:00. The time is stored with accuracy up to one second (without leap seconds). - -===Time zones=== - -The date with time is converted from text (divided into component parts) to binary and back, using the system's time zone at the time the client or server starts. In text format, information about daylight savings is lost. - -Note that by default the client adopts the server time zone at the beginning of the session. You can change this behaviour with the --use_client_time_zone command line switch. - -Supports only those time zones that never had the time differ from UTC for a partial number of hours (without leap seconds) over the entire time range you will be working with. 
- -So when working with a textual date (for example, when saving text dumps), keep in mind that there may be ambiguity during changes for daylight savings time, and there may be problems matching data if the time zone changed. - -==Enum== - -Enum8 or Enum16. A set of enumerated string values that are stored as Int8 or Int16. Example: - -%%Enum8('hello' = 1, 'world' = 2)%% -- This data type has two possible values - 'hello' and 'world'. - -The numeric values must be within -128..127 for %%Enum8%% and -32768..32767 for %%Enum16%%. Every member of the enum must also have different numbers. The empty string is a valid value. The numbers do not need to be sequential and can be in any order. The order does not matter. - -In memory, the data is stored in the same way as the numeric types %%Int8%% and %%Int16%%. -When reading in text format, the string is read and the corresponding numeric value is looked up. An exception will be thrown if it is not found. -When writing in text format, the stored number is looked up and the corresponding string is written out. An exception will be thrown if the number does not correspond to a known value. -In binary format, the information is saved in the same way as %%Int8%% and %%Int16%%. -The implicit default value for an Enum is the value having the smallest numeric value. - -In %%ORDER BY%%, %%GROUP BY%%, %%IN%%, %%DISTINCT%%, etc. Enums behave like the numeric value. e.g. they will be sorted by the numeric value in an %%ORDER BY%%. Equality and comparison operators behave like they do on the underlying numeric value. - -Enum values cannot be compared to numbers, they must be compared to a string. If the string compared to is not a valid value for the Enum, an exception will be thrown. The %%IN%% operator is supported with the Enum on the left hand side and a set of strings on the right hand side. - -Most numeric and string operations are not defined for Enum values, e.g. adding a number to an Enum or concatenating a string to an Enum. However, the %%toString%% function can be used to convert the Enum to its string value. Enum values are also convertible to numeric types using the %%toT%% function where T is a numeric type. When T corresponds to the enum's underlying numeric type, this conversion is zero-cost. - -It is possible to add new members to the Enum using ALTER. If the only change is to the set of values, the operation will be almost instant. It is also possible to remove members of the Enum using ALTER. Removing members is only safe if the removed value has never been used in the table. As a safeguard, changing the numeric value of a previously defined Enum member will throw an exception. - -Using ALTER, it is possible to change an %%Enum8%% to an %%Enum16%% or vice versa - just like changing an %%Int8%% to %%Int16%%. - - -==Array(T)== - -Array of T-type items. The T type can be any type, including an array. -We don't recommend using multidimensional arrays, because they are not well supported (for example, you can't store multidimensional arrays in tables with engines from MergeTree family). - - -==Tuple(T1, T2, ...)== - -Tuples can't be written to tables (other than Memory tables). They are used for temporary column grouping. Columns can be grouped when an IN expression is used in a query, and for specifying certain formal parameters of lambda functions. For more information, see "IN operators" and "Higher order functions". - -Tuples can be output as the result of running a query. 
In this case, for text formats other than JSON*, values are comma-separated in brackets. In JSON* formats, tuples are output as arrays (in square brackets). - - -==Nested data structures== - -==Nested(Name1 Type1, Name2 Type2, ...)== - -A nested data structure is like a nested table. The parameters of a nested data structure - the column names and types - are specified the same way as in a CREATE query. Each table row can correspond to any number of rows in a nested data structure. - -Example: - -%% -CREATE TABLE test.visits -( - CounterID UInt32, - StartDate Date, - Sign Int8, - IsNew UInt8, - VisitID UInt64, - UserID UInt64, - ... - Goals Nested - ( - ID UInt32, - Serial UInt32, - EventTime DateTime, - Price Int64, - OrderID String, - CurrencyID UInt32 - ), - ... -) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign) -%% - -This example declares the 'Goals' nested data structure, which contains data about conversions (goals reached). Each row in the 'visits' table can correspond to zero or any number of conversions. - -Only a single nesting level is supported. Nested structure columns with array type are equivalent to multidimensional arrays and thus their support is limited (storing such columns in tables with engines from MergeTree family is not supported). - -In most cases, when working with a nested data structure, its individual columns are specified. To do this, the column names are separated by a dot. These columns make up an array of matching types. All the column arrays of a single nested data structure have the same length. - -Example: - -%% -SELECT - Goals.ID, - Goals.EventTime -FROM test.visits -WHERE CounterID = 101500 AND length(Goals.ID) < 5 -LIMIT 10 - -┌─Goals.ID───────────────────────┬─Goals.EventTime───────────────────────────────────────────────────────────────────────────┐ -│ [1073752,591325,591325] │ ['2014-03-17 16:38:10','2014-03-17 16:38:48','2014-03-17 16:42:27'] │ -│ [1073752] │ ['2014-03-17 00:28:25'] │ -│ [1073752] │ ['2014-03-17 10:46:20'] │ -│ [1073752,591325,591325,591325] │ ['2014-03-17 13:59:20','2014-03-17 22:17:55','2014-03-17 22:18:07','2014-03-17 22:18:51'] │ -│ [] │ [] │ -│ [1073752,591325,591325] │ ['2014-03-17 11:37:06','2014-03-17 14:07:47','2014-03-17 14:36:21'] │ -│ [] │ [] │ -│ [] │ [] │ -│ [591325,1073752] │ ['2014-03-17 00:46:05','2014-03-17 00:46:05'] │ -│ [1073752,591325,591325,591325] │ ['2014-03-17 13:28:33','2014-03-17 13:30:26','2014-03-17 18:51:21','2014-03-17 18:51:45'] │ -└────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────┘ -%% - -It is easiest to think of a nested data structure as a set of multiple column arrays of the same length. - -The only place where a SELECT query can specify the name of an entire nested data structure instead of individual columns is the ARRAY JOIN clause. For more information, see "ARRAY JOIN clause". 
Example: - -%% -SELECT - Goal.ID, - Goal.EventTime -FROM test.visits -ARRAY JOIN Goals AS Goal -WHERE CounterID = 101500 AND length(Goals.ID) < 5 -LIMIT 10 - -┌─Goal.ID─┬──────Goal.EventTime─┐ -│ 1073752 │ 2014-03-17 16:38:10 │ -│ 591325 │ 2014-03-17 16:38:48 │ -│ 591325 │ 2014-03-17 16:42:27 │ -│ 1073752 │ 2014-03-17 00:28:25 │ -│ 1073752 │ 2014-03-17 10:46:20 │ -│ 1073752 │ 2014-03-17 13:59:20 │ -│ 591325 │ 2014-03-17 22:17:55 │ -│ 591325 │ 2014-03-17 22:18:07 │ -│ 591325 │ 2014-03-17 22:18:51 │ -│ 1073752 │ 2014-03-17 11:37:06 │ -└─────────┴─────────────────────┘ -%% - -You can't perform SELECT for an entire nested data structure. You can only explicitly list individual columns that are part of it. - -For an INSERT query, you should pass all the component column arrays of a nested data structure separately (as if they were individual column arrays). During insertion, the system checks that they have the same length. - -For a DESCRIBE query, the columns in a nested data structure are listed separately in the same way. - -The ALTER query is very limited for elements in a nested data structure. - - -==AggregateFunction(name, types_of_arguments...)== - -The intermediate state of an aggregate function. To get it, use aggregate functions with the '-State' suffix. For more information, see "AggregatingMergeTree". - - -==Special data types== - -Special data type values can't be saved to a table or output in results, but are used as the intermediate result of running a query. - -===Set=== - -Used for the right half of an IN expression. - -===Expression=== - -Used for representing lambda expressions in high-order functions. - - -==Boolean values== - -There isn't a separate type for boolean values. They use the UInt8 type, restricted to the values 0 or 1. - -
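-To illustrate the Enum behavior described above, a minimal sketch (the table name is hypothetical and the Memory engine is used only for brevity):
-
-%%
-CREATE TABLE test.enum_example (x Enum8('hello' = 1, 'world' = 2)) ENGINE = Memory;
-INSERT INTO test.enum_example VALUES ('hello'), ('world');
-SELECT x, toInt8(x) FROM test.enum_example WHERE x = 'world';
-%%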
-
-Operators
-
    - -All operators are transformed to the corresponding functions at the query parsing stage, in accordance with their precedence and associativity. - -==Access operators== - -%%a[N]%% - Access to an array element, arrayElement(a, N) function. -%%a.N%% - Access to a tuple element, tupleElement(a, N) function. - -==Numeric negation operator== - -%%-a%% - negate(a) function - -==Multiplication and division operators== - -%%a * b%% - multiply(a, b) function -%%a / b%% - divide(a, b) function -%%a % b%% - modulo(a, b) function - -==Addition and subtraction operators== - -%%a + b%% - plus(a, b) function -%%a - b%% - minus(a, b) function - -==Comparison operators== - -%%a = b%% - equals(a, b) function -%%a == b%% - equals(a, b) function -%%a != b%% - notEquals(a, b) function -%%a <> b%% - notEquals(a, b) function -%%a <= b%% - lessOrEquals(a, b) function -%%a >= b%% - greaterOrEquals(a, b) function -%%a < b%% - less(a, b) function -%%a > b%% - greater(a, b) function -%%a LIKE s%% - like(a, b) function -%%a NOT LIKE s%% - notLike(a, b) function -%%a BETWEEN b AND c%% - equivalent to %%a >= b AND a <= c%% - -==Operators for working with data sets== - -See the section "IN operators". - -%%a IN ...%% - in(a, b) function -%%a NOT IN ...%% - notIn(a, b) function -%%a GLOBAL IN ...%% - globalIn(a, b) function -%%a GLOBAL NOT IN ...%% - globalNotIn(a, b) function - -==Logical negation operator== - -%%NOT a%% - not(a) function - -==Logical "AND" operator== - -%%a AND b%% - and(a, b) function - -==Logical "OR" operator== - -%%a OR b%% - or(a, b) function - -==Conditional operator== - -%%a ? b : c%% - if(a, b, c) function - -==Conditional expression== - -%% -CASE [x] - WHEN a THEN b - [WHEN ... THEN ...] - ELSE c -END -%% -If x is given - transform(x, [a, ...], [b, ...], c). -Otherwise, multiIf(a, b, ..., c). - -==String concatenation operator== -%%s1 || s2%% - concat(s1, s2) function - -==Lambda creation operator== - -x -> expr - lambda(x, expr) function - -The following operators do not have a priority, since they are brackets: - -==Array creation operator== - -%%[x1, ...]%% - array(x1, ...) function - -==Tuple creation operator== - -%%(x1, x2, ...)%% - tuple(x2, x2, ...) function - - -==Associativity== - -All binary operators have left associativity. For example, '1 + 2 + 3' is transformed to 'plus(plus(1, 2), 3)'. -Sometimes this doesn't work the way you expect. For example, 'SELECT 4 > 3 > 2' results in 0. - -For efficiency, the 'and' and 'or' functions accept any number of arguments. The corresponding chains of AND and OR operators are transformed to a single call of these functions. - -
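-A short illustration of this transformation and of left associativity (the results follow from the rules above):
-
-%%
-SELECT 1 + 2 + 3, plus(plus(1, 2), 3), 4 > 3 > 2, greater(greater(4, 3), 2)
-%%
-
-Each pair returns the same value; in particular, %%4 > 3 > 2%% is parsed as %%greater(greater(4, 3), 2)%% and yields 0.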
-
-Functions
-
    - -There are at least* two types of functions - regular functions (they are just called "functions") and aggregate functions. These are completely different concepts. Regular functions work as if they are applied to each row separately (for each row, the result of the function doesn't depend on the other rows). Aggregate functions accumulate a set of values from various rows (i.e. they depend on the entire set of rows). - -In this section we discuss regular functions. For aggregate functions, see the section "Aggregate functions". -* - There is a third type of function that the 'arrayJoin' function belongs to; table functions can also be mentioned separately. - -===Strong typing=== - -In contrast to standard SQL, ClickHouse has strong typing. In other words, it doesn't make implicit conversions between types. Each function works for a specific set of types. This means that sometimes you need to use type conversion functions. - -===Common subexpression elimination=== - -All expressions in a query that have the same AST (the same record or same result of syntactic parsing) are considered to have identical values. Such expressions are concatenated and executed once. Identical subqueries are also eliminated this way. - -===Types of results=== - -All functions return a single return as the result (not several values, and not zero values). The type of result is usually defined only by the types of arguments, not by the values. Exceptions are the tupleElement function (the a.N operator), and the toFixedString function. - -===Constants=== - -For simplicity, certain functions can only work with constants for some arguments. For example, the right argument of the LIKE operator must be a constant. -Almost all functions return a constant for constant arguments. The exception is functions that generate random numbers. -The 'now' function returns different values for queries that were run at different times, but the result is considered a constant, since constancy is only important within a single query. -A constant expression is also considered a constant (for example, the right half of the LIKE operator can be constructed from multiple constants). - -Functions can be implemented in different ways for constant and non-constant arguments (different code is executed). But the results for a constant and for a true column containing only the same value should match each other. - -===Immutability=== - -Functions can't change the values of their arguments - any changes are returned as the result. Thus, the result of calculating separate functions does not depend on the order in which the functions are written in the query. - -===Error handling=== - -Some functions might throw an exception if the data is invalid. In this case, the query is canceled and an error text is returned to the client. For distributed processing, when an exception occurs on one of the servers, the other servers also attempt to abort the query. - -===Evaluation of argument expressions=== - -In almost all programming languages, one of the arguments might not be evaluated for certain operators. This is usually for the operators &&, ||, ?:. -But in ClickHouse, arguments of functions (operators) are always evaluated. This is because entire parts of columns are evaluated at once, instead of calculating each row separately. 
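-A small sketch of this point (using the system.numbers table described earlier): %%1 / number%% is computed for every row, including the row where %%number = 0%%, because both branches of %%if%% are evaluated; since floating-point division by zero yields 'inf' rather than an error, the query still succeeds.
-
-%%
-SELECT number, if(number = 0, 0, 1 / number) FROM system.numbers LIMIT 3
-%%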
- -===Performing functions for distributed query processing=== - -For distributed query processing, as many stages of query processing as possible are performed on remote servers, and the rest of the stages (merging intermediate results and everything after that) are performed on the requestor server. - -This means that functions can be performed on different servers. -For example, in the query SELECT f(sum(g(x))) FROM distributed_table GROUP BY h(y), -- if %%distributed_table%% has at least two shards, the functions %%g%% and %%h%% are performed on remote servers, and the function %%f%% - is performed on the requestor server. -- if %%distributed_table%% has only one shard, all the functions %%f%%, %%g%%, and %%h%% are performed on this shard's server. - -The result of a function usually doesn't depend on which server it is performed on. However, sometimes this is important. -For example, functions that work with dictionaries use the dictionary that exists on the server they are running on. -Another example is the %%hostName%% function, which returns the name of the server it is running on in order to make GROUP BY by servers in a SELECT query. - -If a function in a query is performed on the requestor server, but you need to perform it on remote servers, you can wrap it in an 'any' aggregate function or add it to a key in GROUP BY. - - -==Arithmetic functions== - -For all arithmetic functions, the result type is calculated as the smallest number type that the result fits in, if there is such a type. The minimum is taken simultaneously based on the number of bits, whether it is signed, and whether it floats. If there are not enough bits, the highest bit type is taken. - -Example: - -
-%%
-:) SELECT toTypeName(0), toTypeName(0 + 0), toTypeName(0 + 0 + 0), toTypeName(0 + 0 + 0 + 0)
-
-┌─toTypeName(0)─┬─toTypeName(plus(0, 0))─┬─toTypeName(plus(plus(0, 0), 0))─┬─toTypeName(plus(plus(plus(0, 0), 0), 0))─┐
-│ UInt8         │ UInt16                 │ UInt32                          │ UInt64                                   │
-└───────────────┴────────────────────────┴─────────────────────────────────┴──────────────────────────────────────────┘
-%%
    - -Arithmetic functions work for any pair of types from UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, or Float64. - -Overflow is produced the same way as in C++. - - -===plus(a, b), a + b operator=== - -Calculates the sum of the numbers. - -You can also add whole numbers with a date or date and time. In the case of a date, adding a whole number means adding the corresponding number of days. For a date with time, it means adding the corresponding number of seconds. - -===minus(a, b), a - b operator=== - -Calculates the difference. The result is always signed. - -You can also calculate whole numbers from a date or date with time. The idea is the same - see above for 'plus'. - -===multiply(a, b), a * b operator=== - -Calculates the product of the numbers. - -===divide(a, b), a / b operator=== - -Calculates the quotient of the numbers. The result type is always a floating-point type. -It is not integer division. For integer division, use the 'intDiv' function. -When dividing by zero you get 'inf', '-inf', or 'nan'. - -===intDiv(a, b)=== - -Calculates the quotient of the numbers. Divides into integers, rounding down (by the absolute value). -When dividing by zero or when dividing a minimal negative number by minus one, an exception is thrown. - -===intDivOrZero(a, b)=== - -Differs from 'intDiv' in that it returns zero when dividing by zero or when dividing a minimal negative number by minus one. - -===modulo(a, b), a % b operator=== - -Calculates the remainder after division. -If arguments are floating-point numbers, they are pre-converted to integers by dropping the decimal portion. The remainder is taken in the same sense as in C++. Truncated division is used for negative numbers. -An exception is thrown when dividing by zero or when dividing a minimal negative number by minus one. - -===negate(a), -a operator=== - -Calculates a number with the reverse sign. The result is always signed. - -===abs(a)=== - -Calculates the absolute value of the number 'a'. That is, if a< 0, it returns -a. -For unsigned types, it doesn't do anything. For signed integer types, it returns an unsigned number. - -==Bit functions== - -Bit functions work for any pair of types from UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, or Float64. - -The result type is an integer with bits equal to the maximum bits of its arguments. If at least one of the arguments is signed, the result is a signed number. If an argument is a floating-point number, it is cast to Int64. - -===bitAnd(a, b)=== - -===bitOr(a, b)=== - -===bitXor(a, b)=== - -===bitNot(a)=== - -===bitShiftLeft(a, b)=== - -===bitShiftRight(a, b)=== - - -==Comparison functions== - -Comparison functions always return 0 or 1 (Uint8). - -The following types can be compared: -- numbers -- strings and fixed strings -- dates -- dates with times -within each group, but not between different groups. - -For example, you can't compare a date with a string. You have to use a function to convert the string to a date, or vice versa. - -Strings are compared by bytes. A shorter string is smaller than all strings that start with it and that contain at least one more character. - -Note: before version 1.1.54134 signed and unsigned numbers were compared the same way as in C++. That is, you could got an incorrect result in such cases: SELECT 9223372036854775807 > -1. From version 1.1.54134, the behavior has changed and numbers are compared mathematically correct. - - -===equals, a = b and a == b operator=== - -

-===notEquals, a != b and a <> b operator===
-
-===less, < operator===
-
-===greater, > operator===
-
-===lessOrEquals, <= operator===
-
-===greaterOrEquals, >= operator===
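-
-A small, hedged sketch of the comparison functions above (the operands are arbitrary); each expression returns a UInt8 value, here expected to be 1, 1, 1 and 0 respectively:
-
-%%
-SELECT 1 < 2, 'abc' = 'abc', toDate('2019-01-16') >= toDate('2019-01-01'), 10 != 10
-%%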

    - - -==Logical functions== - -Logical functions accept any numeric types, but return a UInt8 number equal to 0 or 1. - -Zero as an argument is considered "false," while any non-zero value is considered "true". - - -===and, AND operator=== - -===or, OR operator=== - -===not, NOT operator=== - -===xor=== - - -==Type conversion functions== - -===toUInt8, toUInt16, toUInt32, toUInt64=== -===toInt8, toInt16, toInt32, toInt64=== -===toFloat32, toFloat64=== -===toUInt8OrZero, toUInt16OrZero, toUInt32OrZero, toUInt64OrZero, toInt8OrZero, toInt16OrZero, toInt32OrZero, toInt64OrZero, toFloat32OrZero, toFloat64OrZero=== -===toDate, toDateTime=== -===toString=== - -Functions for converting between numbers, strings (but not fixed strings), dates, and dates with times. All these functions accept one argument. - -When converting to or from a string, the value is formatted or parsed using the same rules as for the TabSeparated format (and almost all other text formats). If the string can't be parsed, an exception is thrown and the request is canceled. - -When converting dates to numbers or vice versa, the date corresponds to the number of days since the beginning of the Unix epoch. -When converting dates with times to numbers or vice versa, the date with time corresponds to the number of seconds since the beginning of the Unix epoch. - -Formats of date and date with time for toDate/toDateTime functions are defined as follows: -%% -YYYY-MM-DD -YYYY-MM-DD hh:mm:ss -%% - -As an exception, if converting from UInt32, Int32, UInt64, or Int64 type numbers to Date, and if the number is greater than or equal to 65536, the number is interpreted as a Unix timestamp (and not as the number of days) and is rounded to the date. This allows support for the common occurrence of writing 'toDate(unix_timestamp)', which otherwise would be an error and would require writing the more cumbersome 'toDate(toDateTime(unix_timestamp))'. - -Conversion between a date and date with time is performed the natural way: by adding a null time or dropping the time. - -Conversion between numeric types uses the same rules as assignments between different numeric types in C++. - -To do transformations on DateTime in given time zone, pass second argument with time zone name: -%% -SELECT - toDateTime('2016-06-15 23:00:00') AS time, - toDate(time) AS date_local, - toDate(time, 'Asia/Yekaterinburg') AS date_yekat, - toString(time, 'US/Samoa') AS time_samoa - -┌────────────────time─┬─date_local─┬─date_yekat─┬─time_samoa──────────┐ -│ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-16 │ 2016-06-15 09:00:00 │ -└─────────────────────┴────────────┴────────────┴─────────────────────┘ -%% - -To format DateTime in given time zone: -%% -toString(now(), 'Asia/Yekaterinburg') -%% -To get unix timestamp for string with datetime in specified time zone: -%% -toUnixTimestamp('2000-01-01 00:00:00', 'Asia/Yekaterinburg') -%% - -===toFixedString(s, N)=== - -Converts a String type argument to a FixedString(N) type (a string with fixed length N). N must be a constant. If the string has fewer bytes than N, it is passed with null bytes to the right. If the string has more bytes than N, an exception is thrown. - -===toStringCutToZero(s)=== - -Accepts a String or FixedString argument. Returns a String that is cut to a first null byte occurrence. 
- -Example: -%% -:) SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut - -┌─s─────────────┬─s_cut─┐ -│ foo\0\0\0\0\0 │ foo │ -└───────────────┴───────┘ - -:) SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut - -┌─s──────────┬─s_cut─┐ -│ foo\0bar\0 │ foo │ -└────────────┴───────┘ -%% - -===reinterpretAsUInt8, reinterpretAsUInt16, reinterpretAsUInt32, reinterpretAsUInt64=== -===reinterpretAsInt8, reinterpretAsInt16, reinterpretAsInt32, reinterpretAsInt64=== -===reinterpretAsFloat32, reinterpretAsFloat64=== -===reinterpretAsDate, reinterpretAsDateTime=== - -These functions accept a string and interpret the bytes placed at the beginning of the string as a number in host order (little endian). If the string isn't long enough, the functions work as if the string is padded with the necessary number of null bytes. If the string is longer than needed, the extra bytes are ignored. A date is interpreted as the number of days since the beginning of the Unix Epoch, and a date with time is interpreted as the number of seconds since the beginning of the Unix Epoch. - -===reinterpretAsString=== - -This function accepts a number or date or date with time, and returns a string containing bytes representing the corresponding value in host order (little endian). Null bytes are dropped from the end. For example, a UInt32 type value of 255 is a string that is one byte long. - -===CAST(x, t)=== - -Casts x to the t data type. -The syntax %%CAST(x AS t)%% is also supported. - -Example: -%% -SELECT - '2016-06-15 23:00:00' AS timestamp, - CAST(timestamp AS DateTime) AS datetime, - CAST(timestamp AS Date) AS date, - CAST(timestamp, 'String') AS string, - CAST(timestamp, 'FixedString(22)') AS fixed_string - -┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ -│ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ -└─────────────────────┴─────────────────────┴────────────┴─────────────────────┴───────────────────────────┘ -%% - -Casting to FixedString(N) works only for String and FixedString(N). - - -==Functions for working with dates and times== - -===toYear=== -Converts a date or date with time to a UInt16 number containing the year number (AD). - -===toMonth=== -Converts a date or date with time to a UInt8 number containing the month number (1-12). - -===toDayOfMonth=== -Converts a date or date with time to a UInt8 number containing the number of the day of the month (1-31). - -===toDayOfWeek=== -Converts a date or date with time to a UInt8 number containing the number of the day of the week (Monday is 1, and Sunday is 7). - -===toHour=== -Converts a date with time to a UInt8 number containing the number of the hour in 24-hour time (0-23). -This function assumes that if clocks are moved ahead, it is by one hour and occurs at 2 a.m., and if clocks are moved back, it is by one hour and occurs at 3 a.m. (which is not always true - even in Moscow the clocks were once changed at a different time). - -===toMinute=== -Converts a date with time to a UInt8 number containing the number of the minute of the hour (0-59). - -===toSecond=== -Converts a date with time to a UInt8 number containing the number of the second in the minute (0-59). -Leap seconds are not accounted for. - -===toStartOfDay=== -Rounds down a date with time to the start of the day. - -===toMonday=== -Rounds down a date or date with time to the nearest Monday. -Returns the date. 
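-
-A hedged example of several of the functions above, using an arbitrary timestamp; for this value toYear should return 2019, toDayOfWeek should return 3 (Wednesday), and toMonday should return 2019-01-14:
-
-%%
-SELECT
-    toDateTime('2019-01-16 01:08:56') AS t,
-    toYear(t),
-    toDayOfWeek(t),
-    toMonday(t)
-%%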
- -===toStartOfMonth=== -Rounds down a date or date with time to the first day of the month. -Returns the date. - -===toStartOfQuarter=== -Rounds down a date or date with time to the first day of the quarter. -The first day of the quarter is either 1 January, 1 April, 1 July, or 1 October. Returns the date. - -===toStartOfYear=== -Rounds down a date or date with time to the first day of the year. -Returns the date. - -===toStartOfMinute=== -Rounds down a date with time to the start of the minute. - -===toStartOfFiveMinute=== -Rounds down a date with time to the start of the 5 minute (00:00, 00:05, 00:10...). - -===toStartOfHour=== -Rounds down a date with time to the start of the hour. - -===toTime=== -Converts a date with time to some fixed date, while preserving the time. - -===toRelativeYearNum=== -Converts a date with time or date to the number of the year, starting from a certain fixed point in the past. - -===toRelativeMonthNum=== -Converts a date with time or date to the number of the month, starting from a certain fixed point in the past. - -===toRelativeWeekNum=== -Converts a date with time or date to the number of the week, starting from a certain fixed point in the past. - -===toRelativeDayNum=== -Converts a date with time or date to the number of the day, starting from a certain fixed point in the past. - -===toRelativeHourNum=== -Converts a date with time or date to the number of the hour, starting from a certain fixed point in the past. - -===toRelativeMinuteNum=== -Converts a date with time or date to the number of the minute, starting from a certain fixed point in the past. - -===toRelativeSecondNum=== -Converts a date with time or date to the number of the second, starting from a certain fixed point in the past. - -===now=== -Accepts zero arguments and returns the current time at one of the moments of request execution. -This function returns a constant, even if the request took a long time to complete. - -===today=== -Accepts zero arguments and returns the current date at one of the moments of request execution. -The same as 'toDate(now())'. - -===yesterday=== -Accepts zero arguments and returns yesterday's date at one of the moments of request execution. -The same as 'today() - 1'. - -===timeSlot=== -Rounds the time to the half hour. -This function is specific to Yandex.Metrica, since half an hour is the minimum amount of time for breaking a session into two sessions if a counter shows a single user's consecutive pageviews that differ in time by strictly more than this amount. This means that tuples (the counter number, user ID, and time slot) can be used to search for pageviews that are included in the corresponding session. - -===timeSlots(StartTime, Duration)=== -For a time interval starting at 'StartTime' and continuing for 'Duration' seconds, it returns an array of moments in time, consisting of points from this interval rounded down to the half hour. -For example, %%timeSlots(toDateTime('2012-01-01 12:20:00'), toUInt32(600)) = [toDateTime('2012-01-01 12:00:00'), toDateTime('2012-01-01 12:30:00')]%%. -This is necessary for searching for pageviews in the corresponding session. - - -==Functions for working with strings== - -===empty=== -Returns 1 for an empty string or 0 for a non-empty string. -The result type is UInt8. -A string is considered non-empty if it contains at least one byte, even if this is a space or a null byte. -The function also works for arrays. - -===notEmpty=== -Returns 0 for an empty string or 1 for a non-empty string. -The result type is UInt8. 
-The function also works for arrays. - -===length=== -Returns the length of a string in bytes (not in characters, and not in code points). -The result type is UInt64. -The function also works for arrays. - -===lengthUTF8=== -Returns the length of a string in Unicode code points (not in characters), assuming that the string contains a set of bytes that make up UTF-8 encoded text. If this assumption is not met, it returns some result (it doesn't throw an exception). -The result type is UInt64. - -===lower=== -Converts ASCII Latin symbols in a string to lowercase. - -===upper=== -Converts ASCII Latin symbols in a string to uppercase. - -===lowerUTF8=== -Converts a string to lowercase, assuming the string contains a set of bytes that make up a UTF-8 encoded text. It doesn't detect the language. So for Turkish the result might not be exactly correct. -If length of UTF-8 sequence is different for upper and lower case of code point, then result for that code point could be incorrect. -If value contains invalid UTF-8, the behavior is unspecified. - -===upperUTF8=== -Converts a string to uppercase, assuming the string contains a set of bytes that make up a UTF-8 encoded text. It doesn't detect the language. So for Turkish the result might not be exactly correct. -If length of UTF-8 sequence is different for upper and lower case of code point, then result for that code point could be incorrect. -If value contains invalid UTF-8, the behavior is unspecified. - -===reverse=== -Reverses the string (as a sequence of bytes). - -===reverseUTF8=== -Reverses a sequence of Unicode code points, assuming that the string contains a set of bytes representing a UTF-8 text. Otherwise, it does something else (it doesn't throw an exception). - -===concat(s1, s2, ...)=== -Concatenates strings from the function arguments, without a separator. - -===substring(s, offset, length)=== -Returns a substring starting with the byte from the 'offset' index that is 'length' bytes long. Character indexing starts from one (as in standard SQL). The 'offset' and 'length' arguments must be constants. - -===substringUTF8(s, offset, length)=== -The same as 'substring', but for Unicode code points. Works under the assumption that the string contains a set of bytes representing a UTF-8 encoded text. If this assumption is not met, it returns some result (it doesn't throw an exception). - -===appendTrailingCharIfAbsent(s, c)=== -If the %%s%% string is non-empty and does not contain the %%c%% character at the end, it appends the %%c%% character to the end. - -===convertCharset(s, from, to)=== -Returns a string with the data %%s%% (encoded as %%from%% charset) that was converted to the %%to%% charset. - -==Functions for searching strings== - -The search is case-sensitive in all these functions. -The search substring or regular expression must be a constant in all these functions. - -===position(haystack, needle)=== -Searches for the 'needle' substring in the 'haystack' string. -Returns the position (in bytes) of the found substring, starting from 1, or returns 0 if the substring was not found. -There's also positionCaseInsensitive function. - -===positionUTF8(haystack, needle)=== -The same as 'position', but the position is returned in Unicode code points. Works under the assumption that the string contains a set of bytes representing a UTF-8 encoded text. If this assumption is not met, it returns some result (it doesn't throw an exception). -There's also positionCaseInsensitiveUTF8 function. 
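-
-A hedged illustration of the difference between byte offsets and code point offsets (the strings are arbitrary; each Cyrillic character takes two bytes in UTF-8, so 'position' should return 7 while 'positionUTF8' should return 4 for the second string):
-
-%%
-SELECT
-    position('Hello, world!', 'world'),
-    position('привет', 'вет'),
-    positionUTF8('привет', 'вет')
-%%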
- -===match(haystack, pattern)=== -Checks whether the string matches the 'pattern' regular expression. -The regular expression is re2. -Returns 0 if it doesn't match, or 1 if it matches. - -Note that the backslash symbol (%%\%%) is used for escaping in the regular expression. The same symbol is used for escaping in string literals. So in order to escape the symbol in a regular expression, you must write two backslashes (%%\\%%) in a string literal. - -The regular expression works with the string as if it is a set of bytes. -The regular expression can't contain null bytes. -For patterns to search for substrings in a string, it is better to use LIKE or 'position', since they work much faster. - -===extract(haystack, pattern)=== -Extracts a fragment of a string using a regular expression. If 'haystack' doesn't match the 'pattern' regex, an empty string is returned. If the regex doesn't contain subpatterns, it takes the fragment that matches the entire regex. Otherwise, it takes the fragment that matches the first subpattern. - -===extractAll(haystack, pattern)=== -Extracts all the fragments of a string using a regular expression. If 'haystack' doesn't match the 'pattern' regex, an empty string is returned. Returns an array of strings consisting of all matches to the regex. In general, the behavior is the same as the 'extract' function (it takes the first subpattern, or the entire expression if there isn't a subpattern). - -===like(haystack, pattern), haystack LIKE pattern operator=== -Checks whether a string matches a simple regular expression. The regular expression can contain the metasymbols %%%%% and %%_%%. -%%%%% indicates any quantity of any bytes (including zero characters). -%%_%% indicates any one byte. - -Use the backslash (%%\%%) for escaping metasymbols. See the note on escaping in the description of the 'match' function. - -For regular expressions like%%%needle%%%, the code is more optimal and works as fast as the 'position' function. For other regular expressions, the code is the same as for the 'match' function. - -===notLike(haystack, pattern), haystack NOT LIKE pattern operator=== -The same thing as 'like', but negative. - - -==Functions for searching and replacing in strings== - -===replaceOne(haystack, pattern, replacement)=== -Replaces the first occurrence, if it exists, of the 'pattern' substring in 'haystack' with the 'replacement' substring. -Hereafter, 'pattern' and 'replacement' must be constants. - -===replaceAll(haystack, pattern, replacement)=== -Replaces all occurrences of the 'pattern' substring in 'haystack' with the 'replacement' substring. - -===replaceRegexpOne(haystack, pattern, replacement)=== -Replacement using the 'pattern' regular expression. A re2 regular expression. Replaces only the first occurrence, if it exists. -A pattern can be specified as 'replacement'. This pattern can include substitutions \0-\9\. -The substitution \0 includes the entire regular expression. -The substitutions \1-\9 include the subpattern corresponding to the number. -In order to specify the \ symbol in a pattern, you must use a \ symbol to escape it. -Also keep in mind that a string literal requires an extra escape. - -Example 1. 
Converting the date to American format: - -%% -SELECT DISTINCT - EventDate, - replaceRegexpOne(toString(EventDate), '(\\d{4})-(\\d{2})-(\\d{2})', '\\2/\\3/\\1') AS res -FROM test.hits -LIMIT 7 -FORMAT TabSeparated - -2014-03-17 03/17/2014 -2014-03-18 03/18/2014 -2014-03-19 03/19/2014 -2014-03-20 03/20/2014 -2014-03-21 03/21/2014 -2014-03-22 03/22/2014 -2014-03-23 03/23/2014 -%% - -Example 2. Copy the string ten times: - -%% -SELECT replaceRegexpOne('Hello, World!', '.*', '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0') AS res - -┌─res────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World! │ -└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -%% - -===replaceRegexpAll(haystack, pattern, replacement)=== -This does the same thing, but replaces all the occurrences. Example: - -%% -SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res - -┌─res────────────────────────┐ -│ HHeelllloo,, WWoorrlldd!! │ -└────────────────────────────┘ -%% - -As an exception, if a regular expression worked on an empty substring, the replacement is not made more than once. Example: - -%% -SELECT replaceRegexpAll('Hello, World!', '^', 'here: ') AS res - -┌─res─────────────────┐ -│ here: Hello, World! │ -└─────────────────────┘ -%% - -==Functions for working with arrays== - -===empty=== -Returns 1 for an empty array, or 0 for a non-empty array. -The result type is UInt8. -The function also works for strings. - -===notEmpty=== -Returns 0 for an empty array, or 1 for a non-empty array. -The result type is UInt8. -The function also works for strings. - -===length=== -Returns the number of items in the array. -The result type is UInt64. -The function also works for strings. - -===emptyArrayUInt8, emptyArrayUInt16, emptyArrayUInt32, emptyArrayUInt64=== -===emptyArrayInt8, emptyArrayInt16, emptyArrayInt32, emptyArrayInt64=== -===emptyArrayFloat32, emptyArrayFloat64=== -===emptyArrayDate, emptyArrayDateTime=== -===emptyArrayString=== -Accepts zero arguments and returns an empty array of the appropriate type. - -===emptyArrayToSingle=== -Accepts an empty array as argument and returns an array of one element equal to the default value. - -===range(N)=== -Returns an array of numbers from 0 to N-1. -Just in case, an exception is thrown if arrays with a total length of more than 100,000,000 elements are created in a data block. - -===array(x1, ...), [x1, ...] operator=== -Creates an array from the function arguments. -The arguments must be constants and have types that have the smallest common type. At least one argument must be passed, because otherwise it isn't clear which type of array to create. That is, you can't use this function to create an empty array (to do that, use the 'emptyArray*' function described above). -Returns an 'Array(T)' type result, where 'T' is the smallest common type out of the passed arguments. - -===arrayElement(arr, n), arr[n] operator=== -Get the element with the index 'n' from the array 'arr'. -'n' should be any integer type. -Indexes in an array begin from one. -Negative indexes are supported - in this case, it selects the corresponding element numbered from the end. For example, 'arr[-1]' is the last item in the array. 
- -If the index goes beyond the array bounds: -- if both arguments are constants, an exception is thrown. -- otherwise, a default value is returned (0 for numbers, an empty string for strings, etc.). - -===has(arr, elem)=== -Checks whether the 'arr' array has the 'elem' element. -Returns 0 if the the element is not in the array, or 1 if it is. -'elem' must be a constant. - -===indexOf(arr, x)=== -Returns the index of the 'x' element (starting from 1) if it is in the array, or 0 if it is not. - -===countEqual(arr, x)=== -Returns the number of elements in the array equal to 'x'. Equivalent to arrayCount(elem -> elem = x, arr). - -===arrayEnumerate(arr)=== -Returns the array %%[1, 2, 3, ..., length(arr)]%% - -This function is normally used together with ARRAY JOIN. It allows counting something just once for each array after applying ARRAY JOIN. Example: - -%% -SELECT - count() AS Reaches, - countIf(num = 1) AS Hits -FROM test.hits -ARRAY JOIN - GoalsReached, - arrayEnumerate(GoalsReached) AS num -WHERE CounterID = 160656 -LIMIT 10 - -┌─Reaches─┬──Hits─┐ -│ 95606 │ 31406 │ -└─────────┴───────┘ -%% - -In this example, Reaches is the number of conversions (the strings received after applying ARRAY JOIN), and Hits is the number of pageviews (strings before ARRAY JOIN). In this particular case, you can get the same result in an easier way: - -%% -SELECT - sum(length(GoalsReached)) AS Reaches, - count() AS Hits -FROM test.hits -WHERE (CounterID = 160656) AND notEmpty(GoalsReached) - -┌─Reaches─┬──Hits─┐ -│ 95606 │ 31406 │ -└─────────┴───────┘ -%% - -This function can also be used in higher-order functions. For example, you can use it to get array indexes for elements that match a condition. - -===arrayEnumerateUniq(arr, ...)=== -Returns an array the same size as the source array, indicating for each element what its position is among elements with the same value. -For example: %%arrayEnumerateUniq([10, 20, 10, 30]) = [1, 1, 2, 1]%%. - -This function is useful when using ARRAY JOIN and aggregation of array elements. Example: - -%% -SELECT - Goals.ID AS GoalID, - sum(Sign) AS Reaches, - sumIf(Sign, num = 1) AS Visits -FROM test.visits -ARRAY JOIN - Goals, - arrayEnumerateUniq(Goals.ID) AS num -WHERE CounterID = 160656 -GROUP BY GoalID -ORDER BY Reaches DESC -LIMIT 10 - -┌──GoalID─┬─Reaches─┬─Visits─┐ -│ 53225 │ 3214 │ 1097 │ -│ 2825062 │ 3188 │ 1097 │ -│ 56600 │ 2803 │ 488 │ -│ 1989037 │ 2401 │ 365 │ -│ 2830064 │ 2396 │ 910 │ -│ 1113562 │ 2372 │ 373 │ -│ 3270895 │ 2262 │ 812 │ -│ 1084657 │ 2262 │ 345 │ -│ 56599 │ 2260 │ 799 │ -│ 3271094 │ 2256 │ 812 │ -└─────────┴─────────┴────────┘ -%% - -In this example, each goal ID has a calculation of the number of conversions (each element in the Goals nested data structure is a goal that was reached, which we refer to as a conversion) and the number of sessions. Without ARRAY JOIN, we would have counted the number of sessions as %%sum(Sign)%%. But in this particular case, the rows were multiplied by the nested Goals structure, so in order to count each session one time after this, we apply a condition to the value of the %%arrayEnumerateUniq(Goals.ID)%% function. - -The arrayEnumerateUniq function can take multiple arrays of the same size as arguments. In this case, uniqueness is considered for tuples of elements in the same positions in all the arrays. 
- -%% -SELECT arrayEnumerateUniq([1, 1, 1, 2, 2, 2], [1, 1, 2, 1, 1, 2]) AS res - -┌─res───────────┐ -│ [1,2,1,1,2,1] │ -└───────────────┘ -%% - -This is necessary when using ARRAY JOIN with a nested data structure and further aggregation across multiple elements in this structure. - - -===arrayUniq(arr, ...)=== - -If a single array is passed, returns a number of unique elements in that array. -If multiple arrays of the same size are passed as arguments to the function, returns a number of unique tuples of elements in the same positions in all the arrays. - -If you need an array of the unique elements, you can use %%arrayReduce('groupUniqArray', arr)%%. - - -===arrayJoin(arr)=== -A special function. See the section "arrayJoin function". - - -==Higher-order functions== - -

-===-> operator, lambda(params, expr) function===
    -Allows describing a lambda function for passing to a higher-order function. The left side of the arrow has a formal parameter - any ID, or multiple formal parameters - any IDs in a tuple. The right side of the arrow has an expression that can use these formal parameters, as well as any table columns. - -Examples: x -> 2 * x, str -> str != Referer. - -Higher-order functions can only accept lambda functions as their functional argument. - -A lambda function that accepts multiple arguments can be passed to a higher-order function. In this case, the higher-order function is passed several arrays of identical length that these arguments will correspond to. - -For all functions other than 'arrayMap' and 'arrayFilter', the first argument (the lambda function) can be omitted. In this case, identical mapping is assumed. - -===arrayMap(func, arr1, ...)=== -Returns an array obtained from the original application of the 'func' function to each element in the 'arr' array. - -===arrayFilter(func, arr1, ...)=== -Returns an array containing only the elements in 'arr1' for which 'func' returns something other than 0. - -Examples: - -%% -SELECT arrayFilter(x -> x LIKE '%World%', ['Hello', 'abc World']) AS res - -┌─res───────────┐ -│ ['abc World'] │ -└───────────────┘ - -SELECT - arrayFilter( - (i, x) -> x LIKE '%World%', - arrayEnumerate(arr), - ['Hello', 'abc World'] AS arr) - AS res - -┌─res─┐ -│ [2] │ -└─────┘ -%% - -===arrayCount([func,] arr1, ...)=== -Returns the number of elements in 'arr' for which 'func' returns something other than 0. If 'func' is not specified, it returns the number of non-zero items in the array. - -===arrayExists([func,] arr1, ...)=== -Returns 1 if there is at least one element in 'arr' for which 'func' returns something other than 0. Otherwise, it returns 0. - -===arrayAll([func,] arr1, ...)=== -Returns 1 if 'func' returns something other than 0 for all the elements in 'arr'. Otherwise, it returns 0. - -===arraySum([func,] arr1, ...)=== -Returns the sum of the 'func' values. If the function is omitted, it just returns the sum of the array elements. - -===arrayFirst(func, arr1, ...)=== -Returns the first element in the 'arr1' array for which 'func' returns something other than 0. - -===arrayFirstIndex(func, arr1, ...)=== -Returns the index of the first element in the 'arr1' array for which 'func' returns something other than 0. - - -==Functions for splitting and merging strings and arrays== - -===splitByChar(separator, s)=== -Splits a string into substrings, using 'separator' as the separator. -'separator' must be a string constant consisting of exactly one character. -Returns an array of selected substrings. Empty substrings may be selected if the separator occurs at the beginning or end of the string, or if there are multiple consecutive separators. - -===splitByString(separator, s)=== -The same as above, but it uses a string of multiple characters as the separator. The string must be non-empty. - -===arrayStringConcat(arr[, separator])=== -Concatenates strings from the array elements, using 'separator' as the separator. -'separator' is a string constant, an optional parameter. By default it is an empty string. -Returns a string. - -===alphaTokens(s)=== -Selects substrings of consecutive bytes from the range a-z and A-Z. -Returns an array of selected substrings. - - -==Functions for working with URLs== - -All these functions don't follow the RFC. They are maximally simplified for improved performance. 
-
-===Functions that extract part of a URL===
-
-If there isn't anything similar in a URL, an empty string is returned.
-
-===protocol===
-Selects the protocol. Examples: http, ftp, mailto, magnet...
-
-===domain===
-Selects the domain.
-
-===domainWithoutWWW===
-Selects the domain and removes no more than one 'www.' from the beginning of it, if present.
-
-===topLevelDomain===
-Selects the top-level domain. Example: .ru.
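-
-A hedged sketch of what the functions above are expected to return for an arbitrary URL: 'www.example.com', 'example.com' and 'com' respectively.
-
-%%
-SELECT
-    domain('https://www.example.com/top/news.html?page=2#comments'),
-    domainWithoutWWW('https://www.example.com/top/news.html?page=2#comments'),
-    topLevelDomain('https://www.example.com/top/news.html?page=2#comments')
-%%
-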

-===firstSignificantSubdomain===
-Selects the "first significant subdomain". This is a non-standard concept specific to Yandex.Metrica.
-The first significant subdomain is a second-level domain if it is 'com', 'net', 'org', or 'co'. Otherwise, it is a third-level domain.
-For example, firstSignificantSubdomain('https://news.yandex.ru/') = 'yandex', firstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex'.
-The list of "insignificant" second-level domains and other implementation details may change in the future.
-
-===cutToFirstSignificantSubdomain===
-Selects the part of the domain that includes top-level subdomains up to the "first significant subdomain" (see the explanation above).
-For example, cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'.
-
-===path===
-Selects the path. Example: /top/news.html
-The path does not include the query-string.
-
-===pathFull===
-The same as above, but including query-string and fragment. Example: /top/news.html?page=2#comments
-
-===queryString===
-Selects the query-string. Example: page=1&lr=213.
-query-string does not include the first question mark, or # and everything that comes after #.
-
-===fragment===
-Selects the fragment identifier.
-fragment does not include the first number sign (#).
-
-===queryStringAndFragment===
-Selects the query-string and fragment identifier. Example: page=1#29390.
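-
-A hedged sketch combining the functions above on an arbitrary URL; following the descriptions, the expected results are '/top/news.html', 'page=2' and 'comments'.
-
-%%
-SELECT
-    path('https://example.com/top/news.html?page=2#comments'),
-    queryString('https://example.com/top/news.html?page=2#comments'),
-    fragment('https://example.com/top/news.html?page=2#comments')
-%%
-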

-===extractURLParameter(URL, name)===
-Selects the value of the 'name' parameter in the URL, if present. Otherwise, selects an empty string. If there are many parameters with this name, it returns the first occurrence. This function works under the assumption that the parameter name is encoded in the URL in exactly the same way as in the argument passed.
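-
-A hedged example with an arbitrary URL; the expected result is the string '213':
-
-%%
-SELECT extractURLParameter('http://example.com/?page=1&lr=213', 'lr')
-%%
-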

-===extractURLParameters(URL)===
-Gets an array of name=value strings corresponding to the URL parameters. The values are not decoded in any way.
-
-===extractURLParameterNames(URL)===
-Gets an array of name strings corresponding to the names of URL parameters. The names are not decoded in any way.
-
-===URLHierarchy(URL)===
-Gets an array containing the URL trimmed to the %%/%%, %%?%% characters in the path and query-string. Consecutive separator characters are counted as one. The cut is made in the position after all the consecutive separator characters. Example:
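-
-(A hedged reconstruction of the announced example, based on the description above and on the URLPathHierarchy example below; the exact output has not been verified against this version:)
-
-%%
-URLHierarchy('https://example.com/browse/CONV-6788') =
-[
-    'https://example.com/',
-    'https://example.com/browse/',
-    'https://example.com/browse/CONV-6788'
-]
-%%
-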

-===URLPathHierarchy(URL)===
-The same thing, but without the protocol and host in the result. The / element (root) is not included. Example:
-
-This function is used for implementing tree-view reports by URL in Yandex.Metrica.
-
-%%
-URLPathHierarchy('https://example.com/browse/CONV-6788') =
-[
-    '/browse/',
-    '/browse/CONV-6788'
-]
-%%
-
-===decodeURLComponent(URL)===
-Returns a URL-decoded URL.
-
-Example:
-%%
-:) SELECT decodeURLComponent('http://127.0.0.1:8123/?query=SELECT%201%3B') AS DecodedURL;
-
-┌─DecodedURL─────────────────────────────┐
-│ http://127.0.0.1:8123/?query=SELECT 1; │
-└────────────────────────────────────────┘
-%%
-
-===Functions that remove part of a URL===
-
-If the URL doesn't have anything similar, the URL remains unchanged.
-
-===cutWWW===
-Removes no more than one 'www.' from the beginning of the URL's domain, if present.
-
-===cutQueryString===
-Removes the query-string. The question mark is also removed.
-
-===cutFragment===
-Removes the fragment identifier. The number sign is also removed.
-
-===cutQueryStringAndFragment===
-Removes the query-string and fragment identifier. The question mark and number sign are also removed.
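-
-A hedged sketch with an arbitrary URL; following the descriptions above, cutQueryString should keep the fragment while cutQueryStringAndFragment should remove both:
-
-%%
-SELECT
-    cutQueryString('http://example.com/path?page=1#frag'),
-    cutQueryStringAndFragment('http://example.com/path?page=1#frag')
-%%
-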

-===cutURLParameter(URL, name)===
    -Removes the URL parameter named 'name', if present. This function works under the assumption that the parameter name is encoded in the URL exactly the same way as in the passed argument. - - -==Functions for working with IP addresses== - -===IPv4NumToString(num)=== -Takes a UInt32 number. Interprets it as an IPv4 address in big endian. Returns a string containing the corresponding IPv4 address in the format A.B.C.d (dot-separated numbers in decimal form). - -===IPv4StringToNum(s)=== -The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it returns 0. - -===IPv4NumToStringClassC(num)=== -Similar to IPv4NumToString, but using %%xxx%% instead of the last octet. Example: - -%% -SELECT - IPv4NumToStringClassC(ClientIP) AS k, - count() AS c -FROM test.hits -GROUP BY k -ORDER BY c DESC -LIMIT 10 - -┌─k──────────────┬─────c─┐ -│ 83.149.9.xxx │ 26238 │ -│ 217.118.81.xxx │ 26074 │ -│ 213.87.129.xxx │ 25481 │ -│ 83.149.8.xxx │ 24984 │ -│ 217.118.83.xxx │ 22797 │ -│ 78.25.120.xxx │ 22354 │ -│ 213.87.131.xxx │ 21285 │ -│ 78.25.121.xxx │ 20887 │ -│ 188.162.65.xxx │ 19694 │ -│ 83.149.48.xxx │ 17406 │ -└────────────────┴───────┘ -%% - -Since using 'xxx' is highly unusual, this may be changed in the future. We recommend that you don't rely on the exact format of this fragment. - -===IPv6NumToString(x)=== -Accepts a FixedString(16) value containing the IPv6 address in binary format. Returns a string containing this address in text format. -IPv6-mapped IPv4 addresses are output in the format %%::ffff:111.222.33.44%%. Examples: - -%% -SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr - -┌─addr─────────┐ -│ 2a02:6b8::11 │ -└──────────────┘ -%% - -%% -SELECT - IPv6NumToString(ClientIP6 AS k), - count() AS c -FROM hits_all -WHERE EventDate = today() AND substring(ClientIP6, 1, 12) != unhex('00000000000000000000FFFF') -GROUP BY k -ORDER BY c DESC -LIMIT 10 - -┌─IPv6NumToString(ClientIP6)──────────────┬─────c─┐ -│ 2a02:2168:aaa:bbbb::2 │ 24695 │ -│ 2a02:2698:abcd:abcd:abcd:abcd:8888:5555 │ 22408 │ -│ 2a02:6b8:0:fff::ff │ 16389 │ -│ 2a01:4f8:111:6666::2 │ 16016 │ -│ 2a02:2168:888:222::1 │ 15896 │ -│ 2a01:7e00::ffff:ffff:ffff:222 │ 14774 │ -│ 2a02:8109:eee:ee:eeee:eeee:eeee:eeee │ 14443 │ -│ 2a02:810b:8888:888:8888:8888:8888:8888 │ 14345 │ -│ 2a02:6b8:0:444:4444:4444:4444:4444 │ 14279 │ -│ 2a01:7e00::ffff:ffff:ffff:ffff │ 13880 │ -└─────────────────────────────────────────┴───────┘ -%% - -%% -SELECT - IPv6NumToString(ClientIP6 AS k), - count() AS c -FROM hits_all -WHERE EventDate = today() -GROUP BY k -ORDER BY c DESC -LIMIT 10 - -┌─IPv6NumToString(ClientIP6)─┬──────c─┐ -│ ::ffff:94.26.111.111 │ 747440 │ -│ ::ffff:37.143.222.4 │ 529483 │ -│ ::ffff:5.166.111.99 │ 317707 │ -│ ::ffff:46.38.11.77 │ 263086 │ -│ ::ffff:79.105.111.111 │ 186611 │ -│ ::ffff:93.92.111.88 │ 176773 │ -│ ::ffff:84.53.111.33 │ 158709 │ -│ ::ffff:217.118.11.22 │ 154004 │ -│ ::ffff:217.118.11.33 │ 148449 │ -│ ::ffff:217.118.11.44 │ 148243 │ -└────────────────────────────┴────────┘ -%% - -===IPv6StringToNum(s)=== -The reverse function of IPv6NumToString. If the IPv6 address has an invalid format, it returns a string of null bytes. -HEX can be uppercase or lowercase. - - -==Functions for generating pseudo-random numbers== - -Non-cryptographic generators of pseudo-random numbers are used. - -All the functions accept zero arguments or one argument. -If an argument is passed, it can be any type, and its value is not used for anything. 
-The only purpose of this argument is to prevent common subexpression elimination, so that two different instances of the same function return different columns with different random numbers. - -===rand=== -Returns a pseudo-random UInt32 number, evenly distributed among all UInt32-type numbers. -Uses a linear congruential generator. - -===rand64=== -Returns a pseudo-random UInt64 number, evenly distributed among all UInt64-type numbers. -Uses a linear congruential generator. - - -==Hash functions== - -Hash functions can be used for deterministic pseudo-random shuffling of elements. - -===halfMD5=== -Calculates the MD5 from a string. Then it takes the first 8 bytes of the hash and interprets them as UInt64 in big endian. -Accepts a String-type argument. Returns UInt64. -This function works fairly slowly (5 million short strings per second per processor core). -If you don't need MD5 in particular, use the 'sipHash64' function instead. - -===MD5=== -Calculates the MD5 from a string and returns the resulting set of bytes as FixedString(16). -If you don't need MD5 in particular, but you need a decent cryptographic 128-bit hash, use the 'sipHash128' function instead. -If you need the same result as gives 'md5sum' utility, write %%lower(hex(MD5(s)))%%. - -===sipHash64=== -Calculates SipHash from a string. -Accepts a String-type argument. Returns UInt64. -SipHash is a cryptographic hash function. It works at least three times faster than MD5. For more information, see https://131002.net/siphash/ - -===sipHash128=== -Calculates SipHash from a string. -Accepts a String-type argument. Returns FixedString(16). -Differs from sipHash64 in that the final xor-folding state is only done up to 128 bits. - -===cityHash64=== -Calculates CityHash64 from a string or a similar hash function for any number of any type of arguments. -For String-type arguments, CityHash is used. This is a fast non-cryptographic hash function for strings with decent quality. -For other types of arguments, a decent implementation-specific fast non-cryptographic hash function is used. -If multiple arguments are passed, the function is calculated using the same rules and chain combinations using the CityHash combinator. -For example, you can compute the checksum of an entire table with accuracy up to the row order: %%SELECT sum(cityHash64(*)) FROM table%%. - -===intHash32=== -Calculates a 32-bit hash code from any type of integer. -This is a relatively fast non-cryptographic hash function of average quality for numbers. - -===intHash64=== -Calculates a 64-bit hash code from any type of integer. -It works faster than intHash32. Average quality. - -===SHA1=== -===SHA224=== -===SHA256=== -Calculates SHA-1, SHA-224, or SHA-256 from a string and returns the resulting set of bytes as FixedString(20), FixedString(28), or FixedString(32). -The function works fairly slowly (SHA-1 processes about 5 million short strings per second per processor core, while SHA-224 and SHA-256 process about 2.2 million). -We recommend using this function only in cases when you need a specific hash function and you can't select it. -Even in these cases, we recommend applying the function offline and pre-calculating values when inserting them into the table, instead of applying it in SELECTS. - -===URLHash(url[, N])=== -A fast, decent-quality non-cryptographic hash function for a string obtained from a URL using some type of normalization. -URLHash(s) - Calculates a hash from a string without one of the trailing symbols /,? or # at the end, if present. 
-URL Hash(s, N) - Calculates a hash from a string up to the N level in the URL hierarchy, without one of the trailing symbols /,? or # at the end, if present. -Levels are the same as in URLHierarchy. This function is specific to Yandex.Metrica. - -==Encoding functions== - -===hex=== -Accepts a string, number, date, or date with time. Returns a string containing the argument's hexadecimal representation. Uses uppercase letters A-F. Doesn't use %%0x%% prefixes or %%h%% suffixes. For strings, all bytes are simply encoded as two hexadecimal numbers. Numbers are converted to big endian ("human readable") format. For numbers, older zeros are trimmed, but only by entire bytes. For example, %%hex(1) = '01'%%. Dates are encoded as the number of days since the beginning of the Unix Epoch. Dates with times are encoded as the number of seconds since the beginning of the Unix Epoch. - -===unhex(str)=== -Accepts a string containing any number of hexadecimal digits, and returns a string containing the corresponding bytes. Supports both uppercase and lowercase letters A-F. The number of hexadecimal digits doesn't have to be even. If it is odd, the last digit is interpreted as the younger half of the 00-0F byte. If the argument string contains anything other than hexadecimal digits, some implementation-defined result is returned (an exception isn't thrown). -If you want to convert the result to a number, you can use the functions 'reverse' and 'reinterpretAsType'. - -===UUIDStringToNum(str)=== -Accepts a string containing the UUID in the text format (%%123e4567-e89b-12d3-a456-426655440000%%). Returns a binary representation of the UUID in FixedString(16). - -===UUIDNumToString(str)=== -Accepts a FixedString(16) value containing the UUID in the binary format. Returns a readable string containing the UUID in the text format. - -===bitmaskToList(num)=== -Accepts an integer. Returns a string containing the list of powers of two that total the source number when summed. They are comma-separated without spaces in text format, in ascending order. - -===bitmaskToArray(num)=== -Accepts an integer. Returns an array of UInt64 numbers containing the list of powers of two that total the source number when summed. Numbers in the array are in ascending order. - - -==Rounding functions== - -===floor(x[, N])=== -Returns a rounder number that is less than or equal to 'x'. -A round number is a multiple of 1 / 10N, or the nearest number of the appropriate data type if 1 / 10N isn't exact. -'N' is an integer constant, optional parameter. By default it is zero, which means to round to an integer. -'N' may be negative. -Examples: %%floor(123.45, 1) = 123.4%%, %%floor(123.45, -1) = 120%%. -'x' is any numeric type. The result is a number of the same type. -For integer arguments, it makes sense to round with a negative 'N' value (for non-negative 'N', the function doesn't do anything). -If rounding causes overflow (for example, %%floor(-128, -1)%%), an implementation-specific result is returned. - -===ceil(x[, N])=== -Returns the smallest round number that is greater than or equal to 'x'. In every other way, it is the same as the 'floor' function (see above). - -===round(x[, N])=== -Returns the round number nearest to 'num', which may be less than, greater than, or equal to 'x'. -If 'x' is exactly in the middle between the nearest round numbers, one of them is returned (implementation-specific). -The number '-0.' may or may not be considered round (implementation-specific). 
-In every other way, this function is the same as 'floor' and 'ceil' described above. - -===roundToExp2(num)=== -Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down to the nearest (whole non-negative) degree of two. - -===roundDuration(num)=== -Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down to numbers from the set: 1, 10, 30, 60, 120, 180, 240, 300, 600, 1200, 1800, 3600, 7200, 18000, 36000. This function is specific to Yandex.Metrica and used for implementing the report on session length. - -===roundAge(num)=== -Accepts a number. If the number is less than 18, it returns 0. Otherwise, it rounds the number down to numbers from the set: 18, 25, 35, 45. This function is specific to Yandex.Metrica and used for implementing the report on user age. - - - -==Conditional functions== - -===if(cond, then, else), cond ? then : else operator=== - -Returns 'then' if 'cond != 0', or 'else' if 'cond = 0'. -'cond' must be UInt 8, and 'then' and 'else' must be a type that has the smallest common type. - - -==Mathematical functions== - -All the functions return a Float64 number. The accuracy of the result is close to the maximum precision possible, but the result might not coincide with the machine representable number nearest to the corresponding real number. - -===e()=== -Accepts zero arguments and returns a Float64 number close to the e number. - -===pi()=== -Accepts zero arguments and returns a Float64 number close to π. - -===exp(x)=== -Accepts a numeric argument and returns a Float64 number close to the exponent of the argument. - -===log(x)=== -Accepts a numeric argument and returns a Float64 number close to the natural logarithm of the argument. - -===exp2(x)=== -Accepts a numeric argument and returns a Float64 number close to 2x. - -===log2(x)=== -Accepts a numeric argument and returns a Float64 number close to the binary logarithm of the argument. - -===exp10(x)=== -Accepts a numeric argument and returns a Float64 number close to 10x. - -===log10(x)=== -Accepts a numeric argument and returns a Float64 number close to the decimal logarithm of the argument. - -===sqrt(x)=== -Accepts a numeric argument and returns a Float64 number close to the square root of the argument. - -===cbrt(x)=== -Accepts a numeric argument and returns a Float64 number close to the cubic root of the argument. - -===erf(x)=== - -If 'x' is non-negative, then %%erf(x / σ√2)%% is the probability that a random variable having a normal distribution with standard deviation 'σ' takes the value that is separated from the expected value by more than 'x'. - -Example (three sigma rule): - -%% -SELECT erf(3 / sqrt(2)) - -┌─erf(divide(3, sqrt(2)))─┐ -│ 0.9973002039367398 │ -└─────────────────────────┘ -%% - -===erfc(x)=== -Accepts a numeric argument and returns a Float64 number close to 1 - erf(x), but without loss of precision for large 'x' values. - -===lgamma(x)=== -The logarithm of the gamma function. - -===tgamma(x)=== -Gamma function. - -===sin(x)=== -The sine. - -===cos(x)=== -The cosine. - -===tan(x)=== -The tangent. - -===asin(x)=== -The arc sine. - -===acos(x)=== -The arc cosine. - -===atan(x)=== -The arc tangent. - -===pow(x, y)=== -xy. - -==Functions for working with Yandex.Metrica dictionaries== - -In order for the functions below to work, the server config must specify the paths and addresses for getting all the Yandex.Metrica dictionaries. The dictionaries are loaded at the first call of any of these functions. 
If the reference lists can't be loaded, an exception is thrown. - -For information about creating reference lists, see the section "Dictionaries". - -===Multiple geobases=== - -ClickHouse supports working with multiple alternative geobases (regional hierarchies) simultaneously, in order to support various perspectives on which countries certain regions belong to. - -The 'clickhouse-server' config specifies the file with the regional hierarchy: -<path_to_regions_hierarchy_file>/opt/geo/regions_hierarchy.txt</path_to_regions_hierarchy_file> - -Besides this file, it also searches for files nearby that have the _ symbol and any suffix appended to the name (before the file extension). -For example, it will also find the file %%/opt/geo/regions_hierarchy_ua.txt%%, if present. - -%%ua%% is called the dictionary key. For a dictionary without a suffix, the key is an empty string. - -All the dictionaries are re-loaded in runtime (once every certain number of seconds, as defined in the builtin_dictionaries_reload_interval config parameter, or once an hour by default). However, the list of available dictionaries is defined one time, when the server starts. - -All functions for working with regions have an optional argument at the end - the dictionary key. It is indicated as the geobase. -Example: -%% -regionToCountry(RegionID) - Uses the default dictionary: /opt/geo/regions_hierarchy.txt -regionToCountry(RegionID, '') - Uses the default dictionary: /opt/geo/regions_hierarchy.txt -regionToCountry(RegionID, 'ua') - Uses the dictionary for the 'ua' key: /opt/geo/regions_hierarchy_ua.txt -%% - -===regionToCity(id[, geobase])=== - -Accepts a UInt32 number - the region ID from the Yandex geobase. If this region is a city or part of a city, it returns the region ID for the appropriate city. Otherwise, returns 0. - -===regionToArea(id[, geobase])=== - -Converts a region to an area (type 5 in the geobase). In every other way, this function is the same as 'regionToCity'. - -%% -SELECT DISTINCT regionToName(regionToArea(toUInt32(number), 'ua'), 'en') -FROM system.numbers -LIMIT 15 - -┌─regionToName(regionToArea(toUInt32(number), \'ua\'), \'en\')─┐ -│ │ -│ Moscow and Moscow region │ -│ Saint-Petersburg and Leningradskaya oblast │ -│ Belgorod District │ -│ Ivanovo district │ -│ Kaluga District │ -│ Kostroma District │ -│ Kursk District │ -│ Lipetsk District │ -│ Orel District │ -│ Ryazhan District │ -│ Smolensk District │ -│ Tambov District │ -│ Tver District │ -│ Tula District │ -└──────────────────────────────────────────────────────────────┘ -%% - -===regionToDistrict(id[, geobase])=== - -Converts a region to a federal district (type 4 in the geobase). In every other way, this function is the same as 'regionToCity'. - -%% -SELECT DISTINCT regionToName(regionToDistrict(toUInt32(number), 'ua'), 'en') -FROM system.numbers -LIMIT 15 - -┌─regionToName(regionToDistrict(toUInt32(number), \'ua\'), \'en\')─┐ -│ │ -│ Central │ -│ Northwest │ -│ South │ -│ North Kavkaz │ -│ Volga Region │ -│ Ural │ -│ Siberian │ -│ Far East │ -│ Scotland │ -│ Faroe Islands │ -│ Flemish Region │ -│ Brussels-Capital Region │ -│ Wallonia │ -│ Federation of Bosnia and Herzegovina │ -└──────────────────────────────────────────────────────────────────┘ -%% - -===regionToCountry(id[, geobase])=== - -Converts a region to a country. In every other way, this function is the same as 'regionToCity'. -Example: %%regionToCountry(toUInt32(213)) = 225%% converts Moscow (213) to Russia (225). 
- -===regionToContinent(id[, geobase])=== - -Converts a region to a continent. In every other way, this function is the same as 'regionToCity'. -Example: %%regionToContinent(toUInt32(213)) = 10001%% converts Moscow (213) to Eurasia (10001). - -===regionToPopulation(id[, geobase])=== - -Gets the population for a region. -The population can be recorded in files with the geobase. See the section "External dictionaries". -If the population is not recorded for the region, it returns 0. -In the Yandex geobase, the population might be recorded for child regions, but not for parent regions. - -===regionIn(lhs, rhs[, geobase])=== - -Checks whether a 'lhs' region belongs to a 'rhs' region. Returns a UInt8 number equal to 1 if it belongs, or 0 if it doesn't belong. -The relationship is reflexive - any region also belongs to itself. - -===regionHierarchy(id[, geobase])=== - -Accepts a UInt32 number - the region ID from the Yandex geobase. Returns an array of region IDs consisting of the passed region and all parents along the chain. -Example: %%regionHierarchy(toUInt32(213)) = [213,1,3,225,10001,10000]%%. - -===regionToName(id[, lang])=== - -Accepts a UInt32 number - the region ID from the Yandex geobase. A string with the name of the language can be passed as a second argument. Supported languages are: ru, en, ua, uk, by, kz, tr. If the second argument is omitted, the language 'ru' is used. If the language is not supported, an exception is thrown. Returns a string - the name of the region in the corresponding language. If the region with the specified ID doesn't exist, an empty string is returned. - -'ua' and 'uk' mean the same thing - Ukrainian. - - -==Functions for working with external dictionaries== - -For more information, see the section "External dictionaries". - -===dictGetUInt8, dictGetUInt16, dictGetUInt32, dictGetUInt64=== -===dictGetInt8, dictGetInt16, dictGetInt32, dictGetInt64=== -===dictGetFloat32, dictGetFloat64=== -===dictGetDate, dictGetDateTime=== -===dictGetString=== - -dictGetT('dict_name', 'attr_name', id) -- Gets the value of the 'attr_name' attribute from the 'dict_name' dictionary by the 'id' key. -'dict_name' and 'attr_name' are constant strings. -'id' must be UInt64. -If the 'id' key is not in the dictionary, it returns the default value set in the dictionary definition. - -===dictIsIn=== -%%dictIsIn('dict_name', child_id, ancestor_id)%% -- For the 'dict_name' hierarchical dictionary, finds out whether the 'child_id' key is located inside 'ancestor_id' (or matches 'ancestor_id'). Returns UInt8. - -===dictGetHierarchy=== -%%dictGetHierarchy('dict_name', id)%% -- For the 'dict_name' hierarchical dictionary, returns an array of dictionary keys starting from 'id' and continuing along the chain of parent elements. Returns Array(UInt64). - - -==Functions for working with JSON.== - -In Yandex.Metrica, JSON is passed by users as session parameters. There are several functions for working with this JSON. (Although in most of the cases, the JSONs are additionally pre-processed, and the resulting values are put in separate columns in their processed format.) All these functions are based on strong assumptions about what the JSON can be, but they try not to do anything. - -The following assumptions are made: - -1. The field name (function argument) must be a constant. -2. The field name is somehow canonically encoded in JSON. For example, -%%visitParamHas('{"abc":"def"}', 'abc') = 1%% -, but -%%visitParamHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0%% -3. 
Fields are searched for on any nesting level, indiscriminately. If there are multiple matching fields, the first occurrence is used. -4. JSON doesn't have space characters outside of string literals. - - -===visitParamHas(params, name)=== - -Checks whether there is a field with the 'name' name. - -===visitParamExtractUInt(params, name)=== - -Parses UInt64 from the value of the field named 'name'. If this is a string field, it tries to parse a number from the beginning of the string. If the field doesn't exist, or it exists but doesn't contain a number, it returns 0. - -===visitParamExtractInt(params, name)=== - -The same as for Int64. - -===visitParamExtractFloat(params, name)=== - -The same as for Float64. - -===visitParamExtractBool(params, name)=== - -Parses a true/false value. The result is UInt8. - -===visitParamExtractRaw(params, name)=== - -Returns the value of a field, including separators. Examples: -%%visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"'%% -%%visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}'%% - -===visitParamExtractString(params, name)=== - -Parses the string in double quotes. The value is unescaped. If unescaping failed, it returns an empty string. Examples: -%%visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0'%% -%%visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺'%% -%%visitParamExtractString('{"abc":"\\u263"}', 'abc') = ''%% -%%visitParamExtractString('{"abc":"hello}', 'abc') = ''%% -Currently, there is no support for code points not from the basic multilingual plane written in the format \uXXXX\uYYYY (they are converted to CESU-8 instead of UTF-8). - - -==Functions for implementing the IN operator== - -===in, notIn, globalIn, globalNotIn=== - -See the section "IN operators". - - -===tuple(x, y, ...), operator (x, y, ...)=== -A function that allows grouping multiple columns. -For columns with the types T1, T2, ..., it returns a Tuple(T1, T2, ...) type tuple containing these columns. There is no cost to execute the function. -Tuples are normally used as intermediate values for an argument of IN operators, or for creating a list of formal parameters of lambda functions. Tuples can't be written to a table. - -===tupleElement(tuple, n), operator x.N=== -A function that allows getting columns from a tuple. -'N' is the column index, starting from 1. 'N' must be a constant. 'N' must be a strict postive integer no greater than the size of the tuple. -There is no cost to execute the function. - - -==Other functions== - -===hostName()=== -Returns a string with the name of the host that this function was performed on. For distributed processing, this is the name of the remote server host, if the function is performed on a remote server. - -===visibleWidth(x)=== -Calculates the approximate width when outputting values to the console in text format (tab-separated). This function is used by the system for implementing Pretty formats. - -===toTypeName(x)=== -Gets the type name. Returns a string containing the type name of the passed argument. - -===blockSize()=== -Gets the size of the block. -In ClickHouse, queries are always run on blocks (sets of column parts). This function allows getting the size of the block that you called it for. - -===materialize(x)=== -Turns a constant into a full column containing just one value. -In ClickHouse, full columns and constants are represented differently in memory. 
Functions work differently for constant arguments and normal arguments (different code is executed), although the result is almost always the same. This function is for debugging this behavior. - -===ignore(...)=== -A function that accepts any arguments and always returns 0. -However, the argument is still calculated. This can be used for benchmarks. - -===sleep(seconds)=== -Sleeps 'seconds' seconds on each data block. You can specify an integer or a floating-point number. - -===currentDatabase()=== -Returns the name of the current database. -You can use this function in table engine parameters in a CREATE TABLE query where you need to specify the database. - -===isFinite(x)=== -Accepts Float32 and Float64 and returns UInt8 equal to 1 if the argument is not infinite and not a NaN, otherwise 0. - -===isInfinite(x)=== -Accepts Float32 and Float64 and returns UInt8 equal to 1 if the argument is infinite, otherwise 0. -Note that 0 is returned for a NaN. - -===isNaN(x)=== -Accepts Float32 and Float64 and returns UInt8 equal to 1 if the argument is a NaN, otherwise 0. - -===hasColumnInTable('database', 'table', 'column')=== -Accepts constant String columns - database name, table name and column name. Returns constant UInt8 value, equal to 1 if column exists, -otherwise 0. -If table doesn't exist than exception is thrown. -For elements of nested data structure function checks existence of column. For nested data structure 0 is returned. - -===bar=== -Allows building a unicode-art diagram. - -bar(x, min, max, width) - Draws a band with a width proportional to (x - min) and equal to 'width' characters when x == max. -min, max - Integer constants. The value must fit in Int64. -width - Constant, positive number, may be a fraction. - -The band is drawn with accuracy to one eighth of a symbol. Example: - -%% -SELECT - toHour(EventTime) AS h, - count() AS c, - bar(c, 0, 600000, 20) AS bar -FROM test.hits -GROUP BY h -ORDER BY h ASC - -┌──h─┬──────c─┬─bar────────────────┐ -│ 0 │ 292907 │ █████████▋ │ -│ 1 │ 180563 │ ██████ │ -│ 2 │ 114861 │ ███▋ │ -│ 3 │ 85069 │ ██▋ │ -│ 4 │ 68543 │ ██▎ │ -│ 5 │ 78116 │ ██▌ │ -│ 6 │ 113474 │ ███▋ │ -│ 7 │ 170678 │ █████▋ │ -│ 8 │ 278380 │ █████████▎ │ -│ 9 │ 391053 │ █████████████ │ -│ 10 │ 457681 │ ███████████████▎ │ -│ 11 │ 493667 │ ████████████████▍ │ -│ 12 │ 509641 │ ████████████████▊ │ -│ 13 │ 522947 │ █████████████████▍ │ -│ 14 │ 539954 │ █████████████████▊ │ -│ 15 │ 528460 │ █████████████████▌ │ -│ 16 │ 539201 │ █████████████████▊ │ -│ 17 │ 523539 │ █████████████████▍ │ -│ 18 │ 506467 │ ████████████████▊ │ -│ 19 │ 520915 │ █████████████████▎ │ -│ 20 │ 521665 │ █████████████████▍ │ -│ 21 │ 542078 │ ██████████████████ │ -│ 22 │ 493642 │ ████████████████▍ │ -│ 23 │ 400397 │ █████████████▎ │ -└────┴────────┴────────────────────┘ -%% - -===transform=== -Transforms a value according to the explicitly defined mapping of some elements to other ones. -There are two variations of this function: - -1. %%transform(x, array_from, array_to, default)%% - -%%x%% - What to transform. -%%array_from%% - Constant array of values for converting. -%%array_to%% - Constant array of values to convert the values in 'from' to. -%%default%% - Constant. Which value to use if 'x' is not equal to one of the values in 'from'. - -'array_from' and 'array_to' are arrays of the same size. - -Types: -transform(T, Array(T), Array(U), U) -> U - -'T' and 'U' can be numeric, string, or Date or DateTime types. 
-Where the same letter is indicated (T or U), for numeric types these might not be matching types, but types that have a common type. -For example, the first argument can have the Int64 type, while the second has the Array(Uint16) type. - -If the 'x' value is equal to one of the elements in the 'array_from' array, it returns the existing element (that is numbered the same) from the 'array_to' array. Otherwise, it returns 'default'. If there are multiple matching elements in 'array_from', it returns one of the matches. - -Example: - -%% - -SELECT - transform(SearchEngineID, [2, 3], ['Yandex', 'Google'], 'Others') AS title, - count() AS c -FROM test.hits -WHERE SearchEngineID != 0 -GROUP BY title -ORDER BY c DESC - -┌─title──┬──────c─┐ -│ Yandex │ 498635 │ -│ Google │ 229872 │ -│ Others │ 104472 │ -└────────┴────────┘ -%% - -2. %%transform(x, array_from, array_to)%% - -Differs from the first variation in that the 'default' argument is omitted. -If the 'x' value is equal to one of the elements in the 'array_from' array, it returns the matching element (that is numbered the same) from the 'array_to' array. Otherwise, it returns 'x'. - -Types: -transform(T, Array(T), Array(T)) -> T - -Example: - -%% - -SELECT - transform(domain(Referer), ['yandex.ru', 'google.ru', 'vk.com'], ['www.yandex', 'ввв.яндекс.рф', 'example.com']) AS s, - count() AS c -FROM test.hits -GROUP BY domain(Referer) -ORDER BY count() DESC -LIMIT 10 - -┌─s──────────────┬───────c─┐ -│ │ 2906259 │ -│ www.yandex │ 867767 │ -│ ███████.ru │ 313599 │ -│ mail.yandex.ru │ 107147 │ -│ ввв.яндекс.рф │ 105668 │ -│ ██████.ru │ 100355 │ -│ █████████.ru │ 65040 │ -│ news.yandex.ru │ 64515 │ -│ ██████.net │ 59141 │ -│ example.com │ 57316 │ -└────────────────┴─────────┘ -%% - -===formatReadableSize(x)=== - -Gets a size (number of bytes). Returns a string that contains rounded size with the suffix (KiB, MiB etc.). - -Example: - -%% -SELECT - arrayJoin([1, 1024, 1024*1024, 192851925]) AS filesize_bytes, - formatReadableSize(filesize_bytes) AS filesize - -┌─filesize_bytes─┬─filesize───┐ -│ 1 │ 1.00 B │ -│ 1024 │ 1.00 KiB │ -│ 1048576 │ 1.00 MiB │ -│ 192851925 │ 183.92 MiB │ -└────────────────┴────────────┘ -%% - -===least(a, b)=== - -Returns the least element of a and b. - -===greatest(a, b)=== - -Returns the greatest element of a and b. - -===uptime()=== - -Returns server's uptime in seconds. - -===version()=== - -Returns server's version as a string. - -===rowNumberInAllBlocks()=== - -Returns an incremental row number within all blocks that were processed by this function. - -===runningDifference(x)=== - -Calculates the difference between consecutive values in the data block. -Result of the function depends on the order of the data in the blocks. - -It works only inside of the each processed block of data. Data splitting in the blocks is not explicitly controlled by the user. -If you specify ORDER BY in subquery and call runningDifference outside of it, you could get an expected result. - -Example: -%% -SELECT - EventID, - EventTime, - runningDifference(EventTime) AS delta -FROM -( - SELECT - EventID, - EventTime - FROM events - WHERE EventDate = '2016-11-24' - ORDER BY EventTime ASC - LIMIT 5 -) - -┌─EventID─┬───────────EventTime─┬─delta─┐ -│ 1106 │ 2016-11-24 00:00:04 │ 0 │ -│ 1107 │ 2016-11-24 00:00:05 │ 1 │ -│ 1108 │ 2016-11-24 00:00:05 │ 0 │ -│ 1109 │ 2016-11-24 00:00:09 │ 4 │ -│ 1110 │ 2016-11-24 00:00:10 │ 1 │ -└─────────┴─────────────────────┴───────┘ -%% - -==arrayJoin function== - -This is a very unusual function. 
- -Normal functions don't change a set of rows, but just change the values in each row (map). Aggregate functions compress a set of rows (fold or reduce). -The 'arrayJoin' function takes each row and generates a set of rows (unfold). - -This function takes an array as an argument, and propagates the source row to multiple rows for the number of elements in the array. -All the values in columns are simply copied, except the values in the column where this function is applied - it is replaced with the corresponding array value. - -A query can use multiple 'arrayJoin' functions. In this case, the transformation is performed multiple times. - -Note the ARRAY JOIN syntax in the SELECT query, which provides broader possibilities. - -Example: - -%% -:) SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src - -SELECT - arrayJoin([1, 2, 3] AS src) AS dst, - 'Hello', - src - -┌─dst─┬─\'Hello\'─┬─src─────┐ -│ 1 │ Hello │ [1,2,3] │ -│ 2 │ Hello │ [1,2,3] │ -│ 3 │ Hello │ [1,2,3] │ -└─────┴───────────┴─────────┘ -%% - -
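-For example (a sketch), two 'arrayJoin' functions applied to different arrays should produce a row for each combination of elements, four rows in this case:
-
-%%
-SELECT
-    arrayJoin([1, 2]) AS x,
-    arrayJoin(['a', 'b']) AS y
-%%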
-
-=Aggregate functions=
-
    - -==count()== - -Counts the number of rows. Accepts zero arguments and returns UInt64. -The syntax COUNT(DISTINCT x) is not supported. The separate 'uniq' aggregate function exists for this purpose. - -A 'SELECT count() FROM table' query is not optimized, because the number of entries in the table is not stored separately. It will select some small column from the table and count the number of values in it. - - -==any(x)== - -Selects the first encountered value. -The query can be executed in any order and even in a different order each time, so the result of this function is indeterminate. -To get a determinate result, you can use the 'min' or 'max' function instead of 'any'. - -In some cases, you can rely on the order of execution. This applies to cases when SELECT comes from a subquery that uses ORDER BY. - -When a SELECT query has the GROUP BY clause or at least one aggregate function, ClickHouse (in contrast to MySQL) requires that all expressions in the SELECT, HAVING, and ORDER BY clauses be calculated from keys or from aggregate functions. That is, each column selected from the table must be used either in keys, or inside aggregate functions. To get behavior like in MySQL, you can put the other columns in the 'any' aggregate function. - - -==anyLast(x)== - -Selects the last value encountered. -The result is just as indeterminate as for the 'any' function. - - -==min(x)== - -Calculates the minimum. - - -==max(x)== - -Calculates the maximum. - - -==argMin(arg, val)== - -Calculates the 'arg' value for a minimal 'val' value. If there are several different values of 'arg' for minimal values of 'val', the first of these values encountered is output. - - -==argMax(arg, val)== - -Calculates the 'arg' value for a maximum 'val' value. If there are several different values of 'arg' for maximum values of 'val', the first of these values encountered is output. - - -==sum(x)== - -Calculates the sum. -Only works for numbers. - - -==avg(x)== - -Calculates the average. -Only works for numbers. -The result is always Float64. - - -==uniq(x)== - -Calculates the approximate number of different values of the argument. Works for numbers, strings, dates, and dates with times. - -Uses an adaptive sampling algorithm: for the calculation state, it uses a sample of element hash values with a size up to 65535. -Compared with the widely known HyperLogLog algorithm, this algorithm is less effective in terms of accuracy and memory consumption (even up to proportionality), but it is adaptive. This means that with fairly high accuracy, it consumes less memory during simultaneous computation of cardinality for a large number of data sets whose cardinality has power law distribution (i.e. in cases when most of the data sets are small). This algorithm is also very accurate for data sets with small cardinality (up to 65536) and very efficient on CPU (when computing not too many of these functions, using 'uniq' is almost as fast as using other aggregate functions). - -There is no compensation for the bias of an estimate, so for large data sets the results are systematically deflated. This function is normally used for computing the number of unique visitors in Yandex.Metrica, so this bias does not play a role. - -The result is determinate (it doesn't depend on the order of query execution). - - -==uniqHLL12(x)== - -Uses the HyperLogLog algorithm to approximate the number of different values of the argument. It uses 212 5-bit cells. The size of the state is slightly more than 2.5 KB. 
- -The result is determinate (it doesn't depend on the order of query execution). - -In most cases, use the 'uniq' function. You should only use this function if you understand its advantages well. - - -==uniqExact(x)== - -Calculates the number of different values of the argument, exactly. -There is no reason to fear approximations, so it's better to use the 'uniq' function. -You should use the 'uniqExact' function if you definitely need an exact result. - -The 'uniqExact' function uses more memory than the 'uniq' function, because the size of the state has unbounded growth as the number of different values increases. - - -==groupArray(x)== - -Creates an array of argument values. -Values can be added to the array in any (indeterminate) order. - -In some cases, you can rely on the order of execution. This applies to cases when SELECT comes from a subquery that uses ORDER BY. - - -==groupUniqArray(x)== - -Creates an array from different argument values. Memory consumption is the same as for the 'uniqExact' function. - - -==median(x)== - -Approximates the median. Also see the similar 'quantile' function. -Works for numbers, dates, and dates with times. -For numbers it returns Float64, for dates - a date, and for dates with times - a date with time. - -Uses reservoir sampling with a reservoir size up to 8192. -If necessary, the result is output with linear approximation from the two neighboring values. -This algorithm proved to be more practical than another well-known algorithm - QDigest. - -The result depends on the order of running the query, and is nondeterministic. - - -==medianTiming(x)== - -Calculates the median with fixed accuracy. -Works for numbers. Intended for calculating medians of page loading time in milliseconds. -Also see the similar 'quantileTiming' function. - -If the value is greater than 30,000 (a page loading time of more than 30 seconds), the result is equated to 30,000. -If the value is less than 1024, the calculation is exact. -If the value is from 1025 to 29,000, the calculation is rounded to a multiple of 16. - -In addition, if the total number of values passed to the aggregate function was less than 32, the calculation is exact. - -When passing negative values to the function, the behavior is undefined. - -The returned value has the Float32 type. If no values were passed to the function (when using 'medianTimingIf' or 'quantileTimingIf'), 'nan' is returned. The purpose of this is to differentiate these instances from zeros. See the note on sorting NaNs in "ORDER BY clause". - -The result is determinate (it doesn't depend on the order of query execution). - -For its purpose (calculating quantiles of page loading times), using this function is more effective and the result is more accurate than for the 'median/quantile' function. - - -==medianDeterministic(x, determinator)== - -This function works similarly to the 'median' function - it approximates the median. However, in contrast to 'median', the result is deterministic and does not depend on the order of query execution. - -To achieve this, the function takes a second argument - the "determinator". This is a number whose hash is used instead of a random number generator in the reservoir sampling algorithm. For the function to work correctly, the same determinator value should not occur too often. For the determinator, you can use an event ID, user ID, and so on. - -Don't use this function for calculating timings. 
The 'medianTiming', 'quantileTiming', and 'quantilesTiming' functions are better suited to this purpose. - - -==medianTimingWeighted(x, weight)== - -Differs from the 'medianTiming' function in that it has a second argument - "weights". Weight is a non-negative integer. -The result is calculated as if the 'x' value were passed 'weight' number of times to the 'medianTiming' function. - - -==varSamp(x)== - -Calculates the amount Σ((x - x̅)2) / (n - 1), where 'n' is the sample size and 'x̅' is the average value of 'x'. - -It represents an unbiased estimate of the variance of a random variable, if the values passed to the function are a sample of this random amount. - -Returns Float64. If n <= 1, it returns +∞. - - -==varPop(x)== - -Calculates the amount Σ((x - x̅)2) / n, where 'n' is the sample size and 'x̅' is the average value of 'x'. - -In other words, dispersion for a set of values. Returns Float64. - - -==stddevSamp(x)== - -The result is equal to the square root of 'varSamp(x)'. - - -==stddevPop(x)== - -The result is equal to the square root of 'varPop(x)'. - - -==covarSamp(x, y)== - -Calculates the value of %%Σ((x - x̅)(y - y̅)) / (n - 1)%%. - -Returns Float64. If n <= 1, it returns +∞. - - -==covarPop(x, y)== - -Calculates the value of %%Σ((x - x̅)(y - y̅)) / n%%. - - -==corr(x, y)== - -Calculates the Pearson correlation coefficient: Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)2) * Σ((y - y̅)2)). - - -==Parametric aggregate functions== - -Some aggregate functions can accept not only argument columns (used for compression), but a set of parameters - constants for initialization. The syntax is two pairs of brackets instead of one. The first is for parameters, and the second is for arguments. - - -==quantile(level)(x)== - -Approximates the 'level' quantile. 'level' is a constant, a floating-point number from 0 to 1. We recommend using a 'level' value in the range of 0.01 .. 0.99. -Don't use a 'level' value equal to 0 or 1 - use the 'min' and 'max' functions for these cases. - -The algorithm is the same as for the 'median' function. Actually, 'quantile' and 'median' are internally the same function. You can use the 'quantile' function without parameters - in this case, it calculates the median, and you can use the 'median' function with parameters - in this case, it calculates the quantile of the set level. - -When using multiple 'quantile' and 'median' functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the 'quantiles' function. - - -==quantiles(level1, level2, ...)(x)== - -Approximates quantiles of all specified levels. -The result is an array containing the corresponding number of values. - - -==quantileTiming(level)(x)== - -Calculates the quantile of 'level' using the same algorithm as the 'medianTiming' function. - - -==quantilesTiming(level1, level2, ...)(x)== - -Calculates the quantiles of all specified levels using the same algorithm as the 'medianTiming' function. - - -==quantileTimingWeighted(level)(x, weight)== - -Calculates the quantile of 'level' using the same algorithm as the 'medianTimingWeighted' function. - - -==quantilesTimingWeighted(level1, level2, ...)(x, weight)== - -Calculates the quantiles of all specified levels using the same algorithm as the 'medianTimingWeighted' function. - - -==quantileDeterministic(level)(x, determinator)== - -Calculates the quantile of 'level' using the same algorithm as the 'medianDeterministic' function. 
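-
-A usage sketch (the SendTiming column is illustrative; UserID acts as the determinator, so repeated runs over the same data should return the same value):
-
-%%
-SELECT quantileDeterministic(0.9)(SendTiming, UserID)
-FROM test.hits
-%%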
- - -==quantilesDeterministic(level1, level2, ...)(x, determinator)== - -Calculates the quantiles of all specified levels using the same algorithm as the 'medianDeterministic' function. - - -==sequenceMatch(pattern)(time, cond1, cond2, ...)== - -Pattern matching for event chains. - -'pattern' is a string containing a pattern to match. The pattern is similar to a regular expression. -'time' is the event time of the DateTime type. -'cond1, cond2 ...' are from one to 32 arguments of the UInt8 type that indicate whether an event condition was met. - -The function collects a sequence of events in RAM. Then it checks whether this sequence matches the pattern. -It returns UInt8 - 0 if the pattern isn't matched, or 1 if it matches. - -Example: %%sequenceMatch('(?1).*(?2)')(EventTime, URL LIKE '%company%', URL LIKE '%cart%')%% -- whether there was a chain of events in which pages with the address in %%company%% were visited earlier than pages with the address in %%cart%%. - -This is a degenerate example. You could write it using other aggregate functions: -%%minIf(EventTime, URL LIKE '%company%') < maxIf(EventTime, URL LIKE '%cart%')%%. -However, there is no such solution for more complex situations. - -Pattern syntax: -%%(?1)%% - Reference to a condition (any number in place of 1). -%%.*%% - Any number of events. -(?t>=1800) - Time condition. -Any quantity of any type of events is allowed over the specified time. -The operators <, >, <= may be used instead of >=. -Any number may be specified in place of 1800. - -Events that occur during the same second may be put in the chain in any order. This may affect the result of the function. - -==uniqUpTo(N)(x)== - -Calculates the number of different argument values, if it is less than or equal to N. -If the number of different argument values is greater than N, it returns N + 1. - -Recommended for use with small Ns, up to 10. The maximum N value is 100. - -For the state of an aggregate function, it uses the amount of memory equal to 1 + N * the size of one value of bytes. -For strings, it stores a non-cryptographic hash of 8 bytes. That is, the calculation is approximated for strings. - -It works as fast as possible, except for cases when a large N value is used and the number of unique values is slightly less than N. - -Usage example: -Problem: Generate a report that shows only keywords that produced at least 5 unique users. -Solution: Write in the query GROUP BY SearchPhrase HAVING uniqUpTo(4)(UserID) >= 5 - -==topK(N)(x)== - -Returns the K most frequent argument values as an array sorted by their relative frequency. - -Recommended for use with small Ns, up to 10. The maximum N value is 65536. - -For the state of an aggregate function, it uses approximately the amount of memory equal to K * (the size of the key + 16) for counters, and 48 * N bytes for alpha value map. - -Usage example: -Problem: Generate a report that shows top 5 frequent queries. -Solution: Write in the query SELECT topK(5)(SearchPhrase) - -==Aggregate function combinators== - -The name of an aggregate function can have a suffix appended to it. This changes the way the aggregate function works. -There are %%If%% and %%Array%% combinators. See the sections below. - - -==-If combinator. Conditional aggregate functions== - -The suffix -%%If%% can be appended to the name of any aggregate function. In this case, the aggregate function accepts an extra argument - a condition (Uint8 type). The aggregate function processes only the rows that trigger the condition. 
If the condition was not triggered even once, it returns a default value (usually zeros or empty strings). - -Examples: %%sumIf(column, cond)%%, %%countIf(cond)%%, %%avgIf(x, cond)%%, %%quantilesTimingIf(level1, level2)(x, cond)%%, %%argMinIf(arg, val, cond)%% and so on. - -You can use aggregate functions to calculate aggregates for multiple conditions at once, without using subqueries and JOINs. -For example, in Yandex.Metrica, we use conditional aggregate functions for implementing segment comparison functionality. - - -==-Array combinator. Aggregate functions for array arguments== - -The -%%Array%% suffix can be appended to any aggregate function. In this case, the aggregate function takes arguments of the 'Array(T)' type (arrays) instead of 'T' type arguments. If the aggregate function accepts multiple arguments, this must be arrays of equal lengths. When processing arrays, the aggregate function works like the original aggregate function across all array elements. - -Example 1: %%sumArray(arr)%% - Totals all the elements of all 'arr' arrays. In this example, it could have been written more simply: %%sum(arraySum(arr))%%. -Example 2: %%uniqArray(arr)%% - Count the number of unique elements in all 'arr' arrays. This could be done an easier way: %%uniq(arrayJoin(arr))%%, but it's not always possible to add 'arrayJoin' to a query. - -The -%%If%% and -%%Array%% combinators can be used together. However, 'Array' must come first, then 'If'. Examples: %%uniqArrayIf(arr, cond)%%, %%quantilesTimingArrayIf(level1, level2)(arr, cond)%%. Due to this order, the 'cond' argument can't be an array. - - -==-State combinator== - -If this combinator is used, the aggregate function returns a non-completed/non-finished value (for example, in the case of the `uniq` function, the number of unique values), and the intermediate aggregation state (for example, in the case of the `uniq` function, a hash table for calculating the number of unique values), which has type of %%AggregateFunction(...)%% and can be used for further processing or can be saved to a table for subsequent pre-aggregation - see the sections "AggregatingMergeTree" and "functions for working with intermediate aggregation states". - - -==-Merge combinator== - -In the case of using this combinator, the aggregate function will take as an argument the intermediate state of an aggregation, pre-aggregate (combine together) these states, and return the finished/complete value. - - -==-MergeState combinator== - -Merges the intermediate aggregation states, similar to the -Merge combinator, but returns a non-complete value, but an intermediate aggregation state, similar to the -State combinator. - - -
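-To illustrate the -State and -Merge combinators together (a sketch over the test.hits example table used above), the inner query produces intermediate states and the outer query combines them; the result should equal a plain %%uniq(UserID)%% over the same rows:
-
-%%
-SELECT uniqMerge(state) AS u
-FROM
-(
-    SELECT uniqState(UserID) AS state
-    FROM test.hits
-    GROUP BY SearchPhrase
-)
-%%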
-
-=Dictionaries=
-
    - -A dictionary is a mapping (key -> attributes) that can be used in a query as functions. You can think of this as a more convenient and efficient type of JOIN with dimension tables. - -There are built-in (internal) and add-on (external) dictionaries. - -==Internal dictionaries== - -ClickHouse contains a built-in feature for working with a geobase. - -This allows you to: -- Use a region's ID to get its name in the desired language. -- Use a region's ID to get the ID of a city, area, federal district, country, or continent. -- Check whether a region is part of another region. -- Get a chain of parent regions. - -All the functions support "translocality," the ability to simultaneously use different perspectives on region ownership. For more information, see the section "Functions for working with Yandex.Metrica dictionaries". - -The internal dictionaries are disabled in the default package. -To enable them, uncomment the parameters 'path_to_regions_hierarchy_file' and 'path_to_regions_names_files' in the server config file. - -The geobase is loaded from text files. -If you are Yandex employee, to create them, use the following instructions: -https://github.yandex-team.ru/raw/Metrika/ClickHouse_private/master/doc/create_embedded_geobase_dictionaries.txt - -Put the regions_hierarchy*.txt files in the path_to_regions_hierarchy_file directory. This configuration parameter must contain the path to the regions_hierarchy.txt file (the default regional hierarchy), and the other files (regions_hierarchy_ua.txt) must be located in the same directory. - -Put the regions_names_*.txt files in the path_to_regions_names_files directory. - -You can also create these files yourself. The file format is as follows: - -regions_hierarchy*.txt: TabSeparated (no header), columns: -- Region ID (UInt32) -- Parent region ID (UInt32) -- Region type (UInt8): 1 - continent, 3 - country, 4 - federal district, 5 - region, 6 - city; other types don't have values. -- Population (UInt32) - Optional column. - -regions_names_*.txt: TabSeparated (no header), columns: -- Region ID (UInt32) -- Region name (String) - Can't contain tabs or line breaks, even escaped ones. - -A flat array is used for storing in RAM. For this reason, IDs shouldn't be more than a million. - -Dictionaries can be updated without the server restart. However, the set of available dictionaries is not updated. For updates, the file modification times are checked. If a file has changed, the dictionary is updated. -The interval to check for changes is configured in the 'builtin_dictionaries_reload_interval' parameter. -Dictionary updates (other than loading at first use) do not block queries. During updates, queries use the old versions of dictionaries. If an error occurs during an update, the error is written to the server log, while queries continue using the old version of dictionaries. - -We recommend periodically updating the dictionaries with the geobase. During an update, generate new files and write them to a separate location. When everything is ready, rename them to the files used by the server. - -There are also functions for working with OS identifiers and Yandex.Metrica search engines, but they shouldn't be used. - - -==External dictionaries== - -It is possible to add your own dictionaries from various data sources. The data source for a dictionary can be a file in the local file system, the ClickHouse server, or a MySQL server. 
-A dictionary can be stored completely in RAM and updated regularly, or it can be partially cached in RAM and dynamically load missing values. - -The configuration of external dictionaries is in a separate file or files specified in the 'dictionaries_config' configuration parameter. -This parameter contains the absolute or relative path to the file with the dictionary configuration. A relative path is relative to the directory with the server config file. The path can contain wildcards * and ?, in which case all matching files are found. Example: dictionaries/*.xml. - -The dictionary configuration, as well as the set of files with the configuration, can be updated without restarting the server. The server checks updates every 5 seconds. This means that dictionaries can be enabled dynamically. - -Dictionaries can be created when starting the server, or at first use. This is defined by the 'dictionaries_lazy_load' parameter in the main server config file. This parameter is optional, 'true' by default. If set to 'true', each dictionary is created at first use. If dictionary creation failed, the function that was using the dictionary throws an exception. If 'false', all dictionaries are created when the server starts, and if there is an error, the server shuts down. - -The dictionary config file has the following format: - -%% -<dictionaries> - <comment>Optional element with any content; completely ignored.</comment> - - <!--You can set any number of different dictionaries. --> - <dictionary> - <!-- Dictionary name. The dictionary will be accessed for use by this name. --> - <name>os</name> - - <!-- Data source. --> - <source> - <!-- Source is a file in the local file system. --> - <file> - <!-- Path on the local file system. --> - <path>/opt/dictionaries/os.tsv</path> - <!-- Which format to use for reading the file. --> - <format>TabSeparated</format> - </file> - - <!-- or the source is a table on a MySQL server. - <mysql> - <!- - These parameters can be specified outside (common for all replicas) or inside a specific replica - -> - <port>3306</port> - <user>clickhouse</user> - <password>qwerty</password> - <!- - Specify from one to any number of replicas for fault tolerance. - -> - <replica> - <host>example01-1</host> - <priority>1</priority> <!- - The lower the value, the higher the priority. - -> - </replica> - <replica> - <host>example01-2</host> - <priority>1</priority> - </replica> - <db>conv_main</db> - <table>counters</table> - </mysql> - --> - - <!-- or the source is a table on the ClickHouse server. - <clickhouse> - <host>example01-01-1</host> - <port>9000</port> - <user>default</user> - <password></password> - <db>default</db> - <table>counters</table> - </clickhouse> - <!- - If the address is similar to localhost, the request is made without network interaction. For fault tolerance, you can create a Distributed table on localhost and enter it. - -> - --> - - <!-- or the source is a executable. If layout.complex_key_cache - list of needed keys will be written in STDIN of program --> - <executable> - <!-- Path on the local file system or name located in one of env PATH dirs. --> - <command>cat /opt/dictionaries/os.tsv</command> - <!-- Which format to use for reading/writing stream. --> - <format>TabSeparated</format> - </executable> - - <!-- or the source is a http server. If layout.complex_key_cache - list of needed keys will be sent as POST --> - <http> - <!-- Host. --> - <url>http://[::1]/os.tsv</url> - <!-- Which format to use for reading answer and making POST. 
--> - <format>TabSeparated</format> - </http> - - </source> - - <!-- Update interval for fully loaded dictionaries. 0 - never update. --> - <lifetime> - <min>300</min> - <max>360</max> - <!-- The update interval is selected uniformly randomly between min and max, in order to spread out the load when updating dictionaries on a large number of servers. --> - </lifetime> - - <!-- or <!- - The update interval for fully loaded dictionaries or invalidation time for cached dictionaries. 0 - never update. - -> - <lifetime>300</lifetime> - --> - - <layout> <!-- Method for storing in memory. --> - <flat /> - <!-- or <hashed /> - or - <cache> - <!- - Cache size in number of cells; rounded up to a degree of two. - -> - <size_in_cells>1000000000</size_in_cells> - </cache> --> - </layout> - - <!-- Structure. --> - <structure> - <!-- Description of the column that serves as the dictionary identifier (key). --> - <id> - <!-- Column name with ID. --> - <name>Id</name> - </id> - - <attribute> - <!-- Column name. --> - <name>Name</name> - <!-- Column type. (How the column is understood when loading. For MySQL, a table can have TEXT, VARCHAR, and BLOB, but these are all loaded as String) --> - <type>String</type> - <!-- Value to use for a non-existing element. In the example, an empty string. --> - <null_value></null_value> - </attribute> - <!-- Any number of attributes can be specified. --> - <attribute> - <name>ParentID</name> - <type>UInt64</type> - <null_value>0</null_value> - <!-- Whether it defines a hierarchy - mapping to the parent ID (by default, false). --> - <hierarchical>true</hierarchical> - <!-- The mapping id -> attribute can be considered injective, in order to optimize GROUP BY. (by default, false) --> - <injective>true</injective> - </attribute> - </structure> - </dictionary> -</dictionaries> -%% - -The dictionary identifier (key attribute) should be a number that fits into UInt64. Also, you can use arbitrary tuples as keys (see section "Dictionaries with complex keys"). Note: you can use complex keys consisting of just one element. This allows using e.g. Strings as dictionary keys. - -There are six ways to store dictionaries in memory. - -1. %%flat%% - As flat arrays. -This is the most effective method. It works if all keys are smaller than 500,000. If a larger key is discovered when creating the dictionary, an exception is thrown and the dictionary is not created. The dictionary is loaded to RAM in its entirety. The dictionary uses the amount of memory proportional to maximum key value. With the limit of 500,000, memory consumption is not likely to be high. All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety. - -2. %%hashed%% - As hash tables. -This method is slightly less effective than the first one. The dictionary is also loaded to RAM in its entirety, and can contain any number of items with any identifiers. In practice, it makes sense to use up to tens of millions of items, while there is enough RAM. -All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety. - -3. %%cache%% - This is the least effective method. It is appropriate if the dictionary doesn't fit in RAM. It is a cache of a fixed number of cells, where frequently-used data can be located. MySQL, ClickHouse, executable, http sources are supported, but file sources are not supported. When searching a dictionary, the cache is searched first. 
For each data block, all keys not found in the cache (or expired keys) are collected in a package, which is sent to the source with the query %%SELECT attrs... FROM db.table WHERE id IN (k1, k2, ...)%%. The received data is then written to the cache. - -4. %%range_hashed%% - -5. %%complex_key_hashed%% - The same as %%hashed%%, but for complex keys. - -6. %%complex_key_cache%% - The same as %%cache%%, but for complex keys. - -===Notes=== - -We recommend using the flat method when possible, or hashed. The speed of the dictionaries is impeccable with this type of memory storage. - -Use the cache method only in cases when it is unavoidable. The speed of the cache depends strongly on correct settings and the usage scenario. A cache type dictionary only works normally for high enough hit rates (recommended 99% and higher). You can view the average hit rate in the system.dictionaries table. Set a large enough cache size. You will need to experiment to find the right number of cells - select a value, use a query to get the cache completely full, look at the memory consumption (this information is in the system.dictionaries table), then proportionally increase the number of cells so that a reasonable amount of memory is consumed. We recommend MySQL as the source for the cache, because ClickHouse doesn't handle requests with random reads very well. - -In all cases, performance is better if you call the function for working with a dictionary after GROUP BY, and if the attribute being fetched is marked as injective. For a dictionary cache, performance improves if you call the function after LIMIT. To do this, you can use a subquery with LIMIT, and call the function with the dictionary from the outside. - -An attribute is called injective if different attribute values correspond to different keys. So when GROUP BY uses a function that fetches an attribute value by the key, this function is automatically taken out of GROUP BY. - -When updating dictionaries from a file, first the file modification time is checked, and it is loaded only if the file has changed. -When updating from MySQL, for flat and hashed dictionaries, first a SHOW TABLE STATUS query is made, and the table update time is checked. If it is not NULL, it is compared to the stored time. This works for MyISAM tables, but for InnoDB tables the update time is unknown, so loading from InnoDB is performed on each update. - -For cache dictionaries, the expiration (lifetime) of data in the cache can be set. If more time than 'lifetime' has passed since loading the data in a cell, the cell's value is not used, and it is re-requested the next time it needs to be used. - -If a dictionary couldn't be loaded even once, an attempt to use it throws an exception. -If an error occurred during a request to a cached source, an exception is thrown. -Dictionary updates (other than loading for first use) do not block queries. During updates, the old version of a dictionary is used. If an error occurs during an update, the error is written to the server log, and queries continue using the old version of dictionaries. - -You can view the list of external dictionaries and their status in the system.dictionaries table. - -To use external dictionaries, see the section "Functions for working with external dictionaries". - -Note that you can convert values for a small dictionary by specifying all the contents of the dictionary directly in a SELECT query (see the section "transform function"). This functionality is not related to external dictionaries. 
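-
-For example (a sketch reusing the test.hits table from the examples above), a small fixed mapping can be written directly in the query with 'transform', with no external dictionary configured:
-
-%%
-SELECT
-    transform(SearchEngineID, [2, 3], ['Yandex', 'Google'], 'Other') AS title,
-    count() AS c
-FROM test.hits
-GROUP BY title
-%%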
- -===Dictionaries with complex keys=== - -You can use tuples consisting of fields of arbitrary types as keys. Configure your dictionary with %%complex_key_hashed%% or %%complex_key_cache%% layout in this case. - -Key structure is configured not in the %%<id>%% element but in the %%<key>%% element. Fields of the key tuple are configured analogously to dictionary attributes. Example: - -%% -<structure> - <key> - <attribute> - <name>field1</name> - <type>String</type> - </attribute> - <attribute> - <name>field2</name> - <type>UInt32</type> - </attribute> - ... - </key> - ... -%% - -When using such dictionary, use a Tuple of field values as a key in dictGet* functions. Example: %%dictGetString('dict_name', 'attr_name', tuple('field1_value', 123))%%. - -
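-A usage sketch (the 'geo_dict' dictionary, its 'city_name' attribute, and the table and column names are hypothetical); the key tuple is built from the column values of each row:
-
-%%
-SELECT
-    dictGetString('geo_dict', 'city_name', tuple(CountryCode, CityID)) AS city,
-    count() AS c
-FROM test.visits
-GROUP BY city
-%%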
-
-=Settings=
-
    - -In this section, we review settings that you can make using a SET query or in a config file. Remember that these settings can be set for a session or globally. Settings that can only be made in the server config file are not covered here. - - -==max_block_size== - -In ClickHouse, data is processed by blocks (sets of column parts). The internal processing cycles for a single block are efficient enough, but there are noticeable expenditures on each block. 'max_block_size' is a recommendation for what size of block (in number of rows) to load from tables. The block size shouldn't be too small, so that the expenditures on each block are still noticeable, but not too large, so that the query with LIMIT that is completed after the first block is processed quickly, so that too much memory isn't consumed when extracting a large number of columns in multiple threads, and so that at least some cache locality is preserved. - -By default, it is 65,536. - -Blocks the size of 'max_block_size' are not always loaded from the table. If it is obvious that less data needs to be retrieved, a smaller block is processed. - - -==preferred_block_size_bytes== -Similar to %%max_block_size%%, but sets reccomended size of blocks in bytes, adaptively estimating number of required rows in a block. -Wherein size of the block cannot be greater than %%max_block_size%% rows. -The setting is off by default (set to 0), it works only during the reads froms MergeTree-engines. - - -==max_insert_block_size== - -The size of blocks to form for insertion into a table. -This setting only applies in cases when the server forms the blocks. -For example, for an INSERT via the HTTP interface, the server parses the data format and forms blocks of the specified size. -But when using clickhouse-client, the client parses the data itself, and the 'max_insert_block_size' setting on the server doesn't affect the size of the inserted blocks. -The setting also doesn't have a purpose when using INSERT SELECT, since data is inserted in the same blocks that are formed after SELECT. - -By default, it is 1,048,576. - -This is slightly more than 'max_block_size'. The reason for this is because certain table engines (*MergeTree) form a data part on the disk for each inserted block, which is a fairly large entity. Similarly, *MergeTree tables sort data during insertion, and a large enough block size allows sorting more data in RAM. - - -==max_threads== - -The maximum number of query processing threads -- excluding threads for retrieving data from remote servers (see the 'max_distributed_connections' parameter). - -This parameter applies to threads that perform the same stages of the query execution pipeline in parallel. -For example, if reading from a table, evaluating expressions with functions, filtering with WHERE and pre-aggregating for GROUP BY can all be done in parallel using at least 'max_threads' number of threads, then 'max_threads' are used. - -By default, 8. - -If less than one SELECT query is normally run on a server at a time, set this parameter to a value slightly less than the actual number of processor cores. - -For queries that are completed quickly because of a LIMIT, you can set a lower 'max_threads'. For example, if the necessary number of entries are located in every block and max_threads = 8, 8 blocks are retrieved, although it would have been enough to read just one. - -The smaller the 'max_threads' value, the less memory is consumed. 
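-
-For example (an illustrative session-level override; the values are arbitrary), a query that is satisfied by a small LIMIT can be given fewer threads and smaller blocks for the current session:
-
-%%
-SET max_threads = 4
-SET max_block_size = 8192
-%%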
- - -==max_compress_block_size== - -The maximum size of blocks of uncompressed data before compressing for writing to a table. By default, 1,048,576 (1 MiB). If the size is reduced, the compression rate is significantly reduced, the compression and decompression speed increases slightly due to cache locality, and memory consumption is reduced. There usually isn't any reason to change this setting. - -Don't confuse blocks for compression (a chunk of memory consisting of bytes) and blocks for query processing (a set of rows from a table). - - -==min_compress_block_size== - -For *MergeTree tables. In order to reduce latency when processing queries, a block is compressed when writing the next mark if its size is at least 'min_compress_block_size'. By default, 65,536. - -The actual size of the block, if the uncompressed data less than 'max_compress_block_size' is no less than this value and no less than the volume of data for one mark. - -Let's look at an example. Assume that 'index_granularity' was set to 8192 during table creation. - -We are writing a UInt32-type column (4 bytes per value). When writing 8192 rows, the total will be 32 KB of data. Since min_compress_block_size = 65,536, a compressed block will be formed for every two marks. - -We are writing a URL column with the String type (average size of 60 bytes per value). When writing 8192 rows, the average will be slightly less than 500 KB of data. Since this is more than 65,536, a compressed block will be formed for each mark. In this case, when reading data from the disk in the range of a single mark, extra data won't be decompressed. - -There usually isn't any reason to change this setting. - - -==max_query_size== - -The maximum part of a query that can be taken to RAM for parsing with the SQL parser. -The INSERT query also contains data for INSERT that is processed by a separate stream parser (that consumes O(1) RAM), which is not included in this restriction. - -By default, 256 KiB. - - -==interactive_delay== - -The interval in microseconds for checking whether request execution has been canceled and sending the progress. -By default, 100,000 (check for canceling and send progress ten times per second). - - -==connect_timeout== -==receive_timeout== -==send_timeout== - -Timeouts in seconds on the socket used for communicating with the client. -By default, 10, 300, 300. - - -==poll_interval== - -Lock in a wait loop for the specified number of seconds. -By default, 10. - - -==max_distributed_connections== - -The maximum number of simultaneous connections with remote servers for distributed processing of a single query to a single Distributed table. We recommend setting a value no less than the number of servers in the cluster. - -By default, 100. - - -The following parameters are only used when creating Distributed tables (and when launching a server), so there is no reason to change them at runtime. - -==distributed_connections_pool_size== - -The maximum number of simultaneous connections with remote servers for distributed processing of all queries to a single Distributed table. We recommend setting a value no less than the number of servers in the cluster. - -By default, 128. - - -==connect_timeout_with_failover_ms== - -The timeout in milliseconds for connecting to a remote server for a Distributed table engine, if the 'shard' and 'replica' sections are used in the cluster definition. -If unsuccessful, several attempts are made to connect to various replicas. -By default, 50. 
- - -==connections_with_failover_max_tries== - -The maximum number of connection attempts with each replica, for the Distributed table engine. -By default, 3. - - -==extremes== - -Whether to count extreme values (the minimums and maximums in columns of a query result). -Accepts 0 or 1. By default, 0 (disabled). -For more information, see the section "Extreme values". - - -==use_uncompressed_cache== - -Whether to use a cache of uncompressed blocks. Accepts 0 or 1. By default, 0 (disabled). -The uncompressed cache (only for tables in the MergeTree family) allows significantly reducing latency and increasing throughput when working with a large number of short queries. Enable this setting for users who send frequent short requests. Also pay attention to the 'uncompressed_cache_size' configuration parameter (only set in the config file) - the size of uncompressed cache blocks. By default, it is 8 GiB. The uncompressed cache is filled in as needed; the least-used data is automatically deleted. - -For queries that read at least a somewhat large volume of data (one million rows or more), the uncompressed cache is disabled automatically in order to save space for truly small queries. So you can keep the 'use_uncompressed_cache' setting always set to 1. - - -==replace_running_query== - -When using the HTTP interface, the 'query_id' parameter can be passed. This is any string that serves as the query identifier. -If a query from the same user with the same 'query_id' already exists at this time, the behavior depends on the 'replace_running_query' parameter. - -0 (default) - Throw an exception (don't allow the query to run if a query with the same 'query_id' is already running). -1 - Cancel the old query and start running the new one. - -Yandex.Metrica uses this parameter set to 1 for implementing suggestions for segmentation conditions. After entering the next character, if the old query hasn't finished yet, it should be canceled. - - -==load_balancing== - -Which replicas (among healthy replicas) to preferably send a query to (on the first attempt) for distributed processing. - -random (default) - -The number of errors is counted for each replica. The query is sent to the replica with the fewest errors, and if there are several of these, to any one of them. -Disadvantages: Server proximity is not accounted for; if the replicas have different data, you will also get different data. - -nearest_hostname - -The number of errors is counted for each replica. Every 5 minutes, the number of errors is integrally divided by 2. Thus, the number of errors is calculated for a recent time with exponential smoothing. If there is one replica with a minimal number of errors (i.e. errors occurred recently on the other replicas), the query is sent to it. If there are multiple replicas with the same minimal number of errors, the query is sent to the replica with a host name that is most similar to the server's host name in the config file (for the number of different characters in identical positions, up to the minimum length of both host names). - -As an example, example01-01-1 and example01-01-2.yandex.ru are different in one position, while example01-01-1 and example01-02-2 differ in two places. -This method might seem a little stupid, but it doesn't use external data about network topology, and it doesn't compare IP addresses, which would be complicated for our IPv6 addresses. - -Thus, if there are equivalent replicas, the closest one by name is preferred. 
-We can also assume that when sending a query to the same server, in the absence of failures, a distributed query will also go to the same servers. So even if different data is placed on the replicas, the query will return mostly the same results. - -in_order - -Replicas are accessed in the same order as they are specified. The number of errors does not matter. This method is appropriate when you know exactly which replica is preferable. - - -==totals_mode== - -How to calculate TOTALS when HAVING is present, as well as when max_rows_to_group_by and group_by_overflow_mode = 'any' are present. -See the section "WITH TOTALS modifier". - -==totals_auto_threshold== - -The threshold for totals_mode = 'auto'. -See the section "WITH TOTALS modifier". - - -==default_sample== - -A floating-point number from 0 to 1. By default, 1. -Allows setting a default sampling coefficient for all SELECT queries. -(For tables that don't support sampling, an exception will be thrown.) -If set to 1, default sampling is not performed. - -==input_format_skip_unknown_fields== - -If the parameter is true, INSERT operation will skip columns with unknown names from input. -Otherwise, an exception will be generated, it is default behavior. -The parameter works only for JSONEachRow and TSKV input formats. - -==output_format_json_quote_64bit_integers== - -If the parameter is true (default value), UInt64 and Int64 numbers are printed as quoted strings in all JSON output formats. -Such behavior is compatible with most JavaScript interpreters that stores all numbers as double-precision floating point numbers. -Otherwise, they are printed as regular numbers. - -==input_format_allow_errors_num== -==input_format_allow_errors_ratio== - -Maximum amount of errors while reading text formats (like CSV, TSV). -In case of error, if both values are non-zero, and at least absolute or relative amount of errors is lower than corresponding value, will skip until next line and continue. - -==Restrictions on query complexity== - -Restrictions on query complexity are part of the settings. -They are used in order to provide safer execution from the user interface. -Almost all the restrictions only apply to SELECTs. -For distributed query processing, restrictions are applied on each server separately. - -Restrictions on the "maximum amount of something" can take the value 0, which means "unrestricted". -Most restrictions also have an 'overflow_mode' setting, meaning what to do when the limit is exceeded. -It can take one of two values: 'throw' or 'break'. Restrictions on aggregation (group_by_overflow_mode) also have the value 'any'. -throw - Throw an exception (default). -break - Stop executing the query and return the partial result, as if the source data ran out. -any (only for group_by_overflow_mode) - Continuing aggregation for the keys that got into the set, but don't add new keys to the set. - - -===readonly=== - -If set to 0, allows to run any queries. -If set to 1, allows to run only queries that don't change data or settings (e.g. SELECT or SHOW). INSERT and SET are forbidden. -If set to 2, allows to run queries that don't change data (SELECT, SHOW) and allows to change settings (SET). - -After you set the read-only mode, you won't be able to disable it in the current session. - -When using the GET method in the HTTP interface, 'readonly = 1' is set automatically. In other words, for queries that modify data, you can only use the POST method. You can send the query itself either in the POST body, or in the URL parameter. 
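-
-A session sketch (using the test.hits example table): after switching to read-only mode, reads still work, while the attempt to change the setting back is rejected, as described above:
-
-%%
-SET readonly = 1
-
-SELECT count() FROM test.hits   -- still allowed
-
-SET readonly = 0                -- rejected: SET is forbidden while readonly = 1
-%%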
- -===max_memory_usage=== - -The maximum amount of memory consumption when running a query on a single server. By default, 10 GB. - -The setting doesn't consider the volume of available memory or the total volume of memory on the machine. -The restriction applies to a single query within a single server. -You can use SHOW PROCESSLIST to see the current memory consumption for each query. -In addition, the peak memory consumption is tracked for each query and written to the log. - -Certain cases of memory consumption are not tracked: -- Large constants (for example, a very long string constant). -- The states of 'groupArray' aggregate functions, and also 'quantile' (it is tracked for 'quantileTiming'). - -Memory consumption is not fully considered for aggregate function states 'min', 'max', 'any', 'anyLast', 'argMin', and 'argMax' from String and Array arguments. - - -===max_rows_to_read=== - -The following restrictions can be checked on each block (instead of on each row). That is, the restrictions can be broken a little. -When running a query in multiple threads, the following restrictions apply to each thread separately. - -Maximum number of rows that can be read from a table when running a query. - -===max_bytes_to_read=== - -Maximum number of bytes (uncompressed data) that can be read from a table when running a query. - -===read_overflow_mode=== - -What to do when the volume of data read exceeds one of the limits: 'throw' or 'break'. By default, throw. - -===max_rows_to_group_by=== - -Maximum number of unique keys received from aggregation. This setting lets you limit memory consumption when aggregating. - -===group_by_overflow_mode=== - -What to do when the number of unique keys for aggregation exceeds the limit: 'throw', 'break', or 'any'. By default, throw. -Using the 'any' value lets you run an approximation of GROUP BY. The quality of this approximation depends on the statistical nature of the data. - -===max_rows_to_sort=== - -Maximum number of rows before sorting. This allows you to limit memory consumption when sorting. - -===max_bytes_to_sort=== - -Maximum number of bytes before sorting. - -===sort_overflow_mode=== - -What to do if the number of rows received before sorting exceeds one of the limits: 'throw' or 'break'. By default, throw. - -===max_result_rows=== - -Limit on the number of rows in the result. Also checked for subqueries, and on remote servers when running parts of a distributed query. - -===max_result_bytes=== - -Limit on the number of bytes in the result. The same as the previous setting. - -===result_overflow_mode=== - -What to do if the volume of the result exceeds one of the limits: 'throw' or 'break'. By default, throw. -Using 'break' is similar to using LIMIT. - -===max_execution_time=== - -Maximum query execution time in seconds. -At this time, it is not checked for one of the sorting stages, or when merging and finalizing aggregate functions. - -===timeout_overflow_mode=== - -What to do if the query is run longer than 'max_execution_time': 'throw' or 'break'. By default, throw. - -===min_execution_speed=== - -Minimal execution speed in rows per second. Checked on every data block when 'timeout_before_checking_execution_speed' expires. If the execution speed is lower, an exception is thrown. - -===timeout_before_checking_execution_speed=== - -Checks that execution speed is not too slow (no less than 'min_execution_speed'), after the specified time in seconds has expired. 
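-For example, the limits described above can be applied to a session or a single heavy query with SET before running it (the values and the hits table are purely illustrative):
-
-%%
-SET max_memory_usage = 10000000000
-SET max_rows_to_read = 1000000000
-SET max_execution_time = 600
-SET read_overflow_mode = 'break'
-
-SELECT CounterID, count() FROM hits GROUP BY CounterID
-%%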
- -===max_columns_to_read=== - -Maximum number of columns that can be read from a table in a single query. If a query requires reading a greater number of columns, it throws an exception. - -===max_temporary_columns=== - -Maximum number of temporary columns that must be kept in RAM at the same time when running a query, including constant columns. If there are more temporary columns than this, it throws an exception. - -===max_temporary_non_const_columns=== - -The same thing as 'max_temporary_columns', but without counting constant columns. -Note that constant columns are formed fairly often when running a query, but they require approximately zero computing resources. - -===max_subquery_depth=== - -Maximum nesting depth of subqueries. If subqueries are deeper, an exception is thrown. By default, 100. - -===max_pipeline_depth=== - -Maximum pipeline depth. Corresponds to the number of transformations that each data block goes through during query processing. Counted within the limits of a single server. If the pipeline depth is greater, an exception is thrown. By default, 1000. - -===max_ast_depth=== - -Maximum nesting depth of a query syntactic tree. If exceeded, an exception is thrown. At this time, it isn't checked during parsing, but only after parsing the query. That is, a syntactic tree that is too deep can be created during parsing, but the query will fail. By default, 1000. - -===max_ast_elements=== - -Maximum number of elements in a query syntactic tree. If exceeded, an exception is thrown. -In the same way as the previous setting, it is checked only after parsing the query. By default, 50,000. - -===max_rows_in_set=== - -Maximum number of rows for a data set in the IN clause created from a subquery. - -===max_bytes_in_set=== - -Maximum number of bytes (uncompressed data) used by a set in the IN clause created from a subquery. - -===set_overflow_mode=== - -What to do when the amount of data exceeds one of the limits: 'throw' or 'break'. By default, throw. - -===max_rows_in_distinct=== - -Maximum number of different rows when using DISTINCT. - -===max_bytes_in_distinct=== - -Maximum number of bytes used by a hash table when using DISTINCT. - -===distinct_overflow_mode=== - -What to do when the amount of data exceeds one of the limits: 'throw' or 'break'. By default, throw. - -===max_rows_to_transfer=== - -Maximum number of rows that can be passed to a remote server or saved in a temporary table when using GLOBAL IN. - -===max_bytes_to_transfer=== - -Maximum number of bytes (uncompressed data) that can be passed to a remote server or saved in a temporary table when using GLOBAL IN. - -===transfer_overflow_mode=== - -What to do when the amount of data exceeds one of the limits: 'throw' or 'break'. By default, throw. - - -==Settings profiles== - -A settings profile is a collection of settings grouped under the same name. Each ClickHouse user has a profile. -To apply all the settings in a profile, set 'profile'. Example: - -%% -SET profile = 'web' -%% - -- Load the 'web' profile. That is, set all the options belonging to the 'web' profile. - -Settings profiles are declared in the user config file. This is normally 'users.xml'. -Example: - -%% -<!-- Settings profiles. --> -<profiles> - <!-- Default settings --> - <default> - <!-- Maximum number of threads for executing a single query. 
--> - <max_threads>8</max_threads> - </default> - <!-- Settings for queries from the user interface --> - <web> - <max_rows_to_read>1000000000</max_rows_to_read> - <max_bytes_to_read>100000000000</max_bytes_to_read> - <max_rows_to_group_by>1000000</max_rows_to_group_by> - <group_by_overflow_mode>any</group_by_overflow_mode> - <max_rows_to_sort>1000000</max_rows_to_sort> - <max_bytes_to_sort>1000000000</max_bytes_to_sort> - <max_result_rows>100000</max_result_rows> - <max_result_bytes>100000000</max_result_bytes> - <result_overflow_mode>break</result_overflow_mode> - <max_execution_time>600</max_execution_time> - <min_execution_speed>1000000</min_execution_speed> - <timeout_before_checking_execution_speed>15</timeout_before_checking_execution_speed> - <max_columns_to_read>25</max_columns_to_read> - <max_temporary_columns>100</max_temporary_columns> - <max_temporary_non_const_columns>50</max_temporary_non_const_columns> - <max_subquery_depth>2</max_subquery_depth> - <max_pipeline_depth>25</max_pipeline_depth> - <max_ast_depth>50</max_ast_depth> - <max_ast_elements>100</max_ast_elements> - <readonly>1</readonly> - </web> -</profiles> -%% - -In the example, two profiles are set: 'default' and 'web'. The 'default' profile has a special purpose - it must always be present and is applied when starting the server. In other words, the 'default' profile contains default settings. The 'web' profile is a regular profile that can be set using the SET query or using a URL parameter in an HTTP query. - -Settings profiles can inherit from each other. To use inheritance, indicate the 'profile' setting before the other settings that are listed in the profile. - -
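-As a sketch of the inheritance described above (the 'reports' profile name is made up for this example), a profile can first pull in another profile and then override individual settings:
-
-%%
-<profiles>
-    <reports>
-        <!-- First inherit all settings of the 'web' profile... -->
-        <profile>web</profile>
-        <!-- ...then override individual settings. -->
-        <max_execution_time>60</max_execution_time>
-    </reports>
-</profiles>
-%%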
    -
    -

    Configuration files

    -
    -
    - -The main server config file is 'config.xml'. It resides in the /etc/clickhouse-server/ directory. - -Certain settings can be overridden in the *.xml and *.conf files from the 'conf.d' and 'config.d' directories next to the config. -The 'replace' and 'remove' attributes can be specified for the elements of these config files. -If neither is specified, it combines the contents of elements recursively, replacing values of duplicate children. -If 'replace' is specified, it replaces the entire element with the specified one. -If 'remove' is specified, it deletes the element. - -The config can also define "substitutions". If an element has the 'incl' attribute, the corresponding substitution from the file will be used as the value. By default, the path to the file with substitutions is '/etc/metrika.xml'. This can be changed in the config in the 'include_from' element. The substitution values are specified in '/yandex/substitution_name' elements of this file. - -You can also perform substitutions from ZooKeeper nodes. To do that add the from_zk="/path/to/node" attribute to a config element. Element contents will be substituted with the contents of the /path/to/node ZooKeeper node. The ZooKeeper node can contain a whole XML subtree, and it will be inserted as a child of the substituted node. - -The 'config.xml' file can specify a separate config with user settings, profiles, and quotas. The relative path to this config is set in the 'users_config' element. By default, it is 'users.xml'. If 'users_config' is omitted, the user settings, profiles, and quotas are specified directly in 'config.xml'. For 'users_config', overrides and substitutions may also exist in files from the 'users_config.d' directory (for example, 'users.d'). - -For each config file, the server also generates file-preprocessed.xml files on launch. These files contain all the completed substitutions and overrides, and they are intended for informational use. If ZooKeeper substitutions were used in a config file and the ZooKeeper is unavailable during server startup, the configuration is loaded from the respective preprocessed file. - -The server tracks changes to config files and files and ZooKeeper nodes that were used for substitutions and overrides and reloads users and clusters configurations in runtime. That is, you can add or change users, clusters and their settings without relaunching the server. - -
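-For illustration, an override file might look like this (the file name and the max_connections element are chosen arbitrarily; since neither 'replace' nor 'remove' is specified, the element is merged recursively with config.xml):
-
-%%
-<!-- /etc/clickhouse-server/config.d/override.xml -->
-<yandex>
-    <max_connections>2048</max_connections>
-</yandex>
-%%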
    -
    -

    Access rights

    -
    -
    - -Users and access rights are set up in the user config. This is usually 'users.xml'. - -Users are recorded in the 'users' section. Let's look at part of the 'users.xml' file: - -%% -<!-- Users and ACL. --> -<users> - <!-- If the username is not specified, the default user is used. --> - <default> - <!-- Password (in plaintext). May be empty. --> - <password></password> - - <!-- List of networks that access is allowed from. Each list item has one of the following forms: - <ip> IP address or subnet mask. For example, 222.111.222.3 or 10.0.0.1/8 or 2a02:6b8::3 or 2a02:6b8::3/64. - <host> Host name. Example: example01. A DNS query is made for verification, and all received address are compared to the client address. - <host_regexp> Regex for host names. For example, ^example\d\d-\d\d-\d\.yandex\.ru$ - A DNS PTR query is made to verify the client address and the regex is applied to the result. - Then another DNS query is made for the result of the PTR query, and all received address are compared to the client address. - We strongly recommend that the regex ends with \.yandex\.ru$. If you are installing ClickHouse independently, here you should specify: - <networks> - <ip>::/0</ip> - </networks> --> - - <networks incl="networks" /> - <!-- Settings profile for the user. --> - <profile>default</profile> - <!-- Quota for the user. --> - <quota>default</quota> - </default> - - <!-- For queries from the user interface. --> - <web> - <password></password> - <networks incl="networks" /> - <profile>web</profile> - <quota>default</quota> - </web> -%% - -Here we can see that two users are declared: 'default' and 'web'. We added the 'web' user ourselves. -The 'default' user is chosen in cases when the username is not passed, so this user must be present in the config file. The 'default' user is also used for distributed query processing - the system accesses remote servers under this username. So the 'default' user must have an empty password and must not have substantial restrictions or quotas - otherwise, distributed queries will fail. - -The password is specified in plain text directly in the config. In this regard, you should not consider these passwords as providing security against potential malicious attacks. Rather, they are necessary for protection from Yandex employees. - -A list of networks is specified that access is allowed from. In this example, the list of networks for both users is loaded from a separate file (/etc/metrika.xml) containing the 'networks' substitution. Here is a fragment of it: - -%% -<yandex> - ... - <networks> - <ip>::/64</ip> - <ip>93.158.111.111/26</ip> - <ip>2a02:6b8:0:1::/64</ip> - ... - </networks> -</yandex> -%% - -We could have defined this list of networks directly in 'users.xml', or in a file in the 'users.d' directory (for more information, see the section "Configuration files"). - -The config includes comments explaining how to open access from everywhere. - -For use in production, only specify IP elements (IP addresses and their masks), since using 'host' and 'host_regexp' might cause extra latency. - -Next the user settings profile is specified (see the section "Settings profiles"). You can specify the default profile, 'default'. The profile can have any name. You can specify the same profile for different users. The most important thing you can write in the settings profile is 'readonly' set to 1, which provides read-only access. - -After this, the quota is defined (see the section "Quotas"). You can specify the default quota, 'default'. 
It is set in the config by default so that it only counts resource usage, but does not restrict it. The quota can have any name. You can specify the same quota for different users - in this case, resource usage is calculated for each user individually. - -
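-As a sketch, an additional user could be declared in a file under 'users.d' (the user name and subnet are illustrative; the 'web' profile from the example above already sets readonly to 1):
-
-%%
-<yandex>
-    <users>
-        <analytics>
-            <password></password>
-            <networks>
-                <ip>10.0.0.0/8</ip>
-            </networks>
-            <profile>web</profile>
-            <quota>default</quota>
-        </analytics>
-    </users>
-</yandex>
-%%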
    -
    -

    Quotas

    -
    -
    - -Quotas allow you to limit resource usage over a period of time, or simply track the use of resources. -Quotas are set up in the user config. This is usually 'users.xml'. - -The system also has a feature for limiting the complexity of a single query (see the section "Restrictions on query complexity"). -In contrast to query complexity restrictions, quotas: -- place restrictions on a set of queries that can be run over a period of time, instead of limiting a single query. -- account for resources spent on all remote servers for distributed query processing. - -Let's look at the section of the 'users.xml' file that defines quotas. - -%% -<!-- Quotas. --> -<quotas> - <!-- Quota name. --> - <default> - <!-- Restrictions for a time period. You can set multiple time intervals with various restrictions. --> - <interval> - <!-- Length of time. --> - <duration>3600</duration> - - <!-- No restrictions. Just collect data for the specified time interval. --> - <queries>0</queries> - <errors>0</errors> - <result_rows>0</result_rows> - <read_rows>0</read_rows> - <execution_time>0</execution_time> - </interval> - </default> -%% - -By default, the quota just tracks resource consumption for each hour, without limiting usage. - -%% -<statbox> - <!-- Restrictions for a time period. You can set multiple time intervals with various restrictions. --> - <interval> - <!-- Length of time.--> - <duration>3600</duration> - <queries>1000</queries> - <errors>100</errors> - <result_rows>1000000000</result_rows> - <read_rows>100000000000</read_rows> - <execution_time>900</execution_time> - </interval> - <interval> - <duration>86400</duration> - <queries>10000</queries> - <errors>1000</errors> - <result_rows>5000000000</result_rows> - <read_rows>500000000000</read_rows> - <execution_time>7200</execution_time> - </interval> -</statbox> -%% - -For the 'statbox' quota, restrictions are set for every hour and for every 24 hours (86,400 seconds). The time interval is counted starting from an implementation-defined fixed moment in time. In other words, the 24-hour interval doesn't necessarily begin at midnight. - -When the interval ends, all collected values are cleared. For the next hour, the quota calculation starts over. - -Let's examine the amounts that can be restricted: - -queries - The overall number of queries. -errors - The number of queries that threw exceptions. -result_rows - The total number of rows output in results. -read_rows - The total number of source rows retrieved from tables for running a query, on all remote servers. -execution_time - The total time of query execution, in seconds (wall time). - -If the limit is exceeded for at least one time interval, an exception is thrown with a text about which restriction was exceeded, for which interval, and when the new interval begins (when queries can be sent again). - -Quotas can use the "quota key" feature in order to report on resources for multiple keys independently. Here is an example of this: - -%% -<!-- For the global report builder. --> -<web_global> - <!-- keyed - the quota_key "key" is passed in the query parameter, and the quota is tracked separately for each key value. - For example, you can pass a Metrica username as the key, so the quota will be counted separately for each username. - Using keys makes sense only if quota_key is transmitted by the program, not by a user. - You can also write <keyed_by_ip /> so the IP address is used as the quota key. - (But keep in mind that users can change the IPv6 address fairly easily.) 
--> - <keyed /> -%% - -The quota is assigned to users in the 'users' section of the config. See the section "Access rights". - -For distributed query processing, the accumulated amounts are stored on the requestor server. So if the user goes to another server, the quota there will "start over". - -When the server is restarted, quotas are reset. - -
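-For a keyed quota like the one above, the key is supplied together with the query; over the HTTP interface this is done with the quota_key URL parameter (a sketch; the user and key values are arbitrary):
-
-%%
-echo 'SELECT 1' | curl 'http://localhost:8123/?user=web&quota_key=user_42' --data-binary @-
-%%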
diff --git a/website/deprecated/reference_ru.html b/website/deprecated/reference_ru.html
deleted file mode 100644
index 12bc967e3e1..00000000000
--- a/website/deprecated/reference_ru.html
+++ /dev/null
@@ -1,7742 +0,0 @@
-ClickHouse — руководство

    ClickHouse

    -

    Руководство

    -
    — Алексей Миловидов
    -
    - -
    -

    Содержание

    -
    -
    -
    - -
    -

    Введение

    -
    - -
    -==Что такое ClickHouse== -ClickHouse - столбцовая СУБД для OLAP (Columnar DBMS). - -В обычной, "строковой" СУБД, данные хранятся в таком порядке: - -
    -5123456789123456789     1       Евробаскет - Греция - Босния и Герцеговина - example.com      1       2011-09-01 01:03:02     6274717   1294101174      11409   612345678912345678      0       33      6       http://www.example.com/basketball/team/123/match/456789.html http://www.example.com/basketball/team/123/match/987654.html       0       1366    768     32      10      3183      0       0       13      0\0     1       1       0       0                       2011142 -1      0               0       01321     613     660     2011-09-01 08:01:17     0       0       0       0       utf-8   1466    0       0       0       5678901234567890123               277789954       0       0       0       0       0
    -5234985259563631958     0       Консалтинг, налогообложение, бухгалтерский учет, право       1       2011-09-01 01:03:02     6320881   2111222333      213     6458937489576391093     0       3       2       http://www.example.ru/         0       800     600       16      10      2       153.1   0       0       10      63      1       1       0       0                       2111678 000       0       588     368     240     2011-09-01 01:03:17     4       0       60310   0       windows-1251    1466    0       000               778899001       0       0       0       0       0
    -...
    -
    - -То есть, значения, относящиеся к одной строке, хранятся рядом. -Примеры строковых СУБД: MySQL, Postgres, MS SQL Server и т. п. - -В столбцовых СУБД, данные хранятся в таком порядке: - -
    -WatchID:    5385521489354350662     5385521490329509958     5385521489953706054     5385521490476781638     5385521490583269446     5385521490218868806     5385521491437850694   5385521491090174022      5385521490792669254     5385521490420695110     5385521491532181574     5385521491559694406     5385521491459625030     5385521492275175494   5385521492781318214      5385521492710027334     5385521492955615302     5385521493708759110     5385521494506434630     5385521493104611398
    -JavaEnable: 1       0       1       0       0       0       1       0       1       1       1       1       1       1       0       1       0       0       1       1
    -Title:      Yandex  Announcements - Investor Relations - Yandex     Yandex — Contact us — Moscow    Yandex — Mission        Ru      Yandex — History — History of Yandex    Yandex Financial Releases - Investor Relations - Yandex Yandex — Locations      Yandex Board of Directors - Corporate Governance - Yandex       Yandex — Technologies
    -GoodEvent:  1       1       1       1       1       1       1       1       1       1       1       1       1       1       1       1       1       1       1       1
    -EventTime:  2016-05-18 05:19:20     2016-05-18 08:10:20     2016-05-18 07:38:00     2016-05-18 01:13:08     2016-05-18 00:04:06     2016-05-18 04:21:30     2016-05-18 00:34:16     2016-05-18 07:35:49     2016-05-18 11:41:59     2016-05-18 01:13:32
    -...
    -
    - -В примерах изображён только порядок расположения данных. -То есть, значения из разных столбцов хранятся отдельно, а данные одного столбца - вместе. -Примеры столбцовых СУБД: Vertica, Paraccel (Actian Matrix) (Amazon Redshift), Sybase IQ, Exasol, Infobright, InfiniDB, MonetDB (VectorWise) (Actian Vector), LucidDB, SAP HANA, Google Dremel, Google PowerDrill, Druid, kdb+ и т. п. - -Разный порядок хранения данных лучше подходит для разных сценариев работы. -Сценарий работы с данными - это то, какие производятся запросы, как часто и в каком соотношении; сколько читается данных на запросы каждого вида - строк, столбцов, байт; как соотносятся чтения и обновления данных; какой рабочий размер данных и насколько локально он используется; используются ли транзакции и с какой изолированностью; какие требования к дублированию данных и логической целостности; требования к задержкам на выполнение и пропускной способности запросов каждого вида и т. п. - -Чем больше нагрузка на систему, тем более важной становится специализация под сценарий работы, и тем более конкретной становится эта специализация. Не существует системы, одинаково хорошо подходящей под существенно различные сценарии работы. Если система подходит под широкое множество сценариев работы, то при достаточно большой нагрузке, система будет справляться со всеми сценариями работы плохо, или справляться хорошо только с одним из сценариев работы. - -Будем говорить, что OLAP (онлайн обработка аналитических запросов) сценарий работы - это: -- подавляющее большинство запросов - на чтение; -- данные обновляются достаточно большими пачками (> 1000 строк), а не по одной строке, или не обновляются вообще; -- данные добавляются в БД, но не изменяются; -- при чтении, вынимается достаточно большое количество строк из БД, но только небольшое подмножество столбцов; -- таблицы являются "широкими", то есть, содержат большое количество столбцов; -- запросы идут сравнительно редко (обычно не более сотни в секунду на сервер); -- при выполнении простых запросов, допустимы задержки в районе 50 мс; -- значения в столбцах достаточно мелкие - числа и небольшие строки (пример - 60 байт на URL); -- требуется высокая пропускная способность при обработке одного запроса (до миллиардов строк в секунду на один сервер); -- транзакции отсутствуют; -- низкие требования к консистентности данных; -- в запросе одна большая таблица, все таблицы кроме одной маленькие; -- результат выполнения запроса существенно меньше исходных данных - то есть, данные фильтруются или агрегируются; результат выполнения помещается в оперативку на одном сервере; - -Легко видеть, что OLAP сценарий работы существенно отличается от других распространённых сценариев работы (например, OLTP или Key-Value сценариев работы). Таким образом, не имеет никакого смысла пытаться использовать OLTP или Key-Value БД для обработки аналитических запросов, если вы хотите получить приличную производительность ("выше плинтуса"). Например, если вы попытаетесь использовать для аналитики MongoDB или Elliptics - вы получите анекдотически низкую производительность по сравнению с OLAP-СУБД. - -Столбцовые СУБД лучше (от 100 раз по скорости обработки большинства запросов) подходят для OLAP сценария работы по следующим причинам: - -1. По I/O. -1.1. Для выполнения аналитического запроса, требуется прочитать небольшое количество столбцов таблицы. В столбцовой БД для этого можно читать только нужные данные. Например, если вам требуется только 5 столбцов из 100, то следует рассчитывать на 20-кратное уменьшение ввода-вывода. -1.2. 
Так как данные читаются пачками, то их проще сжимать. Данные, лежащие по столбцам также лучше сжимаются. За счёт этого, дополнительно уменьшается объём ввода-вывода. -1.3. За счёт уменьшения ввода-вывода, больше данных влезает в системный кэш. - -Для примера, для запроса "посчитать количество записей для каждой рекламной системы", требуется прочитать один столбец "идентификатор рекламной системы", который занимает 1 байт в несжатом виде. Если большинство переходов было не с рекламных систем, то можно рассчитывать хотя бы на десятикратное сжатие этого столбца. При использовании быстрого алгоритма сжатия, возможно разжатие данных со скоростью более нескольких гигабайт несжатых данных в секунду. То есть, такой запрос может выполняться со скоростью около нескольких миллиардов строк в секунду на одном сервере. На практике, такая скорость действительно достигается. - - -
    -milovidov@████████.yandex.ru:~$ clickhouse-client
    -ClickHouse client version 0.0.52053.
    -Connecting to localhost:9000.
    -Connected to ClickHouse server version 0.0.52053.
    -
    -:) SELECT CounterID, count() FROM hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20
    -
    -SELECT
    -    CounterID,
    -    count()
    -FROM hits
    -GROUP BY CounterID
    -ORDER BY count() DESC
    -LIMIT 20
    -
    -┌─CounterID─┬──count()─┐
    -│    114208 │ 56057344 │
    -│    115080 │ 51619590 │
    -│      3228 │ 44658301 │
    -│     38230 │ 42045932 │
    -│    145263 │ 42042158 │
    -│     91244 │ 38297270 │
    -│    154139 │ 26647572 │
    -│    150748 │ 24112755 │
    -│    242232 │ 21302571 │
    -│    338158 │ 13507087 │
    -│     62180 │ 12229491 │
    -│     82264 │ 12187441 │
    -│    232261 │ 12148031 │
    -│    146272 │ 11438516 │
    -│    168777 │ 11403636 │
    -│   4120072 │ 11227824 │
    -│  10938808 │ 10519739 │
    -│     74088 │  9047015 │
    -│    115079 │  8837972 │
    -│    337234 │  8205961 │
    -└───────────┴──────────┘
    -
    -20 rows in set. Elapsed: 0.153 sec. Processed 1.00 billion rows, 4.00 GB (6.53 billion rows/s., 26.10 GB/s.)
    -
    -:)
    -
    - -2. По CPU. -Так как для выполнения запроса надо обработать достаточно большое количество строк, становится актуальным диспетчеризовывать все операции не для отдельных строк, а для целых векторов, или реализовать движок выполнения запроса так, чтобы издержки на диспетчеризацию были примерно нулевыми. Если этого не делать, то при любой не слишком плохой дисковой подсистеме, интерпретатор запроса неизбежно упрётся в CPU. -Имеет смысл не только хранить данные по столбцам, но и обрабатывать их, по возможности, тоже по столбцам. - -Есть два способа это сделать: -1. Векторный движок. Все операции пишутся не для отдельных значений, а для векторов. То есть, вызывать операции надо достаточно редко, и издержки на диспетчеризацию становятся пренебрежимо маленькими. Код операции содержит в себе хорошо оптимизированный внутренний цикл. -2. Кодогенерация. Для запроса генерируется код, в котором подставлены все косвенные вызовы. - -В "обычных" БД этого не делается, так как не имеет смысла при выполнении простых запросов. Хотя есть исключения. Например, в MemSQL кодогенерация используется для уменьшения latency при выполнении SQL запросов. (Для сравнения - в аналитических СУБД, требуется оптимизация throughput, а не latency). - -Стоит заметить, что для эффективности по CPU требуется, чтобы язык запросов был декларативным (SQL, MDX) или хотя бы векторным (J, K). То есть, чтобы запрос содержал циклы только в неявном виде, открывая возможности для оптимизации. - - -==Отличительные возможности ClickHouse== - -1. По-настоящему столбцовая СУБД. -2. Сжатие данных. -3. Хранение данных на диске. -4. Параллельная обработка запроса на многих процессорных ядрах. -5. Распределённая обработка запроса на многих серверах. -6. Поддержка SQL. -7. Векторный движок. -8. Обновление данных в реальном времени. -9. Наличие индексов. -10. Подходит для онлайн запросов. -11. Поддержка приближённых вычислений. -12. Поддержка вложенных структур данных. Поддержка массивов в качестве типов данных. -13. Поддержка ограничений на сложность запросов, а также квот. -14. Репликация данных, поддержка целостности данных на репликах. - - -Рассмотрим некоторые возможности подробнее. - -

    1. По-настоящему столбцовая СУБД.

    - -В по-настоящему столбцовой СУБД рядом со значениями не хранится никакого "мусора". Например, должны поддерживаться значения постоянной длины, чтобы не хранить рядом со значениями типа "число" их длины. Для примера, миллиард значений типа UInt8 должен действительно занимать в несжатом виде около 1GB, иначе это сильно ударит по эффективности использования CPU. Очень важно хранить данные компактно (без "мусора") в том числе в несжатом виде, так как скорость разжатия (использование CPU) зависит, в основном, от объёма несжатых данных. - -Этот пункт пришлось выделить, так как существуют системы, которые могут хранить значения отдельных столбцов по отдельности, но не могут эффективно выполнять аналитические запросы в силу оптимизации под другой сценарий работы. Примеры: HBase, BigTable, Cassandra, HyperTable. В этих системах вы получите throughput в районе сотен тысяч строк в секунду, но не сотен миллионов строк в секунду. - -Также стоит заметить, что ClickHouse является СУБД, а не одной базой данных. То есть, ClickHouse позволяет создавать таблицы и базы данных в runtime, загружать данные и выполнять запросы без переконфигурирования и перезапуска сервера. - -

    2. Сжатие данных.

    - -Некоторые столбцовые СУБД (InfiniDB CE, MonetDB) не используют сжатие данных. Но сжатие данных действительно серьёзно увеличивает производительность. - -

    3. Хранение данных на диске.

    - -Многие столбцовые СУБД (SAP HANA, Google PowerDrill) могут работать только в оперативке. Но оперативки (даже на тысячах серверах) слишком мало для хранения всех хитов и визитов в Яндекс.Метрике. - -

    4. Параллельная обработка запроса на многих процессорных ядрах.

    - -Большие запросы естественным образом распараллеливаются. - -

    5. Распределённая обработка запроса на многих серверах.

    - -Почти все перечисленные ранее столбцовые СУБД не поддерживают распределённую обработку запроса. -В ClickHouse данные могут быть расположены на разных шардах. Каждый шард может представлять собой группу реплик, которые используются для отказоустойчивости. Запрос будет выполнен на всех шардах параллельно. Это делается прозрачно для пользователя. - -

    6. Поддержка SQL.

    - -Если вы знаете, что такое стандартный SQL, то говорить о поддержке SQL всё-таки нельзя. -Не поддерживаются NULL-ы. Все функции названы по-другому. -Тем не менее, это - декларативный язык запросов на основе SQL и во многих случаях не отличимый от SQL. -Поддерживаются JOIN-ы. Поддерживаются подзапросы в секциях FROM, IN, JOIN, а также скалярные подзапросы. -Зависимые подзапросы не поддерживаются. - -
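-For example, a non-correlated subquery in the IN section works as expected (the hits table comes from the examples in this document; the threshold is illustrative):
-
-%%
-SELECT count()
-FROM hits
-WHERE CounterID IN
-(
-    SELECT CounterID
-    FROM hits
-    GROUP BY CounterID
-    HAVING count() > 100000
-)
-%%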

    7. Векторный движок.

    - -Данные не только хранятся по столбцам, но и обрабатываются по векторам - кусочкам столбцов. За счёт этого достигается высокая эффективность по CPU. - -

    8. Обновление данных в реальном времени.

    - -ClickHouse поддерживает таблицы с первичным ключом. Для того, чтобы можно было быстро выполнять запросы по диапазону первичного ключа, данные инкрементально сортируются с помощью merge дерева. За счёт этого, поддерживается постоянное добавление данных в таблицу. Блокировки при добавлении данных отсутствуют. - -
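-A sketch of a table with a primary key, assuming the classic MergeTree engine parameters of this documentation's era (date column, primary key tuple, index granularity); the column set is illustrative:
-
-%%
-CREATE TABLE hits_example
-(
-    EventDate Date,
-    CounterID UInt32,
-    UserID    UInt64
-) ENGINE = MergeTree(EventDate, (CounterID, EventDate), 8192)
-%%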

    9. Наличие индексов.

    - -Наличие первичного ключа позволяет, например, вынимать данные для конкретных клиентов (счётчиков Метрики), для заданного диапазона времени, с низкими задержками - менее десятков миллисекунд. - -

    10. Подходит для онлайн запросов.

    - -Это позволяет использовать систему в качестве бэкенда для веб-интерфейса. Низкие задержки позволяют не откладывать выполнение запроса, а выполнять его в момент загрузки страницы интерфейса Яндекс.Метрики. То есть, в режиме онлайн. - -

    11. Поддержка приближённых вычислений.

    - -1. Система содержит агрегатные функции для приближённого вычисления количества различных значений, медианы и квантилей. -2. Поддерживается возможность выполнить запрос на основе части (выборки) данных и получить приближённый результат. При этом, с диска будет считано пропорционально меньше данных. -3. Поддерживается возможность выполнить агрегацию не для всех ключей, а для ограниченного количества первых попавшихся ключей. При выполнении некоторых условий на распределение ключей в данных, это позволяет получить достаточно точный результат с использованием меньшего количества ресурсов. - -
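-A sketch that combines these features: uniq gives an approximate count of distinct values, and SAMPLE reads only part of the data (the table is assumed to have been created with sampling support; the column names and the CounterID value are illustrative):
-
-%%
-SELECT uniq(UserID)
-FROM hits
-SAMPLE 0.1
-WHERE CounterID = 34
-%%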

    14. Репликация данных, поддержка целостности данных на репликах.

    - -Используется асинхронная multimaster репликация. После записи на любую доступную реплику, данные распространяются на все остальные реплики. Система поддерживает полную идентичность данных на разных репликах. Восстановление после сбоя осуществляется автоматически, а в сложных случаях - "по кнопке". -Подробнее смотрите раздел "Репликация данных". - -==Особенности ClickHouse, которые могут считаться недостатками== - -1. Отсутствие транзакций. - -2. Необходимо, чтобы результат выполнения запроса, в случае агрегации, помещался в оперативку на одном сервере. -Объём исходных данных для запроса, при этом, может быть сколь угодно большим. - -3. Отсутствие полноценной реализации UPDATE/DELETE. - -==Постановка задачи в Яндекс.Метрике== - -Нужно получать произвольные отчёты на основе хитов и визитов, с произвольными сегментами, задаваемыми пользователем. Данные для отчётов обновляются в реальном времени. Запросы должны выполняться сразу (в режиме онлайн). Отчёты должно быть возможно строить за произвольный период. Требуется вычислять сложные агрегаты типа количества уникальных посетителей. -На данный момент (апрель 2014), каждый день в Яндекс.Метрику поступает около 12 миллиардов событий (хитов и кликов мыши). Все эти события должны быть сохранены для возможности строить произвольные отчёты. Один запрос может потребовать просканировать сотни миллионов строк за время не более нескольких секунд, или миллионы строк за время не более нескольких сотен миллисекунд. - -===Агрегированные и неагрегированные данные=== - -Существует мнение, что для того, чтобы эффективно считать статистику, данные нужно агрегировать, так как это позволяет уменьшить объём данных. - -Но агрегированные данные являются очень ограниченным решением, по следующим причинам: -- вы должны заранее знать перечень отчётов, необходимых пользователю; -- то есть, пользователь не может построить произвольный отчёт; -- при агрегации по большому количеству ключей, объём данных не уменьшается и агрегация бесполезна; -- при большом количестве отчётов, получается слишком много вариантов агрегации (комбинаторный взрыв); -- при агрегации по ключам высокой кардинальности (например, URL) объём данных уменьшается не сильно (менее чем в 2 раза); -- из-за этого, объём данных при агрегации может не уменьшиться, а вырасти; -- пользователи будут смотреть не все отчёты, которые мы для них посчитаем - то есть, большая часть вычислений бесполезна; -- возможно нарушение логической целостности данных для разных агрегаций; - -Как видно, если ничего не агрегировать, и работать с неагрегированными данными, то это даже может уменьшить объём вычислений. - -Впрочем, при агрегации, существенная часть работы выносится в оффлайне, и её можно делать сравнительно спокойно. Для сравнения, при онлайн вычислениях, вычисления надо делать так быстро, как это возможно, так как именно в момент вычислений пользователь ждёт результата. - -В Яндекс.Метрике есть специализированная система для агрегированных данных - Metrage, на основе которой работает большинство отчётов. -Также в Яндекс.Метрике с 2009 года использовалась специализированная OLAP БД для неагрегированных данных - OLAPServer, на основе которой раньше работал конструктор отчётов. -OLAPServer хорошо подходил для неагрегированных данных, но содержал много ограничений, не позволяющих использовать его для всех отчётах так, как хочется: отсутствие поддержки типов данных (только числа), невозможность инкрементального обновления данных в реальном времени (только перезаписью данных за сутки). 
OLAPServer не является СУБД, а является специализированной БД. - -Чтобы снять ограничения OLAPServer-а и решить задачу работы с неагрегированными данными для всех отчётов, разработана СУБД ClickHouse. - -==Использование в Яндекс.Метрике и других отделах Яндекса== - -В Яндекс.Метрике ClickHouse используется для нескольких задач. -Основная задача - построение отчётов в режиме онлайн по неагрегированным данным. Для решения этой задачи используется кластер из 374 серверов, хранящий более 20,3 триллионов строк в базе данных. Объём сжатых данных, без учёта дублирования и репликации, составляет около 2 ПБ. Объём несжатых данных (в формате tsv) составил бы, приблизительно, 17 ПБ. - -Также ClickHouse используется: -- для хранения данных Вебвизора; -- для обработки промежуточных данных; -- для построения глобальных отчётов Аналитиками; -- для выполнения запросов в целях отладки движка Метрики; -- для анализа логов работы API и пользовательского интерфейса. - - -ClickHouse имеет более десятка инсталляций в других отделах Яндекса: в Вертикальных сервисах, Маркете, Директе, БК, Бизнес аналитике, Мобильной разработке, AdFox, Персональных сервисах и т п. - - -==Возможные аналоги== - -Доступных аналогов нет. -На данный момент (май 2016) не существует доступных (open-source, бесплатных) систем, обладающих всеми перечисленными выше возможностями. -Но эти возможности являются абсолютно строго необходимыми для Яндекс.Метрики. - - -==Возможные глупые вопросы== - -

    1. Почему бы не использовать системы типа map-reduce?

    - -Системами типа map-reduce будем называть системы распределённых вычислений, в которых операция reduce сделана на основе распределённой сортировки. Таким образом, к ним относятся YAMR, Hadoop, YT. - -Такие системы не подходят для онлайн запросов в силу слишком большой latency. То есть, не могут быть использованы в качестве бэкенда для веб-интерфейса. -Такие системы не подходят для обновления данных в реальном времени. -Распределённая сортировка не является оптимальным способом выполнения операции reduce, если результат выполнения операции и все промежуточные результаты, при их наличии, помещаются в оперативку на одном сервере, как обычно бывает в запросах, выполняющихся в режиме онлайн. В таком случае, оптимальным способом выполнения операции reduce является хэш-таблица. Частым способом оптимизации map-reduce задач является предагрегация (частичный reduce) с использованием хэш-таблицы в оперативке. Эта оптимизация делается пользователем в ручном режиме. -Распределённая сортировка является основной причиной тормозов при выполнении несложных map-reduce задач. - -Системы типа map-reduce позволяют выполнять произвольный код на кластере. Но для OLAP задач лучше подходит декларативный язык запросов, который позволяет быстро проводить исследования. Для примера, для Hadoop существует Hive и Pig. Также смотрите Cloudera Impala, Shark (устаревший) для Spark а также Spark SQL, Presto, Apache Drill. Впрочем, производительность при выполнении таких задач является сильно неоптимальной по сравнению со специализированными системами, а сравнительно высокая latency не позволяет использовать эти системы в качестве бэкенда для веб-интерфейса. - -YT позволяет хранить группы столбцов по отдельности. Но YT нельзя назвать по-настоящему столбцовой системой, так как в системе отсутствуют типы данных постоянной длины (чтобы можно было эффективно хранить числа без "мусора"), а также за счёт отсутствия векторного движка. Задачи в YT выполняются с помощью произвольного кода в режиме streaming, то есть, не могут быть достаточно оптимизированы (до сотен миллионов строк в секунду на один сервер). В YT в 2014-2016 годах находится в разработке функциональность "динамических сортированных таблиц" с использованием Merge Tree, строгой типизацией значений и языком запросов типа SQL. Динамические сортированные таблицы не подходят для OLAP задач, так как данные в них хранятся по строкам. Разработка языка запросов в YT всё ещё находится в зачаточной стадии, что не позволяет ориентироваться на эту функциональность. Разработчики YT рассматривают динамические сортированные таблицы для применения в OLTP и Key-Value сценариях работы. - - -==Производительность== - -По результатам внутреннего тестирования, ClickHouse обладает наиболее высокой производительностью (как наиболее высоким throughput на длинных запросах, так и наиболее низкой latency на коротких запросах), при соответствующем сценарии работы, среди доступных для тестирования систем подобного класса. Результаты тестирования можно посмотреть на отдельной странице. - - -===Пропускная способность при обработке одного большого запроса=== - -Пропускную способность можно измерять в строчках в секунду и в мегабайтах в секунду. При условии, что данные помещаются в page cache, не слишком сложный запрос обрабатывается на современном железе со скоростью около 2-10 GB/sec. несжатых данных на одном сервере (в простейшем случае скорость может достигать 30 GB/sec). 
Если данные не помещаются в page cache, то скорость работы зависит от скорости дисковой подсистемы и коэффициента сжатия данных. Например, если дисковая подсистема позволяет читать данные со скоростью 400 MB/sec., а коэффициент сжатия данных составляет 3, то скорость будет около 1.2GB/sec. Для получения скорости в строчках в секунду, следует поделить скорость в байтах в секунду на суммарный размер используемых в запросе столбцов. Например, если вынимаются столбцы на 10 байт, то скорость будет в районе 100-200 млн. строчек в секунду. - -При распределённой обработке запроса, скорость обработки запроса растёт почти линейно, но только при условии, что в результате агрегации или при сортировке получается не слишком большое множество строчек. - -===Задержки при обработке коротких запросов.=== - -Если запрос использует первичный ключ, и выбирает для обработки не слишком большое количество строчек (сотни тысяч), и использует не слишком большое количество столбцов, то вы можете рассчитывать на latency менее 50 миллисекунд (от единиц миллисекунд в лучшем случае), при условии, что данные помещаются в page cache. Иначе latency вычисляется из количества seek-ов. Если вы используйте вращающиеся диски, то на не слишком сильно нагруженной системе, latency вычисляется по формуле: seek time (10 мс.) * количество столбцов в запросе * количество кусков с данными. - -===Пропускная способность при обработке большого количества коротких запросов.=== - -При тех же условиях, ClickHouse может обработать несколько сотен (до нескольких тысяч в лучшем случае) запросов в секунду на одном сервере. Так как такой сценарий работы не является типичным для аналитических СУБД, рекомендуется рассчитывать не более чем на 100 запросов в секунду. - -===Производительность при вставке данных.=== - -Данные рекомендуется вставлять пачками не менее 1000 строк или не более одного запроса в секунду. При вставке в таблицу типа MergeTree из tab-separated дампа, скорость вставки будет в районе 50-200 МБ/сек. Если вставляются строчки размером около 1 КБ, то скорость будет в районе 50 000 - 200 000 строчек в секунду. Если строчки маленькие - производительность в строчках в секунду будет выше (на данных БК - > 500 000 строк в секунду, на данных Graphite - > 1 000 000 строк в секунду). Для увеличения производительности, можно производить несколько запросов INSERT параллельно - при этом производительность растёт линейно. - -
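-As a rough worked example of the seek-time formula from the short-query latency subsection above (all numbers are illustrative): a query that touches 5 columns over 4 data parts on rotating disks costs approximately
-
-%%
-10 ms * 5 columns * 4 parts = 200 ms
-%%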
    - -
    -

    Начало работы

    -
    - -
    - -==Системные требования== - -Система некроссплатформенная. Требуется ОС Linux Ubuntu не более старая, чем Precise (12.04); архитектура x86_64 с поддержкой набора инструкций SSE 4.2. -Для проверки наличия SSE 4.2, выполните: -%%grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not supported"%% - -Рекомендуется использовать Ubuntu Trusty или Ubuntu Xenial или Ubuntu Precise. -Терминал должен работать в кодировке UTF-8 (как по умолчанию в Ubuntu). - - -==Установка== - -В целях тестирования и разработки, система может быть установлена на один сервер или на рабочий компьютер. - - -===Установка из пакетов=== - -Пропишите в %%/etc/apt/sources.list%% (или в отдельный файл %%/etc/apt/sources.list.d/clickhouse.list%%) репозитории: - -На Ubuntu Trusty (14.04): - -%% -deb http://repo.yandex.ru/clickhouse/trusty stable main -%% - -На других версиях Ubuntu, замените %%trusty%% на %%xenial%% или %%precise%%. - - -Затем выполните: - -%% -sudo apt-key adv --keyserver keyserver.ubuntu.com --recv E0C56BD4 # optional -sudo apt-get update -sudo apt-get install -y clickhouse-client clickhouse-server -%% - -Также можно скачать и установить пакеты вручную, отсюда: -http://repo.yandex.ru/clickhouse/trusty/pool/main/c/clickhouse/, -http://repo.yandex.ru/clickhouse/xenial/pool/main/c/clickhouse/, -http://repo.yandex.ru/clickhouse/precise/pool/main/c/clickhouse/. - -ClickHouse содержит настройки ограничения доступа. Они расположены в файле users.xml (рядом с config.xml). -По умолчанию, разрешён доступ отовсюду для пользователя default без пароля. См. секцию users/default/networks. -Подробнее смотрите в разделе "конфигурационные файлы". - - -===Установка из исходников=== - -Для сборки воспользуйтесь инструкцией build.md (для Linux) или build_osx.md (для Mac OS X). - -Вы можете собрать пакеты и установить их. -Также вы можете использовать программы без установки пакетов. - -Клиент: dbms/programs/clickhouse-client -Сервер: dbms/programs/clickhouse-server - -Для сервера создаёте директории с данными, например: - -%% -/var/lib/clickhouse/data/default/ -/var/lib/clickhouse/metadata/default/ -%% - -(Настраивается в конфиге сервера.) -Сделайте chown под нужного пользователя. - -Обратите внимание на путь к логам в конфиге сервера (src/dbms/programs/server/config.xml). - - -===Другие методы установки=== - -Docker образ: https://hub.docker.com/r/yandex/clickhouse-server/ - -Gentoo overlay: https://github.com/kmeaw/clickhouse-overlay - - -===Запуск=== - -Для запуска сервера (в качестве демона), выполните: - -
    -sudo service clickhouse-server start
    -
    - -Смотрите логи в директории - -%% -/var/log/clickhouse-server/ -%% - -Если сервер не стартует - проверьте правильность конфигурации в файле - -%% -/etc/clickhouse-server/config.xml -%% - -Также можно запустить сервер из консоли: - -
    -clickhouse-server --config-file=/etc/clickhouse-server/config.xml
    -
    - -При этом, лог будет выводиться в консоль - удобно для разработки. -Если конфигурационный файл лежит в текущей директории, то указывать параметр --config-file не требуется - по умолчанию будет использован файл ./config.xml - -Соединиться с сервером можно с помощью клиента командной строки: - -
    -clickhouse-client
    -
    - -Параметры по умолчанию обозначают - соединяться с localhost:9000, от имени пользователя default без пароля. -Клиент может быть использован для соединения с удалённым сервером. Пример: - -
    -clickhouse-client --host=example.com
    -
    - -Подробнее смотри раздел "Клиент командной строки". - -Проверим работоспособность системы: - -
    -milovidov@milovidov-Latitude-E6320:~/work/metrica/src/dbms/src/Client$ ./clickhouse-client
    -ClickHouse client version 0.0.18749.
    -Connecting to localhost:9000.
    -Connected to ClickHouse server version 0.0.18749.
    -
    -:) SELECT 1
    -
    -SELECT 1
    -
    -┌─1─┐
    -│ 1 │
    -└───┘
    -
    -1 rows in set. Elapsed: 0.003 sec.
    -
    -:)
    -
    - -Поздравляю, система работает! - -==Тестовые данные== - -Если вы сотрудник Яндекса, вы можете воспользоваться тестовыми данными Яндекс.Метрики для изучения возможностей системы. -Как загрузить тестовые данные, написано здесь. - -Если вы внешний пользователь системы, вы можете воспользоваться использовать общедоступные данные, способы загрузки которых указаны здесь. - - -==Если возникли вопросы== - -Если вы являетесь сотрудником Яндекса, обращайтесь на внутреннюю рассылку по ClickHouse. -Вы можете подписаться на эту рассылку, чтобы получать анонсы, быть в курсе нововведений, а также видеть вопросы, которые возникают у других пользователей. - -Иначе вы можете задавать вопросы на Stack Overflow или участвовать в обсуждениях на Google Groups. Также вы можете отправить приватное сообщение для разработчиков по адресу clickhouse-feedback@yandex-team.com. - - - -
    -
    -

    Интерфейсы

    -
    - -
    - -Для изучения возможностей системы, загрузки данных в таблицы, ручных запросов, используйте программу clickhouse-client. - - -Для удаленного доступа к ClickHouse пропишите в конфиг сервера: -
    <listen_host>::</listen_host>
    - -==HTTP интерфейс== - - -HTTP интерфейс позволяет использовать ClickHouse на любой платформе, из любого языка программирования. У нас он используется для работы из Java и Perl, а также из shell-скриптов. В других отделах, HTTP интерфейс используется из Perl, Python и Go. HTTP интерфейс более ограничен по сравнению с родным интерфейсом, но является более совместимым. - -По умолчанию, clickhouse-server слушает HTTP на порту 8123 (это можно изменить в конфиге). -Если запросить GET / без параметров, то вернётся строка "Ok." (с переводом строки на конце). Это может быть использовано в скриптах проверки живости. - -
    -$ curl 'http://localhost:8123/'
    -Ok.
    -
    - -Запрос отправляется в виде параметра URL query. Или POST-ом. Или начало запроса в параметре query, а продолжение POST-ом (зачем это нужно, будет объяснено ниже). Размер URL ограничен 16KB, это следует учитывать при отправке длинных запросов в параметре query. - -В случае успеха, вам вернётся код ответа 200 и результат обработки запроса в теле ответа. -В случае ошибки, вам вернётся код ответа 500 и текст с описанием ошибки в теле ответа. - -При использовании метода GET, выставляется настройка readonly. То есть, для запросов, модифицирующие данные, можно использовать только метод POST. Сам запрос при этом можно отправлять как в теле POST-а, так и в параметре URL. - -Примеры: - -
    -$ curl 'http://localhost:8123/?query=SELECT%201'
    -1
    -
    -$ wget -O- -q 'http://localhost:8123/?query=SELECT 1'
    -1
    -
    -$ GET 'http://localhost:8123/?query=SELECT 1'
    -1
    -
    -$ echo -ne 'GET /?query=SELECT%201 HTTP/1.0\r\n\r\n' | nc localhost 8123
    -HTTP/1.0 200 OK
    -Connection: Close
    -Date: Fri, 16 Nov 2012 19:21:50 GMT
    -
    -1
    -
    - -Как видно, curl немного неудобен тем, что надо экранировать пробелы в URL. -wget экранирует самостоятельно, но его не рекомендуется использовать, так как он плохо работает по HTTP 1.1 при использовании keep-alive и Transfer-Encoding: chunked. - -
    -$ echo 'SELECT 1' | curl 'http://localhost:8123/' --data-binary @-
    -1
    -
    -$ echo 'SELECT 1' | curl 'http://localhost:8123/?query=' --data-binary @-
    -1
    -
    -$ echo '1' | curl 'http://localhost:8123/?query=SELECT' --data-binary @-
    -1
    -
    - -Если часть запроса отправляется в параметре, а часть POST-ом, то между этими двумя кусками данных ставится перевод строки. -Пример (так работать не будет): - -
    -$ echo 'ECT 1' | curl 'http://localhost:8123/?query=SEL' --data-binary @-
    -Code: 59, e.displayText() = DB::Exception: Syntax error: failed at position 0: SEL
    -ECT 1
    -, expected One of: SHOW TABLES, SHOW DATABASES, SELECT, INSERT, CREATE, ATTACH, RENAME, DROP, DETACH, USE, SET, OPTIMIZE., e.what() = DB::Exception
    -
    - -По умолчанию, данные возвращаются в формате TabSeparated (подробнее смотри раздел "Форматы"). -Можно попросить любой другой формат - с помощью секции FORMAT запроса. - -
    -$ echo 'SELECT 1 FORMAT Pretty' | curl 'http://localhost:8123/?' --data-binary @-
    -┏━━━┓
    -┃ 1 ┃
    -┡━━━┩
    -│ 1 │
    -└───┘
    -
    - -Возможность передавать данные POST-ом нужна для INSERT-запросов. В этом случае вы можете написать начало запроса в параметре URL, а вставляемые данные передать POST-ом. Вставляемыми данными может быть, например, tab-separated дамп, полученный из MySQL. Таким образом, запрос INSERT заменяет LOAD DATA LOCAL INFILE из MySQL. - -Примеры: - -Создаём таблицу: - -
    -echo 'CREATE TABLE t (a UInt8) ENGINE = Memory' | POST 'http://localhost:8123/'
    -
    - -Используем привычный запрос INSERT для вставки данных: - -
    -echo 'INSERT INTO t VALUES (1),(2),(3)' | POST 'http://localhost:8123/'
    -
    - -Данные можно отправить отдельно от запроса: - -
    -echo '(4),(5),(6)' | POST 'http://localhost:8123/?query=INSERT INTO t VALUES'
    -
    - -Можно указать любой формат для данных. Формат Values - то же, что используется при записи INSERT INTO t VALUES: - -
    -echo '(7),(8),(9)' | POST 'http://localhost:8123/?query=INSERT INTO t FORMAT Values'
    -
    - -Можно вставить данные из tab-separated дампа, указав соответствующий формат: - -
    -echo -ne '10\n11\n12\n' | POST 'http://localhost:8123/?query=INSERT INTO t FORMAT TabSeparated'
    -
    - -Прочитаем содержимое таблицы. Данные выводятся в произвольном порядке из-за параллельной обработки запроса: - -
    -$ GET 'http://localhost:8123/?query=SELECT a FROM t'
    -7
    -8
    -9
    -10
    -11
    -12
    -1
    -2
    -3
    -4
    -5
    -6
    -
    - -Удаляем таблицу. - -
    -echo 'DROP TABLE t' | POST 'http://localhost:8123/'
    -
    - -Для запросов, которые не возвращают таблицу с данными, в случае успеха, выдаётся пустое тело ответа. - -Вы можете использовать сжатие при передаче данных. Формат сжатых данных нестандартный, и вам придётся использовать для работы с ним специальную программу clickhouse-compressor (%%sudo apt-get install clickhouse-utils%%). - -Если вы указали в URL compress=1, то сервер будет сжимать отправляемые вам данные. -Если вы указали в URL decompress=1, то сервер будет разжимать те данные, которые вы передаёте ему POST-ом. - -Это может быть использовано для уменьшения трафика по сети при передаче большого количества данных, а также для создания сразу сжатых дампов. - -В параметре URL database может быть указана БД по умолчанию. - -
    -$ echo 'SELECT number FROM numbers LIMIT 10' | curl 'http://localhost:8123/?database=system' --data-binary @-
    -0
    -1
    -2
    -3
    -4
    -5
    -6
    -7
    -8
    -9
    -
    - -По умолчанию используется БД, которая прописана в настройках сервера, как БД по умолчанию. По умолчанию, это - БД default. Также вы всегда можете указать БД через точку перед именем таблицы. - -Имя пользователя и пароль могут быть указаны в одном из двух вариантов: -1. С использованием HTTP Basic Authentification. Пример: -
    -echo 'SELECT 1' | curl 'http://user:password@localhost:8123/' -d @-
    -
    -2. В параметрах URL user и password. Пример: -
    -echo 'SELECT 1' | curl 'http://localhost:8123/?user=user&password=password' -d @-
    -
    -Если имя пользователя не указано, то используется имя пользователя default. Если пароль не указан, то используется пустой пароль. - -Также в параметрах URL вы можете указать любые настройки, которые будут использованы для обработки одного запроса, или целые профили настроек. Пример: - -%%http://localhost:8123/?profile=web&max_rows_to_read=1000000000&query=SELECT+1%% - -Подробнее см. раздел "Настройки". - -
    -$ echo 'SELECT number FROM system.numbers LIMIT 10' | curl 'http://localhost:8123/?' --data-binary @-
    -0
    -1
    -2
    -3
    -4
    -5
    -6
    -7
    -8
    -9
    -
    - -Об остальных параметрах смотри раздел "SET". - -В отличие от родного интерфейса, HTTP интерфейс не поддерживает понятие сессии и настройки в пределах сессии, не позволяет (вернее, позволяет лишь в некоторых случаях) прервать выполнение запроса, не показывает прогресс выполнения запроса. Парсинг и форматирование данных производится на стороне сервера и использование сети может быть неэффективным. - -Может быть передан необязательный параметр query_id - идентификатор запроса, произвольная строка. Подробнее смотрите раздел "Настройки, replace_running_query". - -Может быть передан необязательный параметр quota_key - ключ квоты, произвольная строка. Подробнее смотрите раздел "Квоты". - -HTTP интерфейс позволяет передать внешние данные (внешние временные таблицы) для использования запроса. Подробнее смотрите раздел "Внешние данные для обработки запроса" - -===Буферизация ответа=== -Существует возможность включить буферизацию ответа на стороне сервера. Для этого предусмотрены параметры URL buffer_size и wait_end_of_query. - -buffer_size определяет количество байт результата которые будут буферизованы в памяти сервера. Если тело результата больше этого порога, то буфер будет переписан в HTTP канал, а оставшиеся данные будут отправляться в HTTP-канал напрямую. - -Чтобы гарантировать буферизацию всего ответа необходимо выставить wait_end_of_query=1. В этом случае данные, не поместившиеся в памяти, будут буферизованы во временном файле сервера. - -Пример: -
    -curl -sS 'http://localhost:8123/?max_result_bytes=4000000&buffer_size=3000000&wait_end_of_query=1' -d 'SELECT toUInt8(number) FROM system.numbers LIMIT 9000000 FORMAT RowBinary'
    -
    - -Буферизация позволяет избежать ситуации когда код ответа и HTTP-заголовки были отправлены клиенту, после чего возникла ошибка выполнения запроса. В такой ситуации сообщение об ошибке записывается в конце тела ответа, и на стороне клиента ошибка может быть обнаружена только на этапе парсинга. - -==JDBC драйвер== - -Для ClickHouse существует официальный JDBC драйвер. Смотрите здесь. - - -==Библиотеки от сторонних разработчиков== - -Существуют библиотеки для работы с ClickHouse для Python (1, 2), PHP (1, 2, 3), Go (1, 2), Node.js (1, 2), Perl (1, 2, 3), Ruby (1), R (1), .NET (1), C++ (1). - -Библиотеки не тестировались нами. Порядок перечисления произвольный. - - -==GUI от сторонних разработчиков== - -Существует open source проект Tabix от компании СМИ2, в котором реализован графический веб-интерфейс для работы с ClickHouse. - -Ключевые возможности Tabix: -- работа с ClickHouse из браузера напрямую, без установки дополнительного ПО; -- редактор запросов, поддерживающий подсветку SQL-синтаксиса ClickHouse, автодополнения по всем объектам, включая словари, и контекстные подсказки для встроенных функций; -- построение графиков, диаграмм и отображение геокоординат для результатов запросов; -- интерактивный конструктор сводных таблиц (pivot) для результатов запросов; -- графические средства для анализа состояния ClickHouse; -- два цветовых оформления: светлое и темное. - -Документация Tabix - - -==Родной интерфейс (TCP)== - -Родной интерфейс используется в клиенте командной строки clickhouse-client, при межсерверном взаимодействии для распределённой обработки запроса, а также в программах на C++. Будет рассмотрен только клиент командной строки. - - -==Клиент командной строки== - -
    -$ clickhouse-client
    -ClickHouse client version 0.0.26176.
    -Connecting to localhost:9000.
    -Connected to ClickHouse server version 0.0.26176.
    -
    -:) SELECT 1
    -
    - -Программа clickhouse-client принимает следующие параметры, все из которых являются необязательными: - ---host, -h - имя сервера, по умолчанию - localhost. -Вы можете использовать как имя, так и IPv4 или IPv6 адрес. - ---port - порт, к которому соединяться, по умолчанию - 9000. -Замечу, что для HTTP и родного интерфейса используются разные порты. - ---user, -u - имя пользователя, по умолчанию - default. - ---password - пароль, по умолчанию - пустая строка. - ---query, -q - запрос для выполнения, при использовании в неинтерактивном режиме. - ---database, -d - выбрать текущую БД, по умолчанию - текущая БД из настроек сервера (по умолчанию - БД default). - ---multiline, -m - если указано - разрешить многострочные запросы, не отправлять запрос по нажатию Enter. - ---multiquery, -n - если указано - разрешить выполнять несколько запросов, разделённых точкой с запятой. -Работает только в неинтерактивном режиме. - ---format, -f - использовать указанный формат по умолчанию для вывода результата. - ---vertical, -E - если указано, использовать формат Vertical по умолчанию для вывода результата. То же самое, что --format=Vertical. В этом формате каждое значение выводится на отдельной строке, что удобно для отображения широких таблиц. - ---time, -t - если указано, в неинтерактивном режиме вывести время выполнения запроса в stderr. - ---stacktrace - если указано, в случае исключения, выводить также его стек трейс. - ---config-file - имя конфигурационного файла, в котором есть дополнительные настройки или изменены умолчания для настроек, указанных выше. -По умолчанию, ищутся файлы в следующем порядке: -./clickhouse-client.xml -~/.clickhouse-client/config.xml -/etc/clickhouse-client/config.xml -Настройки берутся только из первого найденного файла. - -Также вы можете указать любые настройки, которые будут использованы для обработки запросов. Например, %%clickhouse-client --max_threads=1%%. Подробнее см. раздел "Настройки". - -Клиент может быть использован в интерактивном и неинтерактивном (batch) режиме. -Чтобы использовать batch режим, укажите параметр query, или отправьте данные в stdin (проверяется, что stdin - не терминал), или и то, и другое. -Аналогично HTTP интерфейсу, при использовании одновременно параметра query и отправке данных в stdin, запрос составляется из конкатенации параметра query, перевода строки, и данных в stdin. Это удобно для больших INSERT запросов. - -Примеры использования клиента для вставки данных: - -%% -echo -ne "1, 'some text', '2016-08-14 00:00:00'\n2, 'some more text', '2016-08-14 00:00:01'" | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; - -cat <<_EOF | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; -3, 'some text', '2016-08-14 00:00:00' -4, 'some more text', '2016-08-14 00:00:01' -_EOF - -cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FORMAT CSV"; -%% - -В batch режиме в качестве формата данных по умолчанию используется формат TabSeparated. Формат может быть указан в секции FORMAT запроса. - -По умолчанию, в batch режиме вы можете выполнить только один запрос. Чтобы выполнить несколько запросов из "скрипта", используйте параметр --multiquery. Это работает для всех запросов кроме INSERT. Результаты запросов выводятся подряд без дополнительных разделителей. -Также, при необходимости выполнить много запросов, вы можете запускать clickhouse-client на каждый запрос. Заметим, что запуск программы clickhouse-client может занимать десятки миллисекунд. 
- -В интерактивном режиме, вы получите командную строку, в которую можно вводить запросы. - -Если не указано multiline (по умолчанию): -Чтобы выполнить запрос, нажмите Enter. Точка с запятой на конце запроса не обязательна. Чтобы ввести запрос, состоящий из нескольких строк, перед переводом строки, введите символ обратного слеша: %%\%% - тогда после нажатия Enter, вам предложат ввести следующую строку запроса. - -Если указано multiline (многострочный режим): -Чтобы выполнить запрос, завершите его точкой с запятой и нажмите Enter. Если в конце введённой строки не было точки с запятой, то вам предложат ввести следующую строчку запроса. - -Исполняется только один запрос, поэтому всё, что введено после точки с запятой, игнорируется. - -Вместо или после точки с запятой может быть указано %%\G%%. Это обозначает использование формата Vertical. В этом формате каждое значение выводится на отдельной строке, что удобно для широких таблиц. Столь необычная функциональность добавлена для совместимости с MySQL CLI. - -Командная строка сделана на основе readline (и history) (или libedit, или без какой-либо библиотеки, в зависимости от сборки) - то есть, в ней работают привычные сочетания клавиш, а также присутствует история. -История пишется в ~/.clickhouse-client-history. - -По умолчанию, в качестве формата, используется формат PrettyCompact (красивые таблички). Вы можете изменить формат с помощью секции FORMAT запроса, или с помощью указания \G на конце запроса, с помощью аргумента командной строки --format или --vertical, или с помощью конфигурационного файла клиента. - -Чтобы выйти из клиента, нажмите Ctrl+D (или Ctrl+C), или наберите вместо запроса одно из: -"exit", "quit", "logout", "учше", "йгше", "дщпщге", "exit;", "quit;", "logout;", "учшеж", "йгшеж", "дщпщгеж", "q", "й", "\q", "\Q", ":q", "\й", "\Й", "Жй" - -При выполнении запроса, клиент показывает: -1. Прогресс выполнение запроса, который обновляется не чаще, чем 10 раз в секунду (по умолчанию). При быстрых запросах, прогресс может не успеть отобразиться. -2. Отформатированный запрос после его парсинга - для отладки. -3. Результат в заданном формате. -4. Количество строк результата, прошедшее время, а также среднюю скорость выполнения запроса. - -Вы можете прервать длинный запрос, нажав Ctrl+C. При этом вам всё равно придётся чуть-чуть подождать, пока сервер остановит запрос. На некоторых стадиях выполнения, запрос невозможно прервать. Если вы не дождётесь и нажмёте Ctrl+C второй раз, то клиент будет завершён. - -Клиент командной строки позволяет передать внешние данные (внешние временные таблицы) для использования запроса. Подробнее смотрите раздел "Внешние данные для обработки запроса" - - -
    -
    -

=Язык запросов=
    -
    - -
    - -==Программа clickhouse-local== - -Программа clickhouse-local позволяет выполнять быструю обработку локальных файлов, хранящих таблицы, не прибегая к развертыванию и настройке clickhouse-server ... - -==Синтаксис== - -В системе есть два вида парсеров: полноценный парсер SQL (recursive descent parser) и парсер форматов данных (быстрый потоковый парсер). -Во всех случаях кроме запроса INSERT, используется только полноценный парсер SQL. -В запросе INSERT используется оба парсера: - -%%INSERT INTO t VALUES (1, 'Hello, world'), (2, 'abc'), (3, 'def')%% - -Фрагмент %%INSERT INTO t VALUES%% парсится полноценным парсером, а данные %%(1, 'Hello, world'), (2, 'abc'), (3, 'def')%% - быстрым потоковым парсером. -Данные могут иметь любой формат. При получении запроса, сервер заранее считывает в оперативку не более max_query_size байт запроса (по умолчанию, 1МБ), а всё остальное обрабатывается потоково. -Таким образом, в системе нет проблем с большими INSERT запросами, как в MySQL. - -При использовании формата Values в INSERT запросе может сложиться иллюзия, что данные парсятся также, как выражения в запросе SELECT, но это не так - формат Values гораздо более ограничен. - -Далее пойдёт речь о полноценном парсере. О парсерах форматов, смотри раздел "Форматы". - -===Пробелы=== - -Между синтаксическими конструкциями (в том числе, в начале и конце запроса) может быть расположено произвольное количество пробельных символов. К пробельным символам относятся пробел, таб, перевод строки, CR, form feed. - -===Комментарии=== - -Поддерживаются комментарии в SQL-стиле и C-стиле. -Комментарии в SQL-стиле: от %%--%% до конца строки. Пробел после %%--%% может не ставиться. -Комментарии в C-стиле: от %%/*%% до %%*/%%. Такие комментарии могут быть многострочными. Пробелы тоже не обязательны. - -===Ключевые слова=== - -Ключевые слова (например, SELECT) регистронезависимы. Всё остальное (имена столбцов, функций и т. п.), в отличие от стандарта SQL, регистрозависимо. Ключевые слова не зарезервированы (а всего лишь парсятся как ключевые слова в соответствующем контексте). - -===Идентификаторы=== - -Идентификаторы (имена столбцов, функций, типов данных) могут быть квотированными или не квотированными. -Не квотированные идентификаторы начинаются на букву латинского алфавита или подчёркивание; продолжаются на букву латинского алфавита или подчёркивание или цифру. Короче говоря, должны соответствовать регулярному выражению %%^[a-zA-Z_][0-9a-zA-Z_]*$%%. Примеры: %%x%%, %%_1%%, %%X_y__Z123_%%. -Квотированные идентификаторы расположены в обратных кавычках %%`id`%% (также, как в MySQL), и могут обозначать произвольный (непустой) набор байт. При этом, внутри записи такого идентификатора, символы (например, символ обратной кавычки) могут экранироваться с помощью обратного слеша. Правила экранирования такие же, как в строковых литералах (см. ниже). -Рекомендуется использовать идентификаторы, которые не нужно квотировать. - -===Литералы=== - -Бывают числовые, строковые и составные литералы. - -
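The comment styles and identifier quoting rules from the «Комментарии» and «Идентификаторы» subsections above can be combined in a single query. A minimal illustrative sketch (column names are arbitrary):

%%
-- a SQL-style comment: from -- to the end of the line
SELECT
    1 AS x,
    /* a C-style comment,
       possibly spanning several lines */
    2 AS `quoted identifier`   -- a back-quoted identifier may contain arbitrary bytes
%%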

====Числовые литералы====
    - -Числовой литерал пытается распарситься: -- сначала как 64-битное число без знака, с помощью функции strtoull; -- если не получилось - то как 64-битное число со знаком, с помощью функции strtoll; -- если не получилось - то как число с плавающей запятой, с помощью функции strtod; -- иначе - ошибка. - -Соответствующее значение будет иметь тип минимального размера, который вмещает значение. -Например, 1 парсится как UInt8, а 256 - как UInt16. Подробнее смотрите "Типы данных". - -Примеры: %%1%%, %%18446744073709551615%%, %%0xDEADBEEF%%, %%01%%, %%0.1%%, %%1e100%%, %%-1e-100%%, %%inf%%, %%nan%%. - -
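One way to check which minimal type a numeric literal receives is the toTypeName function; a short sketch with the expected results in comments:

%%
SELECT
    toTypeName(1)   AS t1,  -- UInt8
    toTypeName(256) AS t2,  -- UInt16
    toTypeName(-1)  AS t3,  -- Int8
    toTypeName(0.1) AS t4   -- Float64
%%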

====Строковые литералы====
    - -Поддерживаются только строковые литералы в одинарных кавычках. Символы внутри могут быть экранированы с помощью обратного слеша. Следующие escape-последовательности имеют соответствующее специальное значение: %%\b%%, %%\f%%, %%\r%%, %%\n%%, %%\t%%, %%\0%%, %%\a%%, %%\v%%, \xHH. Во всех остальных случаях, последовательности вида \c, где c - любой символ, преобразуется в c. Таким образом, могут быть использованы последовательности %%\'%% и %%\\%%. Значение будет иметь тип String. - -Минимальный набор символов, которых вам необходимо экранировать в строковых литералах: %%'%% and %%\%%. - -
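A small sketch of the escaping rules described above (expected results are shown in comments):

%%
SELECT
    'It\'s a tab:\there' AS s,    -- a single quote and a tab escaped with a backslash
    length('\\') AS one_byte,     -- 1: an escaped backslash is a single byte
    '\x41' AS hex_escape          -- 'A': \xHH denotes a byte by its hex code
%%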

====Составные литералы====
    - -Поддерживаются конструкции для массивов: %%[1, 2, 3]%% и кортежей: %%(1, 'Hello, world!', 2)%%. -На самом деле, это вовсе не литералы, а выражение с оператором создания массива и оператором создания кортежа, соответственно. -Подробнее смотри в разделе "Операторы2". -Массив должен состоять хотя бы из одного элемента, а кортеж - хотя бы из двух. -Кортежи носят служебное значение для использования в секции IN запроса SELECT. Кортежи могут быть получены в качестве результата запроса, но не могут быть сохранены в базу (за исключением таблиц типа Memory). - -===Функции=== - -Функции записываются как идентификатор со списком аргументов (возможно, пустым) в скобках. В отличие от стандартного SQL, даже в случае пустого списка аргументов, скобки обязательны. Пример: %%now()%%. -Бывают обычные и агрегатные функции (смотрите раздел "Агрегатные функции"). Некоторые агрегатные функции могут содержать два списка аргументов в круглых скобках. Пример: %%quantile(0.9)(x)%%. Такие агрегатные функции называются "параметрическими", а первый список аргументов называется "параметрами". Синтаксис агрегатных функций без параметров ничем не отличается от обычных функций. - -===Операторы=== - -Операторы преобразуются в соответствующие им функции во время парсинга запроса, с учётом их приоритета и ассоциативности. -Например, выражение %%1 + 2 * 3 + 4%% преобразуется в %%plus(plus(1, multiply(2, 3)), 4)%%. -Подробнее смотрите раздел "Операторы2" ниже. - -===Типы данных и движки таблиц=== - -Типы данных и движки таблиц в запросе CREATE записываются также, как идентификаторы или также как функции. То есть, могут содержать или не содержать список аргументов в круглых скобках. Подробнее смотрите разделы "Типы данных", "Движки таблиц", "CREATE". - -===Синонимы=== - -В запросе SELECT, в выражениях могут быть указаны синонимы с помощью ключевого слова AS. Слева от AS стоит любое выражение. Справа от AS стоит идентификатор - имя для синонима. В отличие от стандартного SQL, синонимы могут объявляться не только на верхнем уровне выражений: - -%%SELECT (1 AS n) + 2, n%% - -В отличие от стандартного SQL, синонимы могут использоваться во всех секциях запроса, а не только SELECT. - -===Звёздочка=== - -В запросе SELECT, вместо выражения может стоять звёздочка. Подробнее смотрите раздел "SELECT". - -===Выражения=== - -Выражение представляет собой функцию, идентификатор, литерал, применение оператора, выражение в скобках, подзапрос, звёздочку; и может содержать синоним. -Список выражений - одно выражение или несколько выражений через запятую. -Функции и операторы, в свою очередь, в качестве аргументов, могут иметь произвольные выражения. - - -==Запросы== - - -===CREATE DATABASE=== - -%%CREATE DATABASE [IF NOT EXISTS] db_name%% - -- создаёт базу данных db_name. База данных - это просто директория для таблиц. -Если написано IF NOT EXISTS, то запрос не будет возвращать ошибку, если база данных уже существует. - -===CREATE TABLE=== - -Запрос CREATE TABLE может иметь несколько форм. - -%%CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db.]name -( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], - ... -) ENGINE = engine%% - -Создаёт таблицу с именем name в БД db или текущей БД, если db не указана, со структурой, указанной в скобках, и движком engine. -Структура таблицы представляет список описаний столбцов. Индексы, если поддерживаются движком, указываются в качестве параметров для движка таблицы. - -Описание столбца, это %%name type%%, в простейшем случае. Пример: %%RegionID UInt32%%. 
-Также могут быть указаны выражения для значений по умолчанию - смотрите ниже. - -%%CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db.]name AS [db2.]name2 [ENGINE = engine]%% - -Создаёт таблицу с такой же структурой, как другая таблица. Можно указать другой движок для таблицы. Если движок не указан, то будет выбран такой же движок, как у таблицы db2.name2. - -%%CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db.]name ENGINE = engine AS SELECT ...%% - -Создаёт таблицу со структурой, как результат запроса SELECT, с движком engine, и заполняет её данными из SELECT-а. - -Во всех случаях, если указано IF NOT EXISTS, то запрос не будет возвращать ошибку, если таблица уже существует. В этом случае, запрос будет ничего не делать. - -
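A sketch of the three forms of CREATE TABLE described above, using hypothetical table and column names (the Memory and TinyLog engines are chosen only to keep the example short):

%%
-- explicit structure
CREATE TABLE IF NOT EXISTS test.events
(
    EventDate Date,
    EventTime DateTime,
    UserID UInt64
) ENGINE = Memory;

-- same structure as another table, possibly with a different engine
CREATE TABLE test.events_copy AS test.events ENGINE = TinyLog;

-- structure and data taken from the result of a SELECT
CREATE TABLE test.events_2014 ENGINE = Memory
AS SELECT * FROM test.events WHERE toYear(EventDate) = 2014;
%%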

====Значения по умолчанию====
    - -В описании столбца, может быть указано выражение для значения по умолчанию, одного из следующих видов: -%%DEFAULT expr%%, %%MATERIALIZED expr%%, %%ALIAS expr%%. -Пример: %%URLDomain String DEFAULT domain(URL)%%. - -Если выражение для значения по умолчанию не указано, то в качестве значений по умолчанию будут использоваться нули для чисел, пустые строки для строк, пустые массивы для массивов, а также 0000-00-00 для дат и 0000-00-00 00:00:00 для дат с временем. NULL-ы не поддерживаются. - -В случае, если указано выражение по умолчанию, то указание типа столбца не обязательно. При отсутствии явно указанного типа, будет использован тип выражения по умолчанию. Пример: %%EventDate DEFAULT toDate(EventTime)%% - для столбца EventDate будет использован тип Date. - -При наличии явно указанного типа данных и выражения по умолчанию, это выражение будет приводиться к указанному типу с использованием функций приведения типа. Пример: %%Hits UInt32 DEFAULT 0%% - имеет такой же смысл, как %%Hits UInt32 DEFAULT toUInt32(0)%%. - -В качестве выражения для умолчания, может быть указано произвольное выражение от констант и столбцов таблицы. При создании и изменении структуры таблицы, проверяется, что выражения не содержат циклов. При INSERT-е проверяется разрешимость выражений - что все столбцы, из которых их можно вычислить, переданы. - -%%DEFAULT expr%% - -Обычное значение по умолчанию. Если в запросе INSERT не указан соответствующий столбец, то он будет заполнен путём вычисления соответствующего выражения. - -%%MATERIALIZED expr%% - -Материализованное выражение. Такой столбец не может быть указан при INSERT-е, то есть, он всегда вычисляется. -При INSERT-е без указания списка столбцов, такие столбцы не рассматриваются. -Также этот столбец не подставляется при использовании звёздочки в запросе SELECT - чтобы сохранить инвариант, что дамп, полученный путём SELECT *, можно вставить обратно в таблицу INSERT-ом без указания списка столбцов. - -%%ALIAS expr%% - -Синоним. Такой столбец вообще не хранится в таблице. -Его значения не могут быть вставлены в таблицу, он не подставляется при использовании звёздочки в запросе SELECT. -Он может быть использован в SELECT-ах - в таком случае, во время разбора запроса, алиас раскрывается. - -При добавлении новых столбцов с помощью запроса ALTER, старые данные для этих столбцов не записываются. Вместо этого, при чтении старых данных, для которых отсутствуют значения новых столбцов, выполняется вычисление выражений по умолчанию налету. При этом, если выполнение выражения требует использования других столбцов, не указанных в запросе, то эти столбцы будут дополнительно прочитаны, но только для тех блоков данных, для которых это необходимо. - -Если добавить в таблицу новый столбец, а через некоторое время изменить его выражение по умолчанию, то используемые значения для старых данных (для данных, где значения не хранились на диске) поменяются. Также заметим, что при выполнении фоновых слияний, данные для столбцов, отсутствующих в одном из сливаемых кусков, записываются в объединённый кусок. - -Отсутствует возможность задать значения по умолчанию для элементов вложенных структур данных. - - -
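A sketch combining the three kinds of default expressions in one table definition (hypothetical names; the Memory engine is used only for brevity):

%%
CREATE TABLE test.hits                           -- hypothetical table
(
    EventTime DateTime,
    EventDate DEFAULT toDate(EventTime),         -- the Date type is inferred from the expression
    URL String,
    URLDomain String MATERIALIZED domain(URL),   -- always computed; not returned by SELECT *
    Referer String,
    RefererDomain ALIAS domain(Referer)          -- not stored; expanded while parsing the query
) ENGINE = Memory
%%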

====Временные таблицы====
    - -Во всех случаях, если указано TEMPORARY, то будет создана временная таблица. Временные таблицы обладают следующими особенностями: -- временные таблицы исчезают после завершения сессии; в том числе, при обрыве соединения; -- временная таблица создаётся с движком Memory; все остальные движки таблиц не поддерживаются; -- для временной таблицы нет возможности указать БД: она создаётся вне баз данных; -- если временная таблица имеет то же имя, что и некоторая другая, то, при упоминании в запросе без указания БД, будет использована временная таблица; -- при распределённой обработке запроса, используемые в запросе временные таблицы, передаются на удалённые серверы. - -В большинстве случаев, временные таблицы создаются не вручную, а при использовании внешних данных для запроса, или при распределённом (GLOBAL) IN. Подробнее см. соответствующие разделы. - -===CREATE VIEW=== - -%%CREATE [MATERIALIZED] VIEW [IF NOT EXISTS] [db.]name [ENGINE = engine] [POPULATE] AS SELECT ...%% - -Создаёт представление. Представления бывают двух видов - обычные и материализованные (MATERIALIZED). - -Обычные представления не хранят никаких данных, а всего лишь производят чтение из другой таблицы. То есть, обычное представление - не более чем сохранённый запрос. При чтении из представления, этот сохранённый запрос, используется в качестве подзапроса в секции FROM. - -Для примера, пусть вы создали представление: -%%CREATE VIEW view AS SELECT ...%% -и написали запрос: -%%SELECT a, b, c FROM view%% -Этот запрос полностью эквивалентен использованию подзапроса: -%%SELECT a, b, c FROM (SELECT ...)%% - - -Материализованные (MATERIALIZED) представления хранят данные, преобразованные соответствующим запросом SELECT. - -При создании материализованного представления, можно указать ENGINE - движок таблицы для хранения данных. По умолчанию, будет использован тот же движок, что и у таблицы, из которой делается запрос SELECT. - -Материализованное представление устроено следующим образом: при вставке данных в таблицу, указанную в SELECT-е, кусок вставляемых данных преобразуется этим запросом SELECT, и полученный результат вставляется в представление. - -Если указано POPULATE, то при создании представления, в него будут вставлены имеющиеся данные таблицы, как если бы был сделан запрос CREATE TABLE ... AS SELECT ... . Иначе, представление будет содержать только данные, вставляемые в таблицу после создания представления. Не рекомендуется использовать POPULATE, так как вставляемые в таблицу данные во время создания представления, не попадут в него. - -Запрос SELECT может содержать DISTINCT, GROUP BY, ORDER BY, LIMIT... Следует иметь ввиду, что соответствующие преобразования будут выполняться независимо, на каждый блок вставляемых данных. Например, при наличии GROUP BY, данные будут агрегироваться при вставке, но только в рамках одной пачки вставляемых данных. Далее, данные не будут доагрегированы. Исключение - использование ENGINE, производящего агрегацию данных самостоятельно, например, SummingMergeTree. - -Недоработано выполнение запросов ALTER над материализованными представлениями, поэтому они могут быть неудобными для использования. - -Представления выглядят так же, как обычные таблицы. Например, они перечисляются в результате запроса SHOW TABLES. - -Отсутствует отдельный запрос для удаления представлений. Чтобы удалить представление, следует использовать DROP TABLE. 
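A sketch of an ordinary view and a materialized view over a hypothetical test.hits table; the SummingMergeTree parameters follow the engine syntax described in the «Движки таблиц» section:

%%
-- an ordinary view: just a stored query
CREATE VIEW test.top_urls AS
    SELECT URL, count() AS hits FROM test.hits GROUP BY URL ORDER BY hits DESC;

-- a materialized view: stores data transformed by its SELECT at INSERT time
CREATE MATERIALIZED VIEW test.hits_per_day
ENGINE = SummingMergeTree(EventDate, (CounterID, EventDate), 8192)
AS SELECT EventDate, CounterID, count() AS hits
FROM test.hits
GROUP BY EventDate, CounterID;
%%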
- -===ATTACH=== - -Запрос полностью аналогичен запросу CREATE, но -- вместо слова CREATE используется слово ATTACH; -- запрос не создаёт данные на диске, а предполагает, что данные уже лежат в соответствующих местах, и всего лишь добавляет информацию о таблице в сервер. -После выполнения запроса ATTACH, сервер будет знать о существовании таблицы. - -Этот запрос используется при старте сервера. Сервер хранит метаданные таблиц в виде файлов с запросами ATTACH, которые он просто исполняет при запуске (за исключением системных таблиц, создание которых явно вписано в сервер). - - -===DROP=== - -Запрос имеет два вида: DROP DATABASE и DROP TABLE. - -%%DROP DATABASE [IF EXISTS] db%% - -Удаляет все таблицы внутри базы данных db, а затем саму базу данных db. -Если указано IF EXISTS - не выдавать ошибку, если база данных не существует. - -%%DROP TABLE [IF EXISTS] [db.]name%% - -Удаляет таблицу. -Если указано IF EXISTS - не выдавать ошибку, если таблица не существует или база данных не существует. - - -===DETACH=== - -%%DETACH TABLE [IF EXISTS] [db.]name%% - -Удаляет из сервера информацию о таблице. Сервер перестаёт знать о существовании таблицы. -Но ни данные, ни метаданные таблицы не удаляются. При следующем запуске сервера, сервер прочитает метаданные и снова узнает о таблице. -Также, "отцепленную" таблицу можно прицепить заново запросом ATTACH (за исключением системных таблиц, для которых метаданные не хранятся). - -Запроса DETACH DATABASE нет. - - -===RENAME=== - -%%RENAME TABLE [db11.]name11 TO [db12.]name12, [db21.]name21 TO [db22.]name22, ...%% - -Переименовывает одну или несколько таблиц. Все таблицы переименовываются под глобальной блокировкой. Переименовывание таблицы является лёгкой операцией. Если вы указали после TO другую базу данных, то таблица будет перенесена в эту базу данных. При этом, директории с базами данных должны быть расположены в одной файловой системе (иначе возвращается ошибка). - - -===ALTER=== - -Запрос ALTER поддерживается только для таблиц типа *MergeTree, а также Merge и Distributed. Запрос имеет несколько вариантов. - -

====Манипуляции со столбцами====
    - -%%ALTER TABLE [db].name ADD|DROP|MODIFY COLUMN ...%% - -Позволяет изменить структуру таблицы. В запросе указывается список из одного или более действий через запятую. -Каждое действие - операция над столбцом. - -Существуют следующие действия: - -%%ADD COLUMN name [type] [default_expr] [AFTER name_after]%% - -Добавляет в таблицу новый столбец с именем name, типом type и выражением для умолчания default_expr (смотрите раздел "Значения по умолчанию"). Если указано AFTER name_after (имя другого столбца), то столбец добавляется (в список столбцов таблицы) после указанного. Иначе, столбец добавляется в конец таблицы. Внимательный читатель может заметить, что отсутствует возможность добавить столбец в начало таблицы. Для цепочки действий, name_after может быть именем столбца, который добавляется в одном из предыдущих действий. - -Добавление столбца всего лишь меняет структуру таблицы, и не производит никаких действий с данными - соответствующие данные не появляются на диске после ALTER-а. При чтении из таблицы, если для какого-либо столбца отсутствуют данные, то он заполняется значениями по умолчанию (выполняя выражение по умолчанию, если такое есть, или нулями, пустыми строками). Также, столбец появляется на диске при слиянии кусков данных (см. MergeTree). - -Такая схема позволяет добиться мгновенной работы запроса ALTER и отсутствия необходимости увеличивать объём старых данных. - -%%DROP COLUMN name%% - -Удаляет столбец с именем name. - -Удаляет данные из файловой системы. Так как это представляет собой удаление целых файлов, запрос выполняется почти мгновенно. - -%%MODIFY COLUMN name [type] [default_expr]%% - -Изменяет тип столбца name на type и/или выражение для умолчания на default_expr. При изменении типа, значения преобразуются так, как если бы к ним была применена функция toType. - -Если изменяется только выражение для умолчания, то запрос не делает никакой сложной работы и выполняется мгновенно. - -Изменение типа столбца - это единственное действие, которое выполняет сложную работу - меняет содержимое файлов с данными. Для больших таблиц, выполнение может занять длительное время. - -Выполнение производится в несколько стадий: -- подготовка временных (новых) файлов с изменёнными данными; -- переименование старых файлов; -- переименование временных (новых) файлов в старые; -- удаление старых файлов. - -Из них, длительной является только первая стадия. Если на этой стадии возникнет сбой, то данные не поменяются. -Если на одной из следующих стадий возникнет сбой, то данные будет можно восстановить вручную. За исключением случаев, когда старые файлы удалены из файловой системы, а данные для новых файлов не доехали на диск и потеряны. - -Не поддерживается изменение типа столбца у массивов и вложенных структур данных. - -Запрос ALTER позволяет создавать и удалять отдельные элементы (столбцы) вложенных структур данных, но не вложенные структуры данных целиком. Для добавления вложенной структуры данных, вы можете добавить столбцы с именем вида name.nested_name и типом Array(T) - вложенная структура данных полностью эквивалентна нескольким столбцам-массивам с именем, имеющим одинаковый префикс до точки. - -Отсутствует возможность удалять столбцы, входящие в первичный ключ или ключ для сэмплирования (в общем, входящие в выражение ENGINE). Изменение типа у столбцов, входящих в первичный ключ возможно только в том случае, если это изменение не приводит к изменению данных (например, разрешено добавление значения в Enum или изменение типа с DateTime на UInt32). 
- -Если возможностей запроса ALTER не хватает для нужного изменения таблицы, вы можете создать новую таблицу, скопировать туда данные с помощью запроса INSERT SELECT, затем поменять таблицы местами с помощью запроса RENAME, и удалить старую таблицу. - -Запрос ALTER блокирует все чтения и записи для таблицы. То есть, если на момент запроса ALTER, выполнялся долгий SELECT, то запрос ALTER сначала дождётся его выполнения. И в это время, все новые запросы к той же таблице, будут ждать, пока завершится этот ALTER. - -Для таблиц, которые не хранят данные самостоятельно (типа Merge и Distributed), ALTER всего лишь меняет структуру таблицы, но не меняет структуру подчинённых таблиц. Для примера, при ALTER-е таблицы типа Distributed, вам также потребуется выполнить запрос ALTER для таблиц на всех удалённых серверах. - -Запрос ALTER на изменение столбцов реплицируется. Соответствующие инструкции сохраняются в ZooKeeper, и затем каждая реплика их применяет. Все запросы ALTER выполняются в одном и том же порядке. Запрос ждёт выполнения соответствующих действий на всех репликах. Но при этом, запрос на изменение столбцов в реплицируемой таблице можно прервать, и все действия будут осуществлены асинхронно. - - -
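A sketch with several column manipulations combined in one ALTER query (table and column names are hypothetical):

%%
ALTER TABLE test.visits                              -- hypothetical table
    ADD COLUMN Browser String DEFAULT '' AFTER UserID,
    MODIFY COLUMN Duration UInt64,
    DROP COLUMN UnusedColumn
%%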

====Манипуляции с партициями и кусками====
    - -Работает только для таблиц семейства MergeTree. Существуют следующие виды операций: - -%%DETACH PARTITION%% - перенести партицию в директорию detached и забыть про неё. -%%DROP PARTITION%% - удалить партицию. -%%ATTACH PART|PARTITION%% - добавить в таблицу новый кусок или партицию из директории detached. -%%FREEZE PARTITION%% - создать бэкап партиции. -%%FETCH PARTITION%% - скачать партицию с другого сервера. - -Ниже будет рассмотрен каждый вид запроса по-отдельности. - -Партицией (partition) в таблице называются данные за один календарный месяц. Это определяется значениями ключа-даты, указанной в параметрах движка таблицы. Данные за каждый месяц хранятся отдельно, чтобы упростить всевозможные манипуляции с этими данными. - -Куском (part) в таблице называется часть данных одной партиции, отсортированная по первичному ключу. - -Чтобы посмотреть набор кусков и партиций таблицы, можно воспользоваться системной таблицей system.parts: - -%%SELECT * FROM system.parts WHERE active%% - -active - учитывать только активные куски. Неактивными являются, например, исходные куски оставшиеся после слияния в более крупный кусок - такие куски удаляются приблизительно через 10 минут после слияния. - -Другой способ посмотреть набор кусков и партиций - зайти в директорию с данными таблицы. -Директория с данными - /var/lib/clickhouse/data/database/table/, -где /var/lib/clickhouse/ - путь к данным ClickHouse, database - имя базы данных, table - имя таблицы. Пример: - -%% -$ ls -l /var/lib/clickhouse/data/test/visits/ -total 48 -drwxrwxrwx 2 clickhouse clickhouse 20480 мая 13 02:58 20140317_20140323_2_2_0 -drwxrwxrwx 2 clickhouse clickhouse 20480 мая 13 02:58 20140317_20140323_4_4_0 -drwxrwxrwx 2 clickhouse clickhouse 4096 мая 13 02:55 detached --rw-rw-rw- 1 clickhouse clickhouse 2 мая 13 02:58 increment.txt -%% - -Здесь 20140317_20140323_2_2_0, 20140317_20140323_4_4_0 - директории кусков. - -Рассмотрим по порядку имя первого куска: 20140317_20140323_2_2_0. -20140317 - минимальная дата данных куска -20140323 - максимальная дата данных куска -2 - минимальный номер блока данных -2 - максимальный номер блока данных -0 - уровень куска - глубина дерева слияний, которыми он образован - -Каждый кусок относится к одной партиции и содержит данные только за один месяц. -201403 - имя партиции. Партиция представляет собой набор кусков за один месяц. - -При работающем сервере, нельзя вручную изменять набор кусков или их данные на файловой системе, так как сервер не будет об этом знать. -Для нереплицируемых таблиц, вы можете это делать при остановленном сервере, хотя это не рекомендуется. -Для реплицируемых таблиц, набор кусков нельзя менять в любом случае. - -Директория detached содержит куски, не используемые сервером - отцепленные от таблицы с помощью запроса ALTER ... DETACH. Также в эту директорию переносятся куски, признанные повреждёнными, вместо их удаления. Вы можете в любое время добавлять, удалять, модифицировать данные в директории detached - сервер не будет об этом знать, пока вы не сделаете запрос ALTER TABLE ... ATTACH. - - -%%ALTER TABLE [db.]table DETACH PARTITION 'name'%% - -Перенести все данные для партиции с именем name в директорию detached и забыть про них. -Имя партиции указывается в формате YYYYMM. Оно может быть указано в одинарных кавычках или без них. - -После того, как запрос будет выполнен, вы можете самостоятельно сделать что угодно с данными в директории detached, например, удалить их из файловой системы, или ничего не делать. 
- -Запрос реплицируется - данные будут перенесены в директорию detached и забыты на всех репликах. Запрос может быть отправлен только на реплику-лидер. Вы можете узнать, является ли реплика лидером, сделав SELECT в системную таблицу system.replicas. Или, проще, вы можете выполнить запрос на всех репликах, и на всех кроме одной, он кинет исключение. - - -%%ALTER TABLE [db.]table DROP PARTITION 'name'%% - -Аналогично операции DETACH. Удалить данные из таблицы. Куски с данными будут помечены как неактивные и будут полностью удалены примерно через 10 минут. Запрос реплицируется - данные будут удалены на всех репликах. - - -%%ALTER TABLE [db.]table ATTACH PARTITION|PART 'name'%% - -Добавить данные в таблицу из директории detached. - -Существует возможность добавить данные для целой партиции (PARTITION) или отдельный кусок (PART). В случае PART, укажите полное имя куска в одинарных кавычках. - -Запрос реплицируется. Каждая реплика проверяет, если ли данные в директории detached. Если данные есть - проверяет их целостность, проверяет их соответствие данным на сервере-инициаторе запроса, и если всё хорошо, то добавляет их. Если нет, то скачивает данные с реплики-инициатора запроса, или с другой реплики, на которой уже добавлены эти данные. - -То есть, вы можете разместить данные в директории detached на одной реплике и, с помощью запроса ALTER ... ATTACH добавить их в таблицу на всех репликах. - - -%%ALTER TABLE [db.]table FREEZE PARTITION 'name'%% - -Создаёт локальный бэкап одной или нескольких партиций. В качестве имени может быть указано полное имя партиции (например, 201403) или его префикс (например, 2014) - тогда бэкап будет создан для всех соответствующих партиций. - -Запрос делает следующее: для снэпшота данных на момент его выполнения, создаёт hardlink-и на данные таблиц в директории /var/lib/clickhouse/shadow/N/... -/var/lib/clickhouse/ - рабочая директория ClickHouse из конфига. -N - инкрементальный номер бэкапа. -Структура директорий внутри бэкапа создаётся такой же, как внутри /var/lib/clickhouse/. -Также делает chmod всех файлов, запрещая запись в них. - -Создание бэкапа происходит почти мгновенно (но сначала дожидается окончания выполняющихся в данный момент запросов к соответствующей таблице). Бэкап изначально не занимает места на диске. При дальнейшей работе системы, бэкап может отнимать место на диске, по мере модификации данных. Если бэкап делается для достаточно старых данных, то он не будет отнимать место на диске. - -После создания бэкапа, данные из /var/lib/clickhouse/shadow/ можно скопировать на удалённый сервер и затем удалить на локальном сервере. -Весь процесс бэкапа не требует остановки сервера. - -Запрос ALTER ... FREEZE PARTITION не реплицируется. То есть, локальный бэкап создаётся только на локальном сервере. - -В качестве альтернативного варианта, вы можете скопировать данные из директории /var/lib/clickhouse/data/database/table вручную. -Но если это делать при запущенном сервере, то возможны race conditions при копировании директории с добавляющимися/изменяющимися файлами, и бэкап может быть неконсистентным. Этот вариант может использоваться, если сервер не запущен - тогда полученные данные будут такими же, как после запроса ALTER TABLE t FREEZE PARTITION. - -ALTER TABLE ... FREEZE PARTITION копирует только данные, но не метаданные таблицы. Чтобы сделать бэкап метаданных таблицы, скопируйте файл /var/lib/clickhouse/metadata/database/table.sql - -Для восстановления из бэкапа: -- создайте таблицу, если её нет, с помощью запроса CREATE. 
Запрос можно взять из .sql файла (замените в нём ATTACH на CREATE); -- скопируйте данные из директории data/database/table/ внутри бэкапа в директорию /var/lib/clickhouse/data/database/table/detached/ -- выполните запросы ALTER TABLE ... ATTACH PARTITION YYYYMM, где YYYYMM - месяц, для каждого месяца. - -Таким образом, данные из бэкапа будут добавлены в таблицу. -Восстановление из бэкапа, так же, не требует остановки сервера. - -Бэкапы и репликация - -Репликация защищает от аппаратных сбоев. В случае, если на одной из реплик у вас исчезли все данные, то восстановление делается по инструкции в разделе "Восстановление после сбоя". - -Для защиты от аппаратных сбоев, обязательно используйте репликацию. Подробнее про репликацию написано в разделе "Репликация данных". - -Бэкапы защищают от человеческих ошибок (случайно удалили данные, удалили не те данные или не на том кластере, испортили данные). -Для баз данных большого объёма, бывает затруднительно копировать бэкапы на удалённые серверы. В этих случаях, для защиты от человеческой ошибки, можно держать бэкап на том же сервере (он будет лежать в /var/lib/clickhouse/shadow/). - - -%%ALTER TABLE [db.]table FETCH PARTITION 'name' FROM 'path-in-zookeeper'%% - -Запрос работает только для реплицируемых таблиц. - -Скачивает указанную партицию с шарда, путь в ZooKeeper к которому указан в секции FROM и помещает в директорию detached указанной таблицы. - -Не смотря на то, что запрос называется ALTER TABLE, он не изменяет структуру таблицы, и не изменяет сразу доступные данные в таблице. - -Данные помещаются в директорию detached, и их можно прикрепить с помощью запроса ALTER TABLE ... ATTACH. - -В секции FROM указывается путь в ZooKeeper. Например, %%/clickhouse/tables/01-01/visits%%. -Перед скачиванием проверяется существование партиции и совпадение структуры таблицы. Автоматически выбирается наиболее актуальная реплика среди живых реплик. - -Запрос ALTER ... FETCH PARTITION не реплицируется. То есть, партиция будет скачана в директорию detached только на локальном сервере. Заметим, что если вы после этого добавите данные в таблицу с помощью запроса ALTER TABLE ... ATTACH, то данные будут добавлены на всех репликах (на одной из реплик будут добавлены из директории detached, а на других - загружены с соседних реплик). - - -
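A sketch of fetching a partition from another shard and then attaching it, reusing the ZooKeeper path from the example above (the table name is hypothetical):

%%
-- download the partition into the detached directory of the local table
ALTER TABLE test.visits FETCH PARTITION '201403' FROM '/clickhouse/tables/01-01/visits';
-- make the downloaded data visible in the table
ALTER TABLE test.visits ATTACH PARTITION '201403';
%%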

====Синхронность запросов ALTER====
    - -Для нереплицируемых таблиц, все запросы ALTER выполняются синхронно. Для реплицируемых таблиц, запрос всего лишь добавляет инструкцию по соответствующим действиям в ZooKeeper, а сами действия осуществляются при первой возможности. Но при этом, запрос может ждать завершения выполнения этих действий на всех репликах. - -Для запросов ALTER ... ATTACH|DETACH|DROP можно настроить ожидание, с помощью настройки replication_alter_partitions_sync. -Возможные значения: 0 - не ждать, 1 - ждать выполнения только у себя (по умолчанию), 2 - ждать всех. - - - -===SHOW DATABASES=== - -%%SHOW DATABASES [INTO OUTFILE filename] [FORMAT format]%% - -Выводит список всех баз данных. -Запрос полностью аналогичен запросу SELECT name FROM system.databases [INTO OUTFILE filename] [FORMAT format] -Смотрите также раздел "Форматы". - - -===SHOW TABLES=== - -%%SHOW TABLES [FROM db] [LIKE 'pattern'] [INTO OUTFILE filename] [FORMAT format]%% - -Выводит список таблиц -- из текущей БД или из БД db, если указано FROM db; -- всех, или имя которых соответствует шаблону pattern, если указано LIKE 'pattern'; - -Запрос полностью аналогичен запросу: SELECT name FROM system.tables WHERE database = 'db' [AND name LIKE 'pattern'] [INTO OUTFILE filename] [FORMAT format] -Смотрите также раздел "Оператор LIKE". - - -===SHOW PROCESSLIST=== - -%%SHOW PROCESSLIST [INTO OUTFILE filename] [FORMAT format]%% - -Выводит список запросов, выполняющихся в данный момент времени, кроме запросов SHOW PROCESSLIST. - -Выдаёт таблицу, содержащую столбцы: - -user - пользователь, под которым был задан запрос. Следует иметь ввиду, что при распределённой обработке запроса на удалённые серверы запросы отправляются под пользователем default. И SHOW PROCESSLIST показывает имя пользователя для конкретного запроса, а не для запроса, который данный запрос инициировал. - -address - имя хоста, с которого был отправлен запрос. При распределённой обработке запроса на удалённых серверах — это имя хоста-инициатора запроса. Чтобы проследить, откуда был задан распределённый запрос изначально, следует смотреть SHOW PROCESSLIST на сервере-инициаторе запроса. - -elapsed - время выполнения запроса, в секундах. Запросы выводятся упорядоченными по убыванию времени выполнения. - -rows_read, bytes_read - сколько было прочитано строк, байт несжатых данных при обработке запроса. При распределённой обработке запроса суммируются данные со всех удалённых серверов. Именно эти данные используются для ограничений и квот. - -memory_usage - текущее потребление оперативки в байтах. Смотрите настройку max_memory_usage. - -query - сам запрос. В запросах INSERT данные для вставки не выводятся. - -query_id - идентификатор запроса. Непустой, только если был явно задан пользователем. При распределённой обработке запроса идентификатор запроса не передаётся на удалённые серверы. - -Запрос полностью аналогичен запросу: SELECT * FROM system.processes [INTO OUTFILE filename] [FORMAT format]. - -Полезный совет (выполните в консоли): -%%watch -n1 "clickhouse-client --query='SHOW PROCESSLIST'"%% - - -===SHOW CREATE TABLE=== - -%%SHOW CREATE TABLE [db.]table [INTO OUTFILE filename] [FORMAT format]%% - -Возвращает один столбец statement типа String, содержащий одно значение - запрос CREATE, с помощью которого создана указанная таблица. - - -===DESCRIBE TABLE=== - -%%DESC|DESCRIBE TABLE [db.]table [INTO OUTFILE filename] [FORMAT format]%% - -Возвращает два столбца: name, type типа String, в которых описаны имена и типы столбцов указанной таблицы. 
- -Вложенные структуры данных выводятся в "развёрнутом" виде. То есть, каждый столбец - по отдельности, с именем через точку. - - -===EXISTS=== - -%%EXISTS TABLE [db.]name [INTO OUTFILE filename] [FORMAT format]%% - -Возвращает один столбец типа UInt8, содержащий одно значение - 0, если таблицы или БД не существует и 1, если таблица в указанной БД существует. - - -===USE=== - -%%USE db%% - -Позволяет установить текущую базу данных для сессии. -Текущая база данных используется для поиска таблиц, если база данных не указана в запросе явно через точку перед именем таблицы. -При использовании HTTP протокола, запрос не может быть выполнен, так как понятия сессии не существует. - - -===SET=== - -%%SET param = value%% - -Позволяет установить настройку param в значение value. Также можно одним запросом установить все настройки из заданного профиля настроек - для этого, укажите в качестве имени настройки profile. Подробнее смотри раздел "Настройки". -Настройка устанавливается на сессию, или на сервер (глобально), если указано GLOBAL. -При установке глобальной настройки, настройка на все уже запущенные сессии, включая текущую сессию, не устанавливается, а будет использована только для новых сессий. - -При перезапуске сервера, теряются глобальные настройки, установленные с помощью SET. -Установить настройки, которые переживут перезапуск сервера, можно только с помощью конфигурационного файла сервера. - - -===OPTIMIZE=== - -%%OPTIMIZE TABLE [db.]name [PARTITION partition] [FINAL]%% - -Просит движок таблицы сделать что-нибудь, что может привести к более оптимальной работе. -Поддерживается только движками *MergeTree, в котором выполнение этого запроса инициирует внеочередное слияние кусков данных. -Если указан PARTITION, то оптимизация будет производиться только для указаной партиции. -Если указан FINAL, то оптимизация будет производиться даже когда все данные уже лежат в одном куске. - - -===INSERT=== - -Запрос имеет несколько вариантов. - -%%INSERT INTO [db.]table [(c1, c2, c3)] VALUES (v11, v12, v13), (v21, v22, v23), ...%% - -Вставляет в таблицу table строчки с перечисленными значениями. -Запрос полностью аналогичен запросу вида: - -%%INSERT INTO [db.]table [(c1, c2, c3)] FORMAT Values (v11, v12, v13), (v21, v22, v23), ...%% - -%%INSERT INTO [db.]table [(c1, c2, c3)] FORMAT format ...%% - -Вставка данных в произвольном указанном формате. -Сами данные идут после format, после всех пробельных символов до первого перевода строки, если он есть, включая его, или после всех пробельных символов, если переводов строки нет. Рекомендуется писать данные начиная со следующей строки (это важно, если данные начинаются с пробельных символов). - -Пример: - -%%INSERT INTO t FORMAT TabSeparated -11 Hello, world! -22 Qwerty -%% - -Подробнее про форматы данных смотрите в разделе "Форматы". -В разделе "Интерфейсы" описано, как можно вставлять данные отдельно от запроса, при использовании клиента командной строки или HTTP интерфейса. - -В запросе может быть опционально указан список столбцов для вставки. В этом случае, в остальные столбцы записываются значения по умолчанию. -Значения по умолчанию вычисляются из DEFAULT выражений, указанных в определении таблицы, или, если DEFAULT не прописан явно - используются нули, пустые строки. Если настройка strict_insert_defaults выставлена в 1, то все столбцы, для которых нет явных DEFAULT-ов, должны быть указаны в запросе. - -%%INSERT INTO [db.]table [(c1, c2, c3)] SELECT ...%% - -Вставка в таблицу результата запроса SELECT. 
-Имена и типы данных результата выполнения SELECT-а должны точно совпадать со структурой таблицы, в которую вставляются данные, или с указанным списком столбцов. -Для изменения имён столбцов следует использовать синонимы (AS) в запросе SELECT. -Для изменения типов данных следует использовать функции преобразования типов (смотрите раздел "Функции"). - -Ни один из форматов данных не позволяет использовать в качестве значений выражения. -То есть, вы не можете написать INSERT INTO t VALUES (now(), 1 + 1, DEFAULT). - -Не поддерживаются другие запросы на модификацию части данных: UPDATE, DELETE, REPLACE, MERGE, UPSERT, INSERT UPDATE. -Впрочем, вы можете удалять старые данные с помощью запроса ALTER TABLE ... DROP PARTITION. - - -===SELECT=== - -Его величество, запрос SELECT. - -%%SELECT [DISTINCT] expr_list - [FROM [db.]table | (subquery) | table_function] [FINAL] - [SAMPLE sample_coeff] - [ARRAY JOIN ...] - [GLOBAL] ANY|ALL INNER|LEFT JOIN (subquery)|table USING columns_list - [PREWHERE expr] - [WHERE expr] - [GROUP BY expr_list] [WITH TOTALS] - [HAVING expr] - [ORDER BY expr_list] - [LIMIT [n, ]m] - [UNION ALL ...] - [INTO OUTFILE filename] - [FORMAT format]%% - -Все секции, кроме списка выражений сразу после SELECT, являются необязательными. -Ниже секции будут описаны в порядке, почти соответствующем конвейеру выполнения запроса. - -Если в запросе отсутствуют секции DISTINCT, GROUP BY, ORDER BY, подзапросы в IN и JOIN, то запрос будет обработан полностью потоково, с использованием O(1) количества оперативки. -Иначе запрос может съесть много оперативки, если не указаны подходящие ограничения max_memory_usage, max_rows_to_group_by, max_rows_to_sort, max_rows_in_distinct, max_bytes_in_distinct, max_rows_in_set, max_bytes_in_set, max_rows_in_join, max_bytes_in_join, max_bytes_before_external_sort, max_bytes_before_external_group_by. Подробнее смотрите в разделе "Настройки". Присутствует возможность использовать внешнюю сортировку (с сохранением временных данных на диск) и внешнюю агрегацию. Merge join в системе нет. - -
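A sketch of the INSERT ... SELECT form described earlier in this section, using AS to rename columns so that the result matches the structure of the target table (table names are hypothetical):

%%
INSERT INTO test.daily_totals (day, hits)       -- hypothetical target table
SELECT EventDate AS day, count() AS hits
FROM test.hits
GROUP BY EventDate
%%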

====Секция FROM====
    - -Если секция FROM отсутствует, то данные будут читаться из таблицы system.one. -Таблица system.one содержит ровно одну строку (то есть, эта таблица выполняет такую же роль, как таблица DUAL, которую можно найти в других СУБД). - -В секции FROM указывается таблица, из которой будут читаться данные, либо подзапрос, либо табличная функция; дополнительно могут присутствовать ARRAY JOIN и обычный JOIN (смотрите ниже). - -Вместо таблицы, может быть указан подзапрос SELECT в скобках. -В этом случае, конвейер обработки подзапроса будет встроен в конвейер обработки внешнего запроса. -В отличие от стандартного SQL, после подзапроса не нужно указывать его синоним. Для совместимости, присутствует возможность написать AS name после подзапроса, но указанное имя нигде не используется. - -Вместо таблицы, может быть указана табличная функция. Подробнее смотрите раздел "Табличные функции". - -Для выполнения запроса, из соответствующей таблицы, вынимаются все столбцы, перечисленные в запросе. Из подзапросов выкидываются столбцы, не нужные для внешнего запроса. -Если в запросе не перечислено ни одного столбца (например, SELECT count() FROM t), то из таблицы всё равно вынимается один какой-нибудь столбец (предпочитается самый маленький), для того, чтобы можно было хотя бы посчитать количество строк. - -Модификатор FINAL может быть использован только при SELECT-е из таблицы типа CollapsingMergeTree. При указании FINAL, данные будут выбираться полностью "сколлапсированными". Стоит учитывать, что использование FINAL приводит к выбору кроме указанных в SELECT-е столбцов также столбцов, относящихся к первичному ключу. Также, запрос будет выполняться в один поток, и при выполнении запроса будет выполняться слияние данных. Это приводит к тому, что при использовании FINAL, запрос выполняется медленнее. В большинстве случаев, следует избегать использования FINAL. Подробнее смотрите раздел "Движок CollapsingMergeTree". - -
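Two short sketches of the FROM behaviour described above (test.hits is a hypothetical table):

%%
-- with no FROM clause, the query reads from system.one
SELECT 1;

-- a subquery in FROM does not require an alias
SELECT count()
FROM (SELECT UserID FROM test.hits WHERE CounterID = 34)
%%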

====Секция SAMPLE====
    - -Секция SAMPLE позволяет выполнить запрос приближённо. Приближённое выполнение запроса поддерживается только таблицами типа MergeTree* и только если при создании таблицы было указано выражение, по которому производится выборка (смотрите раздел "Движок MergeTree"). - -SAMPLE имеет вид %%SAMPLE k%%, где k - дробное число в интервале от 0 до 1, или %%SAMPLE n%%, где n - достаточно большое целое число. - -В первом случае, запрос будет выполнен по k-доле данных. Например, если указано %%SAMPLE 0.1%%, то запрос будет выполнен по 10% данных. -Во втором случае, запрос будет выполнен по выборке из не более n строк. Например, если указано %%SAMPLE 10000000%%, то запрос будет выполнен по не более чем 10 000 000 строкам. - -Пример: - -%%SELECT - Title, - count() * 10 AS PageViews -FROM hits_distributed -SAMPLE 0.1 -WHERE - CounterID = 34 - AND toDate(EventDate) >= toDate('2013-01-29') - AND toDate(EventDate) <= toDate('2013-02-04') - AND NOT DontCountHits - AND NOT Refresh - AND Title != '' -GROUP BY Title -ORDER BY PageViews DESC LIMIT 1000%% - -В этом примере, запрос выполняется по выборке из 0.1 (10%) данных. Значения агрегатных функций не корректируются автоматически, поэтому для получения приближённого результата, значение count() вручную домножается на 10. - -При использовании варианта вида %%SAMPLE 10000000%%, нет информации, какая относительная доля данных была обработана, и на что следует домножить агрегатные функции, поэтому такой способ записи подходит не для всех случаев. - -Выборка с указанием относительного коэффициента является "согласованной": если рассмотреть все возможные данные, которые могли бы быть в таблице, то выборка (при использовании одного выражения сэмплирования, указанного при создании таблицы), с одинаковым коэффициентом, выбирает всегда одно и то же подмножество этих всевозможных данных. То есть, выборка из разных таблиц, на разных серверах, в разное время, делается одинаковым образом. - -Например, выборка по идентификаторам посетителей, выберет из разных таблиц строки с одинаковым подмножеством всех возможных идентификаторов посетителей. Это позволяет использовать выборку в подзапросах в секции IN, а также при ручном сопоставлении результатов разных запросов с выборками. - -

====Секция ARRAY JOIN====
    - -Позволяет выполнить JOIN с массивом или вложенной структурой данных. Смысл похож на функцию arrayJoin, но функциональность более широкая. - -ARRAY JOIN - это, по сути, INNER JOIN с массивом. Пример: - -%% -:) CREATE TABLE arrays_test (s String, arr Array(UInt8)) ENGINE = Memory - -CREATE TABLE arrays_test -( - s String, - arr Array(UInt8) -) ENGINE = Memory - -Ok. - -0 rows in set. Elapsed: 0.001 sec. - -:) INSERT INTO arrays_test VALUES ('Hello', [1,2]), ('World', [3,4,5]), ('Goodbye', []) - -INSERT INTO arrays_test VALUES - -Ok. - -3 rows in set. Elapsed: 0.001 sec. - -:) SELECT * FROM arrays_test - -SELECT * -FROM arrays_test - -┌─s───────┬─arr─────┐ -│ Hello │ [1,2] │ -│ World │ [3,4,5] │ -│ Goodbye │ [] │ -└─────────┴─────────┘ - -3 rows in set. Elapsed: 0.001 sec. - -:) SELECT s, arr FROM arrays_test ARRAY JOIN arr - -SELECT s, arr -FROM arrays_test -ARRAY JOIN arr - -┌─s─────┬─arr─┐ -│ Hello │ 1 │ -│ Hello │ 2 │ -│ World │ 3 │ -│ World │ 4 │ -│ World │ 5 │ -└───────┴─────┘ - -5 rows in set. Elapsed: 0.001 sec. -%% - -Для массива в секции ARRAY JOIN может быть указан алиас. В этом случае, элемент массива будет доступен под этим алиасом, а сам массив - под исходным именем. Пример: - -%% -:) SELECT s, arr, a FROM arrays_test ARRAY JOIN arr AS a - -SELECT s, arr, a -FROM arrays_test -ARRAY JOIN arr AS a - -┌─s─────┬─arr─────┬─a─┐ -│ Hello │ [1,2] │ 1 │ -│ Hello │ [1,2] │ 2 │ -│ World │ [3,4,5] │ 3 │ -│ World │ [3,4,5] │ 4 │ -│ World │ [3,4,5] │ 5 │ -└───────┴─────────┴───┘ - -5 rows in set. Elapsed: 0.001 sec. -%% - -В секции ARRAY JOIN может быть указано несколько массивов одинаковых размеров через запятую. В этом случае, JOIN делается с ними одновременно (прямая сумма, а не прямое произведение). Пример: - -%% -:) SELECT s, arr, a, num, mapped FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(x -> x + 1, arr) AS mapped - -SELECT s, arr, a, num, mapped -FROM arrays_test -ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num, arrayMap(lambda(tuple(x), plus(x, 1)), arr) AS mapped - -┌─s─────┬─arr─────┬─a─┬─num─┬─mapped─┐ -│ Hello │ [1,2] │ 1 │ 1 │ 2 │ -│ Hello │ [1,2] │ 2 │ 2 │ 3 │ -│ World │ [3,4,5] │ 3 │ 1 │ 4 │ -│ World │ [3,4,5] │ 4 │ 2 │ 5 │ -│ World │ [3,4,5] │ 5 │ 3 │ 6 │ -└───────┴─────────┴───┴─────┴────────┘ - -5 rows in set. Elapsed: 0.002 sec. - -:) SELECT s, arr, a, num, arrayEnumerate(arr) FROM arrays_test ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num - -SELECT s, arr, a, num, arrayEnumerate(arr) -FROM arrays_test -ARRAY JOIN arr AS a, arrayEnumerate(arr) AS num - -┌─s─────┬─arr─────┬─a─┬─num─┬─arrayEnumerate(arr)─┐ -│ Hello │ [1,2] │ 1 │ 1 │ [1,2] │ -│ Hello │ [1,2] │ 2 │ 2 │ [1,2] │ -│ World │ [3,4,5] │ 3 │ 1 │ [1,2,3] │ -│ World │ [3,4,5] │ 4 │ 2 │ [1,2,3] │ -│ World │ [3,4,5] │ 5 │ 3 │ [1,2,3] │ -└───────┴─────────┴───┴─────┴─────────────────────┘ - -5 rows in set. Elapsed: 0.002 sec. -%% - -ARRAY JOIN также работает с вложенными структурами данных. Пример: - -%% -:) CREATE TABLE nested_test (s String, nest Nested(x UInt8, y UInt32)) ENGINE = Memory - -CREATE TABLE nested_test -( - s String, - nest Nested( - x UInt8, - y UInt32) -) ENGINE = Memory - -Ok. - -0 rows in set. Elapsed: 0.006 sec. - -:) INSERT INTO nested_test VALUES ('Hello', [1,2], [10,20]), ('World', [3,4,5], [30,40,50]), ('Goodbye', [], []) - -INSERT INTO nested_test VALUES - -Ok. - -3 rows in set. Elapsed: 0.001 sec. 
- -:) SELECT * FROM nested_test - -SELECT * -FROM nested_test - -┌─s───────┬─nest.x──┬─nest.y─────┐ -│ Hello │ [1,2] │ [10,20] │ -│ World │ [3,4,5] │ [30,40,50] │ -│ Goodbye │ [] │ [] │ -└─────────┴─────────┴────────────┘ - -3 rows in set. Elapsed: 0.001 sec. - -:) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest - -SELECT s, `nest.x`, `nest.y` -FROM nested_test -ARRAY JOIN nest - -┌─s─────┬─nest.x─┬─nest.y─┐ -│ Hello │ 1 │ 10 │ -│ Hello │ 2 │ 20 │ -│ World │ 3 │ 30 │ -│ World │ 4 │ 40 │ -│ World │ 5 │ 50 │ -└───────┴────────┴────────┘ - -5 rows in set. Elapsed: 0.001 sec. -%% - -При указании имени вложенной структуры данных в ARRAY JOIN, смысл такой же, как ARRAY JOIN со всеми элементами-массивами, из которых она состоит. Пример: - -%% -:) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest.x, nest.y - -SELECT s, `nest.x`, `nest.y` -FROM nested_test -ARRAY JOIN `nest.x`, `nest.y` - -┌─s─────┬─nest.x─┬─nest.y─┐ -│ Hello │ 1 │ 10 │ -│ Hello │ 2 │ 20 │ -│ World │ 3 │ 30 │ -│ World │ 4 │ 40 │ -│ World │ 5 │ 50 │ -└───────┴────────┴────────┘ - -5 rows in set. Elapsed: 0.001 sec. -%% - -Такой вариант тоже имеет смысл: - -%% -:) SELECT s, nest.x, nest.y FROM nested_test ARRAY JOIN nest.x - -SELECT s, `nest.x`, `nest.y` -FROM nested_test -ARRAY JOIN `nest.x` - -┌─s─────┬─nest.x─┬─nest.y─────┐ -│ Hello │ 1 │ [10,20] │ -│ Hello │ 2 │ [10,20] │ -│ World │ 3 │ [30,40,50] │ -│ World │ 4 │ [30,40,50] │ -│ World │ 5 │ [30,40,50] │ -└───────┴────────┴────────────┘ - -5 rows in set. Elapsed: 0.001 sec. -%% - -Алиас для вложенной структуры данных можно использовать, чтобы выбрать как результат JOIN-а, так и исходный массив. Пример: - -%% -:) SELECT s, n.x, n.y, nest.x, nest.y FROM nested_test ARRAY JOIN nest AS n - -SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y` -FROM nested_test -ARRAY JOIN nest AS n - -┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┐ -│ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ -│ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ -│ World │ 3 │ 30 │ [3,4,5] │ [30,40,50] │ -│ World │ 4 │ 40 │ [3,4,5] │ [30,40,50] │ -│ World │ 5 │ 50 │ [3,4,5] │ [30,40,50] │ -└───────┴─────┴─────┴─────────┴────────────┘ - -5 rows in set. Elapsed: 0.001 sec. -%% - -Пример использования функции arrayEnumerate: - -%% -:) SELECT s, n.x, n.y, nest.x, nest.y, num FROM nested_test ARRAY JOIN nest AS n, arrayEnumerate(nest.x) AS num - -SELECT s, `n.x`, `n.y`, `nest.x`, `nest.y`, num -FROM nested_test -ARRAY JOIN nest AS n, arrayEnumerate(`nest.x`) AS num - -┌─s─────┬─n.x─┬─n.y─┬─nest.x──┬─nest.y─────┬─num─┐ -│ Hello │ 1 │ 10 │ [1,2] │ [10,20] │ 1 │ -│ Hello │ 2 │ 20 │ [1,2] │ [10,20] │ 2 │ -│ World │ 3 │ 30 │ [3,4,5] │ [30,40,50] │ 1 │ -│ World │ 4 │ 40 │ [3,4,5] │ [30,40,50] │ 2 │ -│ World │ 5 │ 50 │ [3,4,5] │ [30,40,50] │ 3 │ -└───────┴─────┴─────┴─────────┴────────────┴─────┘ - -5 rows in set. Elapsed: 0.002 sec. -%% - -В запросе может быть указано не более одной секции ARRAY JOIN. - -Соответствующее преобразование может выполняться как до секции WHERE/PREWHERE (если его результат нужен в этой секции), так и после выполнения WHERE/PREWHERE (чтобы уменьшить объём вычислений). - -

    Секция JOIN

    - -Обычный JOIN, не имеет отношения к ARRAY JOIN, который описан выше. - -%% -[GLOBAL] ANY|ALL INNER|LEFT [OUTER] JOIN (subquery)|table USING columns_list -%% - -Выполняет соединение с данными из подзапроса. В начале выполнения запроса, выполняется подзапрос, указанный после JOIN, и его результат сохраняется в память. Затем производится чтение из "левой" таблицы, указанной в секции FROM, и во время этого чтения, для каждой прочитанной строчки из "левой" таблицы, из таблицы-результата подзапроса ("правой" таблицы) выбираются строчки, соответствующие условию на совпадение значений столбцов, указанных в USING. - -Вместо подзапроса может быть указано имя таблицы. Это эквивалентно подзапросу SELECT * FROM table, кроме особого случая, когда таблица имеет движок Join - подготовленное множество для соединения. - -Из подзапроса удаляются все ненужные для JOIN-а столбцы. - -JOIN-ы бывают нескольких видов: - -INNER или LEFT - тип: -Если указано INNER, то в результат попадают только строки, для которых найдена соответствующая строка в "правой" таблице. -Если указано LEFT, то для строчек "левой" таблицы, для которых нет соответствующих в "правой" таблице, будут присоединены значения "по умолчанию" - нули, пустые строки. Вместо LEFT может быть написано LEFT OUTER - слово OUTER ни на что не влияет. - -ANY или ALL - строгость: -Если указано ANY, то при наличии в "правой" таблице нескольких соответствующих строк, будет присоединена только первая попавшаяся. -Если указано ALL, то при наличии в "правой" таблице нескольких соответствующих строк, данные будут размножены по количеству этих строк. - -Использование ALL соответствует обычной семантике JOIN-а из стандартного SQL. -Использование ANY является более оптимальным. Если известно, что в "правой" таблице есть не более одной подходящей строки, то результаты ANY и ALL совпадают. Обязательно необходимо указать ANY или ALL (ни один из этих вариантов не выбран по умолчанию). - -GLOBAL - распределённость: - -При использовании обычного %%JOIN%%-а, запрос отправляется на удалённые серверы, и на каждом из них выполняются подзапросы для формирования "правой" таблицы, и с этой таблицей выполняется соединение. То есть, "правая" таблица формируется на каждом сервере отдельно. - -При использовании %%GLOBAL ... JOIN%%-а, сначала, на сервере-инициаторе запроса, выполняется подзапрос для вычисления "правой" таблицы, и затем эта временная таблица передаётся на каждый удалённый сервер, и на них выполняются запросы, с использованием этих переданных временных данных. - -Следует быть аккуратным при использовании GLOBAL JOIN-ов. Подробнее читайте в разделе "Распределённые подзапросы" ниже. - -Возможны все комбинации JOIN-ов. Например, %%GLOBAL ANY LEFT OUTER JOIN%%. - -При выполнении JOIN-а отсутствует оптимизация порядка выполнения по отношению к другим стадиям запроса: соединение (поиск в "правой" таблице) выполняется до фильтрации в WHERE, до агрегации. Поэтому, чтобы явно задать порядок вычислений, рекомендуется выполнять JOIN подзапроса с подзапросом. 
- -Пример: -%% -SELECT - CounterID, - hits, - visits -FROM -( - SELECT - CounterID, - count() AS hits - FROM test.hits - GROUP BY CounterID -) ANY LEFT JOIN -( - SELECT - CounterID, - sum(Sign) AS visits - FROM test.visits - GROUP BY CounterID -) USING CounterID -ORDER BY hits DESC -LIMIT 10 - -┌─CounterID─┬───hits─┬─visits─┐ -│ 1143050 │ 523264 │ 13665 │ -│ 731962 │ 475698 │ 102716 │ -│ 722545 │ 337212 │ 108187 │ -│ 722889 │ 252197 │ 10547 │ -│ 2237260 │ 196036 │ 9522 │ -│ 23057320 │ 147211 │ 7689 │ -│ 722818 │ 90109 │ 17847 │ -│ 48221 │ 85379 │ 4652 │ -│ 19762435 │ 77807 │ 7026 │ -│ 722884 │ 77492 │ 11056 │ -└───────────┴────────┴────────┘ -%% - -У подзапросов нет возможности задать имена и нет возможности их использовать для того, чтобы сослаться на столбец из конкретного подзапроса. -Требуется, чтобы столбцы, указанные в USING, назывались одинаково в обоих подзапросах, а остальные столбцы - по-разному. Изменить имена столбцов в подзапросах можно с помощью алиасов (в примере используются алиасы hits и visits). - -В секции USING указывается один или несколько столбцов для соединения, что обозначает условие на равенство этих столбцов. Список столбцов задаётся без скобок. Более сложные условия соединения не поддерживаются. - -"Правая" таблица (результат подзапроса) располагается в оперативке. Если оперативки не хватает, вы не сможете выполнить JOIN. - -В запросе (на одном уровне) можно указать только один JOIN. Чтобы выполнить несколько JOIN-ов, вы можете разместить их в подзапросах. - -Каждый раз для выполнения запроса с одинаковым JOIN-ом, подзапрос выполняется заново - результат не кэшируется. Это можно избежать, используя специальный движок таблиц Join, представляющий собой подготовленное множество для соединения, которое всегда находится в оперативке. Подробнее смотрите в разделе "Движки таблиц, Join". - -В некоторых случаях, вместо использования JOIN достаточно использовать IN - это более эффективно. -Среди разных типов JOIN-ов, наиболее эффективен ANY LEFT JOIN, затем ANY INNER JOIN; наименее эффективны ALL LEFT JOIN и ALL INNER JOIN. - -Если JOIN необходим для соединения с таблицами измерений (dimension tables - сравнительно небольшие таблицы, которые содержат свойства измерений - например, имена для рекламных кампаний), то использование JOIN может быть не очень удобным из-за громоздкости синтаксиса, а также из-за того, что правая таблица читается заново при каждом запросе. Специально для таких случаев существует функциональность "Внешние словари", которую следует использовать вместо JOIN. Подробнее смотрите раздел "Внешние словари". - - -
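Для распределённых таблиц тот же запрос мог бы использовать GLOBAL JOIN (пример условный: имена test.hits_all и test.visits_all обозначают Distributed-таблицы поверх test.hits и test.visits; подробнее - в разделе "Распределённые подзапросы"):

%%
SELECT CounterID, hits, visits
FROM
(
    SELECT CounterID, count() AS hits
    FROM test.hits_all
    GROUP BY CounterID
) GLOBAL ANY LEFT JOIN
(
    SELECT CounterID, sum(Sign) AS visits
    FROM test.visits_all
    GROUP BY CounterID
) USING CounterID
ORDER BY hits DESC
LIMIT 10
%%

"Правая" таблица в этом случае будет вычислена на сервере-инициаторе запроса один раз и передана на все удалённые серверы.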

    Секция WHERE

    - -Секция WHERE, если есть, должна содержать выражение, имеющее тип UInt8. Обычно это какое-либо выражение с операторами сравнения и логическими операторами. -Это выражение будет использовано для фильтрации данных перед всеми остальными преобразованиями. - -Выражение анализируется на возможность использования индексов, если индексы поддерживаются движком таблицы. - -
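Простейший условный пример (имена таблицы и столбцов - из примеров этого документа):

%%SELECT count() FROM hits WHERE CounterID = 34 AND EventDate >= toDate('2014-01-01')%%

Выражение после WHERE имеет тип UInt8; если таблица имеет тип MergeTree с первичным ключом, включающим CounterID и EventDate (как в примерах ниже), при выполнении будет использован индекс.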

    Секция PREWHERE

    - -Имеет такой же смысл, как и секция WHERE. Отличие состоит в том, какие данные читаются из таблицы. -При использовании PREWHERE, из таблицы сначала читаются только столбцы, необходимые для выполнения PREWHERE. Затем читаются остальные столбцы, нужные для выполнения запроса, но из них только те блоки, в которых выражение в PREWHERE истинное. - -PREWHERE имеет смысл использовать, если есть условия фильтрации, не подходящие под индексы, которые использует меньшинство столбцов из тех, что есть в запросе, но достаточно сильно фильтрует данные. Таким образом, сокращается количество читаемых данных. - -Например, полезно писать PREWHERE для запросов, которые вынимают много столбцов, но в которых фильтрация производится лишь по нескольким столбцам. - -PREWHERE поддерживается только таблицами семейства *MergeTree. - -В запросе могут быть одновременно указаны секции PREWHERE и WHERE. В этом случае, PREWHERE идёт перед WHERE. - -Следует иметь ввиду, что указывать в PREWHERE только столбцы, по которым существует индекс, имеет мало смысла, так как при использовании индекса и так читаются лишь блоки данных, соответствующие индексу. - -Если настройка optimize_move_to_prewhere выставлена в 1, то при отсутствии PREWHERE, система будет автоматически переносить части выражений из WHERE в PREWHERE согласно некоторой эвристике. - - -
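Условный пример: условие на столбец URL не входит в индекс, но сильно фильтрует данные, поэтому его имеет смысл вынести в PREWHERE (имена таблицы и столбцов приведены для иллюстрации):

%%
SELECT EventTime, UserID, Title, Referer
FROM hits
PREWHERE URL LIKE '%/company/contacts%'
WHERE EventDate >= toDate('2014-03-01') AND EventDate <= toDate('2014-03-31')
%%

Сначала будет прочитан только столбец URL, а остальные столбцы, нужные запросу, - лишь для тех блоков, в которых условие PREWHERE истинно.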

    Секция GROUP BY

    - -Это одна из наиболее важных частей СУБД. - -Секция GROUP BY, если есть, должна содержать список выражений. Каждое выражение далее будем называть "ключом". -При этом, все выражения в секциях SELECT, HAVING, ORDER BY, должны вычисляться из ключей или из агрегатных функций. То есть, каждый выбираемый из таблицы столбец, должен использоваться либо в ключах, либо внутри агрегатных функций. - -Если запрос содержит столбцы таблицы только внутри агрегатных функций, то секция GROUP BY может не указываться, и подразумевается агрегация по пустому набору ключей. - -Пример: - -%%SELECT - count(), - median(FetchTiming > 60 ? 60 : FetchTiming), - count() - sum(Refresh) -FROM hits%% - -Но, в отличие от стандартного SQL, если в таблице нет строк (вообще нет или после фильтрации с помощью WHERE), в качестве результата возвращается пустой результат, а не результат из одной строки, содержащий "начальные" значения агрегатных функций. - -В отличие от MySQL (и в соответствии со стандартом SQL), вы не можете получить какое-нибудь значение некоторого столбца, не входящего в ключ или агрегатную функцию (за исключением константных выражений). Для обхода этого вы можете воспользоваться агрегатной функцией any (получить первое попавшееся значение) или min/max. - -Пример: - -%%SELECT - domainWithoutWWW(URL) AS domain, - count(), - any(Title) AS title -- для каждого домена достаём первый попавшийся заголовок страницы -FROM hits -GROUP BY domain%% - -GROUP BY вычисляет для каждого встретившегося различного значения ключей, набор значений агрегатных функций. - -Не поддерживается GROUP BY по столбцам-массивам. - -Не поддерживается указание констант в качестве аргументов агрегатных функций. Пример: sum(1). Вместо этого, вы можете избавиться от констант. Пример: count(). - - -
    Модификатор WITH TOTALS
    - -Если указан модификатор WITH TOTALS, то будет посчитана ещё одна строчка, в которой в столбцах-ключах будут содержаться значения по умолчанию (нули, пустые строки), а в столбцах агрегатных функций - значения, посчитанные по всем строкам ("тотальные" значения). - -Эта дополнительная строчка выводится в форматах JSON*, TabSeparated*, Pretty* отдельно от остальных строчек. В остальных форматах эта строчка не выводится. - -В форматах JSON* строчка выводится отдельным полем totals. В форматах TabSeparated* строчка выводится после основного результата, и перед ней (после остальных данных) вставляется пустая строка. В форматах Pretty* строчка выводится отдельной табличкой после основного результата. - -WITH TOTALS может выполняться по-разному при наличии HAVING. Поведение зависит от настройки totals_mode. -По умолчанию totals_mode = 'before_having'. В этом случае totals считается по всем строчкам, включая непрошедших через HAVING и max_rows_to_group_by. - -Остальные варианты учитывают в totals только строчки, прошедшие через HAVING, и имеют разное поведение при наличии настройки max_rows_to_group_by и group_by_overflow_mode = 'any'. - -after_having_exclusive - не учитывать строчки, не прошедшие max_rows_to_group_by. То есть в totals попадёт меньше или столько же строчек, чем если бы max_rows_to_group_by не было. - -after_having_inclusive - учитывать в totals все строчки, не прошедшие max_rows_to_group_by. То есть в totals попадёт больше или столько же строчек, чем если бы max_rows_to_group_by не было. - -after_having_auto - считать долю строчек, прошедших через HAVING. Если она больше некоторого значения (по умолчанию - 50%), то включить все строчки, не прошедшие max_rows_to_group_by в totals, иначе - не включить. - -totals_auto_threshold - по умолчанию 0.5. Коэффициент для работы after_having_auto. - -Если max_rows_to_group_by и group_by_overflow_mode = 'any' не используются, то все варианты вида after_having не отличаются, и вы можете использовать любой из них, например, after_having_auto. - -Вы можете использовать WITH TOTALS в подзапросах, включая подзапросы в секции JOIN (в этом случае соответствующие тотальные значения будут соединены). - - -
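Условный пример (по аналогии с примерами выше):

%%
SELECT
    domainWithoutWWW(URL) AS domain,
    count() AS PageViews
FROM hits
GROUP BY domain WITH TOTALS
HAVING PageViews > 1000
%%

В строке totals столбец domain будет содержать пустую строку (значение по умолчанию), а PageViews - значение, посчитанное по всем строкам; какие именно строки учитываются при наличии HAVING, зависит от настройки totals_mode, как описано выше.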
    GROUP BY во внешней памяти
    - -Существует возможность включить сброс временных данных на диск для ограничения потребления оперативной памяти при GROUP BY. -Настройка %%max_bytes_before_external_group_by%% - потребление оперативки, при котором временные данные GROUP BY сбрасываются в файловую систему. Если равно 0 (по умолчанию) - значит выключено. - -При использовании %%max_bytes_before_external_group_by%% рекомендуется выставить %%max_memory_usage%% примерно в два раза больше. Это следует сделать, потому что агрегация выполняется в две стадии: чтение и формирование промежуточных данных (1) и слияние промежуточных данных (2). Сброс данных на файловую систему может производиться только на стадии 1. Если сброса временных данных не было, то на стадии 2 может потребляться до такого же объёма памяти, как на стадии 1. - -Например, если у вас %%max_memory_usage%% было выставлено в 10000000000, и вы хотите использовать внешнюю агрегацию, то имеет смысл выставить %%max_bytes_before_external_group_by%% в 10000000000, а %%max_memory_usage%% в 20000000000. При срабатывании внешней агрегации (если был хотя бы один сброс временных данных в файловую систему) максимальное потребление оперативки будет лишь чуть-чуть больше %%max_bytes_before_external_group_by%%. - -При распределённой обработке запроса внешняя агрегация производится на удалённых серверах. Для того чтобы на сервере-инициаторе запроса использовалось немного оперативки, нужно выставить настройку %%distributed_aggregation_memory_efficient%% в 1. - -При слиянии данных, сброшенных на диск, а также при слиянии результатов с удалённых серверов, при включенной настройке %%distributed_aggregation_memory_efficient%%, потребляется до 1/256 * количество потоков от общего объёма оперативки. - -При включенной внешней агрегации, если данных было меньше %%max_bytes_before_external_group_by%% (то есть сброса данных не было), то запрос работает так же быстро, как без внешней агрегации. Если же какие-то временные данные были сброшены, то время выполнения будет в несколько раз больше (примерно в три раза). - -Если после GROUP BY у вас есть ORDER BY с небольшим LIMIT, то на ORDER BY не будет тратиться существенного количества оперативки. -Но если есть ORDER BY без LIMIT, то не забудьте включить внешнюю сортировку (%%max_bytes_before_external_sort%%). - -
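Настройки из примера выше можно задать, например, запросами SET, после чего тяжёлый GROUP BY сможет сбрасывать временные данные на диск (запрос условный):

%%
SET max_bytes_before_external_group_by = 10000000000;
SET max_memory_usage = 20000000000;

SELECT UserID, count() AS c
FROM hits
GROUP BY UserID
ORDER BY c DESC
LIMIT 10
%%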
    Модификатор LIMIT N BY
- -LIMIT %%N%% BY %%COLUMNS%% позволяет выбрать топ %%N%% строк для каждой группы %%COLUMNS%%. %%LIMIT N BY%% не связан с %%LIMIT%% и они могут использоваться в одном запросе. Ключ для %%LIMIT N BY%% может содержать произвольное число колонок или выражений. - -Пример: - -%%SELECT - domainWithoutWWW(URL) AS domain, - domainWithoutWWW(REFERRER_URL) AS referrer, - device_type, - count() cnt -FROM hits -GROUP BY domain, referrer, device_type -ORDER BY cnt DESC -LIMIT 5 BY domain, device_type -LIMIT 100 -%% - -Запрос выберет топ 5 рефереров для каждой пары domain, device_type, а общее число строк результата будет ограничено 100 (%%LIMIT 100%%). - - -

    Секция HAVING

    - -Позволяет отфильтровать результат, полученный после GROUP BY, аналогично секции WHERE. -WHERE и HAVING отличаются тем, что WHERE выполняется до агрегации (GROUP BY), а HAVING - после. -Если агрегации не производится, то HAVING использовать нельзя. - - -
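Условный пример: такая фильтрация по count() невозможна в WHERE, так как значение агрегатной функции на этом этапе ещё не вычислено:

%%
SELECT
    domainWithoutWWW(URL) AS domain,
    count() AS PageViews
FROM hits
GROUP BY domain
HAVING PageViews > 100000
ORDER BY PageViews DESC
%%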

    Секция ORDER BY

    - -Секция ORDER BY содержит список выражений, к каждому из которых также может быть приписано DESC или ASC (направление сортировки). Если ничего не приписано - это аналогично приписыванию ASC. ASC - сортировка по возрастанию, DESC - сортировка по убыванию. Обозначение направления сортировки действует на одно выражение, а не на весь список. Пример: %%ORDER BY Visits DESC, SearchPhrase%% - -Для сортировки по значениям типа String есть возможность указать collation (сравнение). Пример: %%ORDER BY SearchPhrase COLLATE 'tr'%% - для сортировки по поисковой фразе, по возрастанию, с учётом турецкого алфавита, регистронезависимо, при допущении, что строки в кодировке UTF-8. COLLATE может быть указан или не указан для каждого выражения в ORDER BY независимо. Если есть ASC или DESC, то COLLATE указывается после них. При использовании COLLATE сортировка всегда регистронезависима. - -Рекомендуется использовать COLLATE только для окончательной сортировки небольшого количества строк, так как производительность сортировки с указанием COLLATE меньше, чем обычной сортировки по байтам. - -Строки, для которых список выражений, по которым производится сортировка, принимает одинаковые значения, выводятся в произвольном порядке, который может быть также недетерминированным (каждый раз разным). -Если секция ORDER BY отсутствует, то, аналогично, порядок, в котором идут строки, не определён, и может быть недетерминированным. - -При сортировке чисел с плавающей запятой, NaN-ы идут отдельно от остальных значений. Вне зависимости от порядка сортировки, NaN-ы помещаются в конец. То есть, при сортировке по возрастанию, они как будто больше всех чисел, а при сортировке по убыванию - как будто меньше всех. - -Если кроме ORDER BY указан также не слишком большой LIMIT, то расходуется меньше оперативки. Иначе расходуется количество памяти, пропорциональное количеству данных для сортировки. При распределённой обработке запроса, если отсутствует GROUP BY, сортировка частично делается на удалённых серверах, а на сервере-инициаторе запроса производится слияние результатов. Таким образом, при распределённой сортировке, может сортироваться объём данных, превышающий размер памяти на одном сервере. - -Существует возможность выполнять сортировку во внешней памяти (с созданием временных файлов на диске), если оперативной памяти не хватает. Для этого предназначена настройка %%max_bytes_before_external_sort%%. Если она выставлена в 0 (по умолчанию), то внешняя сортировка выключена. Если она включена, то при достижении объёмом данных для сортировки указанного количества байт, накопленные данные будут отсортированы и сброшены во временный файл. После того, как все данные будут прочитаны, будет произведено слияние всех сортированных файлов и выдача результата. Файлы записываются в директорию /var/lib/clickhouse/tmp/ (по умолчанию, может быть изменено с помощью параметра tmp_path) в конфиге. - -На выполнение запроса может расходоваться больше памяти, чем max_bytes_before_external_sort. Поэтому, значение этой настройки должно быть существенно меньше, чем max_memory_usage. Для примера, если на вашем сервере 128 GB оперативки, и вам нужно выполнить один запрос, то выставите max_memory_usage в 100 GB, а max_bytes_before_external_sort в 80 GB. - -Внешняя сортировка работает существенно менее эффективно, чем сортировка в оперативке. - -
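Условный пример с разными направлениями сортировки и COLLATE (COLLATE действует только на выражение SearchPhrase):

%%
SELECT SearchPhrase, count() AS c
FROM hits
GROUP BY SearchPhrase
ORDER BY c DESC, SearchPhrase COLLATE 'tr'
LIMIT 100
%%

Благодаря небольшому LIMIT сортировка потребует немного оперативки, как описано выше.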

    Секция SELECT

    - -После вычислений, соответствующих всем перечисленным выше секциям, производится вычисление выражений, указанных в секции SELECT. -Вернее, вычисляются выражения, стоящие над агрегатными функциями, если есть агрегатные функции. -Сами агрегатные функции и то, что под ними, вычисляются при агрегации (GROUP BY). -Эти выражения работают так, как будто применяются к отдельным строкам результата. - -
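Условный пример: выражение round(...) вычисляется уже над результатом агрегации, для каждой строки результата, а sum(Refresh) и count() - при выполнении GROUP BY:

%%
SELECT
    CounterID,
    count() AS hits,
    round(sum(Refresh) / count() * 100, 2) AS refresh_percent
FROM hits
GROUP BY CounterID
%%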

    Секция DISTINCT

    - -Если указано DISTINCT, то из всех множеств полностью совпадающих строк результата, будет оставляться только одна строка. -Результат выполнения будет таким же, как если указано GROUP BY по всем указанным полям в SELECT-е и не указаны агрегатные функции. Но имеется несколько отличий от GROUP BY: -- DISTINCT может применяться совместно с GROUP BY; -- при отсутствии ORDER BY и наличии LIMIT, запрос прекратит выполнение сразу после того, как будет прочитано необходимое количество различных строк - в этом случае использование DISTINCT существенно более оптимально; -- блоки данных будут выдаваться по мере их обработки, не дожидаясь выполнения всего запроса. - -DISTINCT не поддерживается, если в SELECT-е присутствует хотя бы один столбец типа массив. - -
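Условный пример: при наличии LIMIT и отсутствии ORDER BY запрос завершится, как только будет найдено 1000 различных значений:

%%SELECT DISTINCT UserID FROM hits WHERE CounterID = 34 LIMIT 1000%%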

    Секция LIMIT

    - -LIMIT m позволяет выбрать из результата первые m строк. -LIMIT n, m позволяет выбрать из результата первые m строк после пропуска первых n строк. - -n и m должны быть неотрицательными целыми числами. - -При отсутствии секции ORDER BY, однозначно сортирующей результат, результат может быть произвольным и может являться недетерминированным. - - -
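Условный пример: пропустить первые 100 строк результата и вернуть следующие 10:

%%SELECT UserID, EventTime FROM hits ORDER BY EventTime DESC LIMIT 100, 10%%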

    Секция UNION ALL

    - -Произвольное количество запросов может быть объединено с помощью UNION ALL. Пример: - -%% -SELECT CounterID, 1 AS table, toInt64(count()) AS c - FROM test.hits - GROUP BY CounterID - -UNION ALL - -SELECT CounterID, 2 AS table, sum(Sign) AS c - FROM test.visits - GROUP BY CounterID - HAVING c > 0 -%% - -Поддерживается только UNION ALL. Обычный UNION (UNION DISTINCT) не поддерживается. Если вам нужен UNION DISTINCT, то вы можете написать SELECT DISTINCT из подзапроса, содержащего UNION ALL. - -Запросы - части UNION ALL могут выполняться параллельно, и их результаты могут возвращаться вперемешку. - -Структура результатов (количество и типы столбцов) у запросов должна совпадать. Но имена столбцов могут отличаться. В этом случае, имена столбцов для общего результата будут взяты из первого запроса. - -Запросы - части UNION ALL нельзя заключить в скобки. ORDER BY и LIMIT применяются к отдельным запросам, а не к общему результату. Если вам нужно применить какое-либо преобразование к общему результату, то вы можете разместить все запросы с UNION ALL в подзапросе в секции FROM. - -
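Условный пример того, как применить ORDER BY и LIMIT к общему результату, разместив запросы с UNION ALL в подзапросе в секции FROM:

%%
SELECT CounterID, table, c
FROM
(
    SELECT CounterID, 1 AS table, toInt64(count()) AS c
        FROM test.hits
        GROUP BY CounterID

    UNION ALL

    SELECT CounterID, 2 AS table, sum(Sign) AS c
        FROM test.visits
        GROUP BY CounterID
        HAVING c > 0
)
ORDER BY c DESC
LIMIT 10
%%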

    Секция INTO OUTFILE

    - -При указании %%INTO OUTFILE filename%% (где filename - строковый литерал), результат запроса будет сохранён в файл filename. -В отличие от MySQL, файл создаётся на стороне клиента. Если файл с таким именем уже существует, это приведёт к ошибке. -Функциональность доступна в клиенте командной строки и clickhouse-local (попытка выполнить запрос с INTO OUTFILE через HTTP интерфейс приведёт к ошибке). - -Формат вывода по умолчанию - TabSeparated, как и в неинтерактивном режиме клиента командной строки. - -
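Условный пример (выполняется в клиенте командной строки; файл будет создан на стороне клиента, в его текущей директории):

%%SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 1000 INTO OUTFILE 'phrases.tsv'%%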

    Секция FORMAT

    - -При указании FORMAT format вы можете получить данные в любом указанном формате. -Это может использоваться для удобства или для создания дампов. -Подробнее смотрите раздел "Форматы". -Если секция FORMAT отсутствует, то используется формат по умолчанию, который зависит от используемого интерфейса для доступа к БД и от настроек. Для HTTP интерфейса, а также для клиента командной строки, используемого в batch-режиме, по умолчанию используется формат TabSeparated. Для клиента командной строки, используемого в интерактивном режиме, по умолчанию используется формат PrettyCompact (прикольные таблички, компактные). - -При использовании клиента командной строки данные на клиент передаются во внутреннем эффективном формате. При этом клиент самостоятельно интерпретирует секцию FORMAT запроса и форматирует данные на своей стороне (снимая нагрузку на сеть и сервер). - - -
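Условный пример получения результата в формате JSON:

%%SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10 FORMAT JSON%%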

    Операторы IN

    - -Операторы %%IN%%, %%NOT IN%%, %%GLOBAL IN%%, %%GLOBAL NOT IN%% рассматриваются отдельно, так как их функциональность достаточно богатая. - -В качестве левой части оператора, может присутствовать как один столбец, так и кортеж. - -Примеры: - -%%SELECT UserID IN (123, 456) FROM ...%% -%%SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ...%% - -Если слева стоит один столбец, входящий в индекс, а справа - множество констант, то при выполнении запроса, система воспользуется индексом. - -Не перечисляйте слишком большое количество значений (миллионы) явно. Если множество большое - лучше загрузить его во временную таблицу (например, смотрите раздел "Внешние данные для обработки запроса"), и затем воспользоваться подзапросом. - -В качестве правой части оператора может быть множество константных выражений, множество кортежей с константными выражениями (показано в примерах выше), а также имя таблицы или подзапрос SELECT в скобках. - -Если в качестве правой части оператора указано имя таблицы (например, %%UserID IN users%%), то это эквивалентно подзапросу %%UserID IN (SELECT * FROM users)%%. Это используется при работе с внешними данными, отправляемым вместе с запросом. Например, вместе с запросом может быть отправлено множество идентификаторов посетителей, загруженное во временную таблицу users, по которому следует выполнить фильтрацию. - -Если качестве правой части оператора, указано имя таблицы, имеющий движок Set (подготовленное множество, постоянно находящееся в оперативке), то множество не будет создаваться заново при каждом запросе. - -В подзапросе может быть указано более одного столбца для фильтрации кортежей. -Пример: -%%SELECT (CounterID, UserID) IN (SELECT CounterID, UserID FROM ...) FROM ...%% - -Типы столбцов слева и справа оператора %%IN%%, должны совпадать. - -Оператор IN и подзапрос могут встречаться в любой части запроса, в том числе в агрегатных и лямбда функциях. -Пример: - -%%SELECT - EventDate, - avg(UserID IN - ( - SELECT UserID - FROM test.hits - WHERE EventDate = toDate('2014-03-17') - )) AS ratio -FROM test.hits -GROUP BY EventDate -ORDER BY EventDate ASC - -┌──EventDate─┬────ratio─┐ -│ 2014-03-17 │ 1 │ -│ 2014-03-18 │ 0.807696 │ -│ 2014-03-19 │ 0.755406 │ -│ 2014-03-20 │ 0.723218 │ -│ 2014-03-21 │ 0.697021 │ -│ 2014-03-22 │ 0.647851 │ -│ 2014-03-23 │ 0.648416 │ -└────────────┴──────────┘ -%% -- за каждый день после 17 марта считаем долю хитов, сделанных посетителями, которые заходили на сайт 17 марта. - -Подзапрос в секции IN на одном сервере всегда выполняется только один раз. Зависимых подзапросов не существует. - - -

    Распределённые подзапросы

    - -Существует два варианта IN-ов с подзапросами (аналогично для JOIN-ов): обычный %%IN%% / %%JOIN%% и %%GLOBAL IN%% / %%GLOBAL JOIN%%. Они отличаются способом выполнения при распределённой обработке запроса. - -При использовании обычного %%IN%%-а, запрос отправляется на удалённые серверы, и на каждом из них выполняются подзапросы в секциях IN / JOIN. - -При использовании %%GLOBAL IN%% / %%GLOBAL JOIN%%-а, сначала выполняются все подзапросы для %%GLOBAL IN%% / %%GLOBAL JOIN%%-ов, и результаты складываются во временные таблицы. Затем эти временные таблицы передаются на каждый удалённый сервер, и на них выполняются запросы, с использованием этих переданных временных данных. - -Если запрос не распределённый, используйте обычный %%IN%% / %%JOIN%%. - - -Следует быть внимательным при использовании подзапросов в секции %%IN%% / %%JOIN%% в случае распределённой обработки запроса. - -Рассмотрим это на примерах. Пусть на каждом сервере кластера есть обычная таблица local_table. Пусть также есть таблица distributed_table типа Distributed, которая смотрит на все серверы кластера. - -При запросе к распределённой таблице distributed_table, запрос будет отправлен на все удалённые серверы, и на них будет выполнен с использованием таблицы local_table. - -Например, запрос -%%SELECT uniq(UserID) FROM distributed_table%% -будет отправлен на все удалённые серверы в виде -%%SELECT uniq(UserID) FROM local_table%% -, выполнен параллельно на каждом из них до стадии, позволяющей объединить промежуточные результаты; затем промежуточные результаты вернутся на сервер-инициатор запроса, будут на нём объединены, и финальный результат будет отправлен клиенту. - -Теперь рассмотрим запрос с IN-ом: -%%SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34)%% -- расчёт пересечения аудиторий двух сайтов. - -Этот запрос будет отправлен на все удалённые серверы в виде -%%SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34)%% -То есть, множество в секции %%IN%% будет собрано на каждом сервере независимо, только по тем данным, которые есть локально на каждом из серверов. - -Это будет работать правильно и оптимально, если вы предусмотрели такой случай, и раскладываете данные по серверам кластера таким образом, чтобы данные одного UserID-а лежали только на одном сервере. В таком случае все необходимые данные будут присутствовать на каждом сервере локально. В противном случае результат будет посчитан неточно. Назовём этот вариант запроса "локальный IN". - -Чтобы исправить работу запроса, когда данные размазаны по серверам кластера произвольным образом, можно было бы указать distributed_table внутри подзапроса. Запрос будет выглядеть так: -%%SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34)%% - -Этот запрос будет отправлен на все удалённые серверы в виде -%%SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID IN (SELECT UserID FROM distributed_table WHERE CounterID = 34)%% -На каждом удалённом сервере начнёт выполняться подзапрос. 
Так как в подзапросе используется распределённая таблица, то подзапрос будет, на каждом удалённом сервере, снова отправлен на каждый удалённый сервер, в виде -%%SELECT UserID FROM local_table WHERE CounterID = 34%% -Например, если у вас кластер из 100 серверов, то выполнение всего запроса потребует 10 000 элементарных запросов, что, как правило, является неприемлемым. - -В таких случаях всегда следует использовать %%GLOBAL IN%% вместо %%IN%%. Рассмотрим его работу для запроса -%%SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID GLOBAL IN (SELECT UserID FROM distributed_table WHERE CounterID = 34)%% - -На сервере-инициаторе запроса будет выполнен подзапрос -%%SELECT UserID FROM distributed_table WHERE CounterID = 34%% -, и результат будет сложен во временную таблицу в оперативке. Затем запрос будет отправлен на каждый удалённый сервер в виде -%%SELECT uniq(UserID) FROM local_table WHERE CounterID = 101500 AND UserID GLOBAL IN _data1%% -, и вместе с запросом, на каждый удалённый сервер будет отправлена временная таблица _data1 (имя временной таблицы - implementation defined). - -Это гораздо более оптимально, чем при использовании обычного IN. Но при этом, следует помнить о нескольких вещах: - -1. При создании временной таблицы данные не уникализируются. Чтобы уменьшить объём передаваемых по сети данных, укажите в подзапросе %%DISTINCT%% (для обычного IN-а этого делать не нужно). -2. Временная таблица будет передана на все удалённые серверы. Передача не учитывает топологию сети. Например, если 10 удалённых серверов расположены в удалённом относительно сервера-инициатора запроса датацентре, то по каналу в удалённый датацентр данные будет переданы 10 раз. Старайтесь не использовать большие множества при использовании %%GLOBAL IN%%. -3. При передаче данных на удалённые серверы не настраивается ограничение использования сетевой полосы. Вы можете перегрузить сеть. -4. Старайтесь распределять данные по серверам так, чтобы в %%GLOBAL IN%%-ах не было частой необходимости. -5. Если в %%GLOBAL IN%% есть частая необходимость, то спланируйте размещение кластера ClickHouse таким образом, чтобы в каждом датацентре была хотя бы одна реплика каждого шарда, и среди них была быстрая сеть - чтобы запрос целиком можно было бы выполнить, передавая данные в пределах одного датацентра. - -В секции %%GLOBAL IN%% также имеет смысл указывать локальную таблицу - в случае, если эта локальная таблица есть только на сервере-инициаторе запроса, и вы хотите воспользоваться данными из неё на удалённых серверах. - - -
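Условный пример к рекомендации 1 выше: DISTINCT в подзапросе уменьшает объём временной таблицы, передаваемой на удалённые серверы:

%%SELECT uniq(UserID) FROM distributed_table WHERE CounterID = 101500 AND UserID GLOBAL IN (SELECT DISTINCT UserID FROM distributed_table WHERE CounterID = 34)%%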

    Экстремальные значения

- -Вы можете получить в дополнение к результату также минимальные и максимальные значения по столбцам результата. Для этого выставьте настройку extremes в 1. Минимумы и максимумы считаются для числовых типов, дат, дат-с-временем. Для остальных столбцов будут выведены значения по умолчанию. - -Вычисляются дополнительные две строчки - минимумы и максимумы, соответственно. Эти дополнительные две строчки выводятся в форматах JSON*, TabSeparated*, Pretty* отдельно от остальных строчек. В остальных форматах они не выводятся. - -В форматах JSON* экстремальные значения выводятся отдельным полем extremes. В форматах TabSeparated* эти строчки выводятся после основного результата и после totals, если есть. Перед ними (после остальных данных) вставляется пустая строка. В форматах Pretty* они выводятся отдельной табличкой после основного результата и после totals, если есть. - -Экстремальные значения считаются по строчкам, прошедшим через LIMIT. Но при этом, при использовании LIMIT offset, size, строчки до offset учитываются в extremes. В потоковых запросах в результате может учитываться также небольшое количество строчек, прошедших LIMIT. - - -
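Условный пример: выставив настройку extremes (например, запросом %%SET extremes = 1%%), можно выполнить запрос вида

%%
SELECT EventDate, count() AS PageViews
FROM hits
WHERE CounterID = 34
GROUP BY EventDate
ORDER BY EventDate
FORMAT JSON
%%

и получить в выводе, помимо data и totals (если есть), поле extremes с минимальными и максимальными значениями по столбцам результата.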

    Замечания

    - -В секциях GROUP BY, ORDER BY, в отличие от диалекта MySQL, и в соответствии со стандартным SQL, не поддерживаются позиционные аргументы. -Например, если вы напишите GROUP BY 1, 2 - то это будет воспринято, как группировка по константам (то есть, агрегация всех строк в одну). - -Вы можете использовать синонимы (алиасы AS) в любом месте запроса. - -В любом месте запроса, вместо выражения, может стоять звёздочка. При анализе запроса звёздочка раскрывается в список всех столбцов таблицы (за исключением MATERIALIZED и ALIAS столбцов). Есть лишь немного случаев, когда оправдано использовать звёздочку: -- при создании дампа таблицы; -- для таблиц, содержащих всего несколько столбцов - например, системных таблиц; -- для получения информации о том, какие столбцы есть в таблице; в этом случае, укажите LIMIT 1. Но лучше используйте запрос DESC TABLE; -- при наличии сильной фильтрации по небольшому количеству столбцов с помощью PREWHERE; -- в подзапросах (так как из подзапросов выкидываются столбцы, не нужные для внешнего запроса). -В других случаях использование звёздочки является издевательством над системой, так как вместо преимуществ столбцовой СУБД вы получаете недостатки. То есть использовать звёздочку не рекомендуется. - - -===KILL QUERY=== -%%KILL QUERY WHERE <where expression to SELECT FROM system.processes query> [SYNC|ASYNC|TEST] [FORMAT format]%% -Пытается завершить исполняющиеся в данный момент запросы. -Запросы для завершения выбираются из таблицы %%system.processes%% для которых выражение после WHERE истинно. - -Примеры: -%%KILL QUERY WHERE query_id='2-857d-4a57-9ee0-327da5d60a90'%% -Завершает все запросы с указанным %%query_id%%. - -%%KILL QUERY WHERE user='username' SYNC%% -Синхронно завершает все запросы пользователя %%username%%. - -Readonly-пользователи могут совершать только свои запросы. -По-умолчанию используется асинхронный вариант запроса (ASYNC), который завершается не ожидая завершения запросов. -Синхронный вариант (SYNC) ожидает завершения всех запросов и построчно выводит информацию о процессах по ходу их завершения. -Ответ содержит колонку kill_status, которая может принимать следующие значения: -1. 'finished' - запрос успешно завершился; -2. 'waiting' - запросу отправлен сигнал завершения, ожидается его завершение; -3. остальные значения описывают причину невозможности завершения запроса. -Тестовый вариант запроса (TEST) только проверяет права пользователя и выводит список запросов для завершения. - - -
    -
    -

    Внешние данные для обработки запроса

    -
    -
    - -ClickHouse позволяет отправить на сервер данные, необходимые для обработки одного запроса, вместе с запросом SELECT. Такие данные будут положены во временную таблицу (см. раздел "Временные таблицы") и смогут использоваться в запросе (например, в операторах IN). - -Для примера, если у вас есть текстовый файл с важными идентификаторами посетителей, вы можете загрузить его на сервер вместе с запросом, в котором используется фильтрация по этому списку. - -Если вам нужно будет выполнить более одного запроса с достаточно большими внешними данными - лучше не использовать эту функциональность, а загрузить данные в БД заранее. - -Внешние данные могут быть загружены как с помощью клиента командной строки (в неинтерактивном режиме), так и через HTTP-интерфейс. - -В клиенте командной строки, может быть указана секция параметров вида - -%%--external --file=... [--name=...] [--format=...] [--types=...|--structure=...]%% - -Таких секций может быть несколько - по числу передаваемых таблиц. - ---external - маркер начала секции. ---file - путь к файлу с дампом таблицы, или %%-%%, что обозначает stdin. -Из stdin может быть считана только одна таблица. - -Следующие параметры не обязательные: ---name - имя таблицы. Если не указано - используется %%_data%%. ---format - формат данных в файле. Если не указано - используется %%TabSeparated%%. - -Должен быть указан один из следующих параметров: ---types - список типов столбцов через запятую. Например, %%UInt64,String%%. Столбцы будут названы %%_1%%, %%_2%%, ... ---structure - структура таблицы, в форме %%UserID UInt64, URL String%%. Определяет имена и типы столбцов. - -Файлы, указанные в %%file%%, будут разобраны форматом, указанным в %%format%%, с использованием типов данных, указанных в %%types%% или %%structure%%. Таблица будет загружена на сервер, и доступна там в качестве временной таблицы с именем %%name%%. - -Примеры: - -%%echo -ne "1\n2\n3\n" | clickhouse-client --query="SELECT count() FROM test.visits WHERE TraficSourceID IN _data" --external --file=- --types=Int8 -849897 -%% - -%%cat /etc/passwd | sed 's/:/\t/g' | clickhouse-client --query="SELECT shell, count() AS c FROM passwd GROUP BY shell ORDER BY c DESC" --external --file=- --name=passwd --structure='login String, unused String, uid UInt16, gid UInt16, comment String, home String, shell String' -/bin/sh 20 -/bin/false 5 -/bin/bash 4 -/usr/sbin/nologin 1 -/bin/sync 1 -%% - -При использовании HTTP интерфейса, внешние данные передаются в формате multipart/form-data. Каждая таблица передаётся отдельным файлом. Имя таблицы берётся из имени файла. В query_string передаются параметры name_format, name_types, name_structure, где name - имя таблицы, которой соответствуют эти параметры. Смысл параметров такой же, как при использовании клиента командной строки. - -Пример: - -
    cat /etc/passwd | sed 's/:/\t/g' > passwd.tsv
    -
    -curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+count()+AS+c+FROM+passwd+GROUP+BY+shell+ORDER+BY+c+DESC&passwd_structure=login+String,+unused+String,+uid+UInt16,+gid+UInt16,+comment+String,+home+String,+shell+String'
    -/bin/sh 20
    -/bin/false      5
    -/bin/bash       4
    -/usr/sbin/nologin       1
    -/bin/sync       1
    -
    - -При распределённой обработке запроса, временные таблицы передаются на все удалённые серверы. - -
    -
    -

    Движки таблиц

    -
    -
    - -Движок таблицы (тип таблицы) определяет: -- как и где хранятся данные - куда их писать и откуда читать; -- какие запросы поддерживаются, и каким образом; -- конкуррентный доступ к данным; -- использование индексов, если есть; -- возможно ли многопоточное выполнение запроса; -- репликацию данных; -- при чтении, движок обязан лишь достать нужный набор столбцов; - но в некоторых случаях, запрос может быть частично обработан в рамках движка таблицы. - -Забегая вперёд, заметим, что для большинства серьёзных задач, следует использовать движки семейства MergeTree. - - -==TinyLog== - -Самый простой движок таблиц, который хранит данные на диске. -Каждый столбец хранится в отдельном сжатом файле. -При записи, данные дописываются в конец файлов. -Конкуррентный доступ к данным никак не ограничивается: -- если вы одновременно читаете из таблицы и в другом запросе пишете в неё, то чтение будет завершено с ошибкой; -- если вы одновременно пишите в таблицу в нескольких запросах, то данные будут битыми. -Типичный способ использования этой таблицы - это write-once: сначала один раз только пишем данные, а потом сколько угодно читаем. -Запросы выполняются в один поток. То есть, этот движок предназначен для сравнительно маленьких таблиц (рекомендуется до 1 000 000 строк). -Этот движок таблиц имеет смысл использовать лишь в случае, если у вас есть много маленьких таблиц, так как он проще, чем движок Log (требуется открывать меньше файлов). -Случай, когда у вас много маленьких таблиц, является гарантированно плохим по производительности, но может уже использоваться при работе с другой СУБД, и вам может оказаться удобнее перейти на использование таблиц типа TinyLog. -Индексы не поддерживаются. - -В Яндекс.Метрике таблицы типа TinyLog используются для промежуточных данных, обрабатываемых маленькими пачками. - - -==Log== - -Отличается от TinyLog тем, что вместе с файлами столбцов лежит небольшой файл "засечек". Засечки пишутся на каждый блок данных и содержат смещение - с какого места нужно читать файл, чтобы пропустить заданное количество строк. Это позволяет читать данные из таблицы в несколько потоков. -При конкуррентном доступе к данным, чтения могут выполняться одновременно, а записи блокируют чтения и друг друга. -Движок Log не поддерживает индексы. Также, если при записи в таблицу произошёл сбой, то таблица станет битой, и чтения из неё будут возвращать ошибку. Движок Log подходит для временных данных, write-once таблиц, а также для тестовых и демонстрационных целей. - - -==Memory== - -Хранит данные в оперативке, в несжатом виде. Данные хранятся именно в таком виде, в каком они получаются при чтении. То есть, само чтение из этой таблицы полностью бесплатно. -Конкуррентный доступ к данным синхронизируется. Блокировки короткие: чтения и записи не блокируют друг друга. -Индексы не поддерживаются. Чтение распараллеливается. -За счёт отсутствия чтения с диска, разжатия и десериализации данных, удаётся достичь максимальной производительности (выше 10 ГБ/сек.) на простых запросах. (Стоит заметить, что во многих случаях, производительность движка MergeTree, почти такая же высокая.) -При перезапуске сервера, данные из таблицы исчезают и таблица становится пустой. -Обычно, использование этого движка таблиц является неоправданным. Тем не менее, он может использоваться для тестов, а также в задачах, где важно достичь максимальной скорости на не очень большом количестве строк (примерно до 100 000 000). 
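Условный пример создания таблицы с движком Memory (для TinyLog и Log определение отличается только именем движка; имена таблицы и столбцов выбраны для иллюстрации):

%%CREATE TABLE test.visits_memory (CounterID UInt32, UserID UInt64, Duration UInt32) ENGINE = Memory%%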
- -Движок Memory используется системой для временных таблиц - внешних данных запроса (смотрите раздел "Внешние данные для обработки запроса"), для реализации GLOBAL IN (смотрите раздел "Операторы IN"). - - -==File(InputFormat)== - -Источником данных является файл, хранящий данные в одном из поддерживаемых форматов входных данных (TabSeparated, Native, и т. д.) ... - - -==Merge== - -Движок Merge (не путайте с движком MergeTree) не хранит данные самостоятельно, а позволяет читать одновременно из произвольного количества других таблиц. -Чтение автоматически распараллеливается. Запись в таблицу не поддерживается. При чтении будут использованы индексы тех таблиц, из которых реально идёт чтение, если они существуют. -Движок Merge принимает параметры: имя базы данных и регулярное выражение для таблиц. Пример: - -%%Merge(hits, '^WatchLog')%% - -- данные будут читаться из таблиц в базе hits, имена которых соответствуют регулярному выражению '^WatchLog'. - -Вместо имени базы данных может использоваться константное выражение, возвращающее строку. Например, %%currentDatabase()%%. - -Регулярные выражения - re2 (как PCRE, но без особых извратов), регистрозависимые. -Смотрите замечание об экранировании в регулярных выражениях в разделе "match". - -При выборе таблиц для чтения, сама Merge-таблица не будет выбрана, даже если попадает под регулярное выражение - чтобы не возникло циклов. -Впрочем, вы можете создать две Merge-таблицы, которые будут пытаться бесконечно читать данные друг-друга. Этого делать не нужно. - -Типичный способ использования движка Merge - возможность работы с большим количеством таблиц типа TinyLog, как с одной. - -===Виртуальные столбцы=== - -Виртуальные столбцы - столбцы, предоставляемые движком таблиц, независимо от определения таблицы. То есть, такие столбцы не указываются в CREATE TABLE, но доступны для SELECT-а. - -Виртуальные столбцы отличаются от обычных следующими особенностями: -- они не указываются в определении таблицы; -- в них нельзя вставить данные при INSERT-е; -- при INSERT-е без указания списка столбцов, виртуальные столбцы не учитываются; -- они не выбираются при использовании звёздочки (SELECT *); -- виртуальные столбцы не показываются в запросах SHOW CREATE TABLE и DESC TABLE; - -Таблица типа Merge содержит виртуальный столбец _table типа String. (Если в таблице уже есть столбец _table, то виртуальный столбец называется _table1; если уже есть _table1, то _table2 и т. п.) Он содержит имя таблицы, из которой были прочитаны данные. - -Если секция WHERE/PREWHERE содержит (в качестве одного из элементов конъюнкции или в качестве всего выражения) условия на столбец _table, не зависящие от других столбцов таблицы, то эти условия используются как индекс: условия выполняются над множеством имён таблиц, из которых нужно читать данные, и чтение будет производиться только из тех таблиц, для которых условия сработали. - - -==Distributed== - -Движок Distributed не хранит данные самостоятельно, а позволяет обрабатывать запросы распределённо, на нескольких серверах. -Чтение автоматически распараллеливается. При чтении будут использованы индексы таблиц на удалённых серверах, если есть. -Движок Distributed принимает параметры: имя кластера в конфигурационном файле сервера, имя удалённой базы данных, имя удалённой таблицы, а также (не обязательно) ключ шардирования. -Пример: - -%%Distributed(logs, default, hits[, sharding_key])%% - -- данные будут читаться со всех серверов кластера logs, из таблицы default.hits, расположенной на каждом сервере кластера. 
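Полное определение такой таблицы могло бы выглядеть так (имя hits_all условное; структура копируется из default.hits, rand() - ключ шардирования для записи):

%%CREATE TABLE hits_all AS default.hits ENGINE = Distributed(logs, default, hits, rand())%%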
-Данные не только читаются, но и частично (настолько, насколько это возможно) обрабатываются на удалённых серверах. -Например, при запросе с GROUP BY, данные будут агрегированы на удалённых серверах, промежуточные состояния агрегатных функций будут отправлены на запросивший сервер; затем данные будут доагрегированы. - -Вместо имени базы данных может использоваться константное выражение, возвращающее строку. Например, %%currentDatabase()%%. - -logs - имя кластера в конфигурационном файле сервера. - -Кластеры задаются следующим образом: - -%% -<remote_servers> - <logs> - <shard> - <!-- Не обязательно. Вес шарда при записи данных. По умолчанию, 1. --> - <weight>1</weight> - <!-- Не обязательно. Записывать ли данные только на одну, любую из реплик. По умолчанию, false - записывать данные на все реплики. --> - <internal_replication>false</internal_replication> - <replica> - <host>example01-01-1</host> - <port>9000</port> - </replica> - <replica> - <host>example01-01-2</host> - <port>9000</port> - </replica> - </shard> - <shard> - <weight>2</weight> - <internal_replication>false</internal_replication> - <replica> - <host>example01-02-1</host> - <port>9000</port> - </replica> - <replica> - <host>example01-02-2</host> - <port>9000</port> - </replica> - </shard> - </logs> -</remote_servers> -%% - -Здесь задан кластер с именем logs, состоящий из двух шардов, каждый из которых состоит из двух реплик. -Шардами называются серверы, содержащие разные части данных (чтобы прочитать все данные, нужно идти на все шарды). -Репликами называются дублирующие серверы (чтобы прочитать данные, можно идти за данными на любую из реплик). - -В качестве параметров для каждого сервера указываются host, port и, не обязательно, user, password. -host - адрес удалённого сервера. Может быть указан домен, или IPv4 или IPv6 адрес. В случае указания домена, при старте сервера делается DNS запрос, и результат запоминается на всё время работы сервера. Если DNS запрос неуспешен, то сервер не запускается. Если вы изменяете DNS-запись, перезапустите сервер. -port - TCP-порт для межсерверного взаимодействия (в конфиге - tcp_port, обычно 9000). Не перепутайте с http_port. -user - имя пользователя для соединения с удалённым сервером. по умолчанию - default. Этот пользователь должен иметь доступ для соединения с указанным сервером. Доступы настраиваются в файле users.xml, подробнее смотрите в разделе "Права доступа". -password - пароль для соединения с удалённым сервером, в открытом виде. по умолчанию - пустая строка. - -При указании реплик, для каждого из шардов, при чтении, будет выбрана одна из доступных реплик. Можно настроить алгоритм балансировки нагрузки (то есть, предпочтения, на какую из реплик идти) - см. настройку load_balancing. -Если соединение с сервером не установлено, то будет произведена попытка соединения с небольшим таймаутом. Если соединиться не удалось, то будет выбрана следующая реплика, и так для всех реплик. Если попытка соединения для всех реплик не удалась, то будут снова произведены попытки соединения по кругу, и так несколько раз. -Это работает в пользу отказоустойчивости, хотя и не обеспечивает полную отказоустойчивость: удалённый сервер может принять соединение, но не работать, или плохо работать. - -Можно указать от одного шарда (в таком случае, обработку запроса стоит называть удалённой, а не распределённой) до произвольного количества шардов. В каждом шарде можно указать от одной до произвольного числа реплик. Можно указать разное число реплик для каждого шарда. 
- -Вы можете прописать сколько угодно кластеров в конфигурации. - -Для просмотра имеющихся кластеров, вы можете использовать системную таблицу system.clusters. - -Движок Distributed позволяет работать с кластером, как с локальным сервером. При этом, кластер является неэластичным: вы должны прописать его конфигурацию в конфигурационный файл сервера (лучше всех серверов кластера). - -Не поддерживаются Distributed таблицы, смотрящие на другие Distributed таблицы (за исключением случаев, когда у Distributed таблицы всего один шард). Вместо этого, сделайте так, чтобы Distributed таблица смотрела на "конечные" таблицы. - -Как видно, движок Distributed требует прописывания кластера в конфигурационный файл; кластера из конфигурационного файла обновляются налету, без перезапуска сервера. Если вам необходимо каждый раз отправлять запрос на неизвестный набор шардов и реплик, вы можете не создавать Distributed таблицу, а воспользоваться табличной функцией remote. Смотрите раздел "Табличные функции". - -Есть два способа записывать данные на кластер: - -Во первых, вы можете самостоятельно определять, на какие серверы какие данные записывать, и выполнять запись непосредственно на каждый шард. То есть, делать INSERT в те таблицы, на которые "смотрит" распределённая таблица. -Это наиболее гибкое решение - вы можете использовать любую схему шардирования, которая может быть нетривиальной из-за требований предметной области. -Также это является наиболее оптимальным решением, так как данные могут записываться на разные шарды полностью независимо. - -Во вторых, вы можете делать INSERT в Distributed таблицу. В этом случае, таблица будет сама распределять вставляемые данные по серверам. -Для того, чтобы писать в Distributed таблицу, у неё должен быть задан ключ шардирования (последний параметр). Также, если шард всего-лишь один, то запись работает и без указания ключа шардирования (так как в этом случае он не имеет смысла). - -У каждого шарда в конфигурационном файле может быть задан "вес" (weight). По умолчанию, вес равен единице. Данные будут распределяться по шардам в количестве, пропорциональном весу шарда. Например, если есть два шарда, и у первого выставлен вес 9, а у второго 10, то на первый будет отправляться 9 / 19 доля строк, а на второй - 10 / 19. - -У каждого шарда в конфигурационном файле может быть указан параметр internal_replication. - -Если он выставлен в true, то для записи будет выбираться первая живая реплика и данные будут писаться на неё. Этот вариант следует использовать, если Distributed таблица "смотрит" на реплицируемые таблицы. То есть, если таблица, в которую будут записаны данные, будет сама заниматься их репликацией. - -Если он выставлен в false (по умолчанию), то данные будут записываться на все реплики. По сути, это означает, что Distributed таблица занимается репликацией данных самостоятельно. Это хуже, чем использование реплицируемых таблиц, так как не контролируется консистентность реплик, и они со временем будут содержать немного разные данные. - -Для выбора шарда, на который отправляется строка данных, вычисляется выражение шардирования, и берётся его остаток от деления на суммарный вес шардов. Строка отправляется на шард, соответствующий полуинтервалу остатков от prev_weights до prev_weights + weight, где prev_weights - сумма весов шардов с меньшим номером, а weight - вес этого шарда. 
Например, если есть два шарда, и у первого выставлен вес 9, а у второго 10, то строка будет отправляться на первый шард для остатков из диапазона [0, 9), а на второй - для остатков из диапазона [9, 19). - -Выражением шардирования может быть произвольное выражение от констант и столбцов таблицы, возвращающее целое число. Например, вы можете использовать выражение rand() для случайного распределения данных, или UserID - для распределения по остатку от деления идентификатора посетителя (тогда данные одного посетителя будут расположены на одном шарде, что упростит выполнение IN и JOIN по посетителям). Если распределение какого-либо столбца недостаточно равномерное, вы можете обернуть его в хэш-функцию: intHash64(UserID). - -Простой остаток от деления является довольно ограниченным решением для шардирования и подходит не для всех случаев. Он подходит для среднего и большого объёма данных (десятки серверов), но не для очень больших объёмов данных (сотни серверов и больше). В последнем случае, лучше использовать схему шардирования, продиктованную требованиями предметной области, и не использовать возможность записи в Distributed таблицы. - -В случае использования реплицированных таблиц, есть возможность перешардировать данные - смотрите раздел "Перешардирование". Но во многих случаях лучше обойтись без этого. Запросы SELECT отправляются на все шарды, и работают независимо от того, каким образом данные распределены по шардам (они могут быть распределены полностью случайно). При добавлении нового шарда, можно не переносить на него старые данные, а записывать новые данные с большим весом - данные будут распределены слегка неравномерно, но запросы будут работать корректно и достаточно эффективно. - -Беспокоиться о схеме шардирования имеет смысл в следующих случаях: -- используются запросы, требующие соединения данных (IN, JOIN) по определённому ключу - тогда если данные шардированы по этому ключу, то можно использовать локальные IN, JOIN вместо GLOBAL IN, GLOBAL JOIN, что кардинально более эффективно. -- используется большое количество серверов (сотни и больше) и большое количество маленьких запросов (запросы отдельных клиентов - сайтов, рекламодателей, партнёров) - тогда, для того, чтобы маленькие запросы не затрагивали весь кластер, имеет смысл располагать данные одного клиента на одном шарде, или (вариант, который используется в Яндекс.Метрике) сделать двухуровневое шардирование: разбить весь кластер на "слои", где слой может состоять из нескольких шардов; данные для одного клиента располагаются на одном слое, но в один слой можно по мере необходимости добавлять шарды, в рамках которых данные распределены произвольным образом; создаются распределённые таблицы на каждый слой и одна общая распределённая таблица для глобальных запросов. - -Запись данных осуществляется полностью асинхронно. При INSERT-е в Distributed таблицу, блок данных всего лишь записывается в локальную файловую систему. Данные отправляются на удалённые серверы в фоне, при первой возможности. Вы должны проверять, успешно ли отправляются данные, проверяя список файлов (данные, ожидающие отправки) в директории таблицы: /var/lib/clickhouse/data/database/table/. - -Если после INSERT-а в Distributed таблицу, сервер перестал существовать или был грубо перезапущен (например, вследствие аппаратного сбоя), то записанные данные могут быть потеряны. Если в директории таблицы обнаружен повреждённый кусок данных, то он переносится в поддиректорию broken и больше не используется. 
- -При выставлении опции max_parallel_replicas выполнение запроса распараллеливается по всем репликам внутри одного шарда. Подробнее смотрите раздел "Настройки, max_parallel_replicas". - -==MergeTree== - -Движок MergeTree поддерживает индекс по первичному ключу и по дате, и обеспечивает возможность обновления данных в реальном времени. -Это наиболее продвинутый движок таблиц в ClickHouse. Не путайте с движком Merge. - -Движок принимает параметры: имя столбца типа Date, содержащего дату; выражение для семплирования (не обязательно); кортеж, определяющий первичный ключ таблицы; гранулированность индекса. Пример: - -Пример без поддержки сэмплирования: -%%MergeTree(EventDate, (CounterID, EventDate), 8192)%% - -Пример с поддержкой сэмплирования: -%%MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID)), 8192)%% - -В таблице типа MergeTree обязательно должен быть отдельный столбец, содержащий дату. В этом примере, это - столбец EventDate. Тип столбца с датой - обязательно Date (а не DateTime). - -Первичным ключом может быть кортеж из произвольных выражений (обычно это просто кортеж столбцов) или одно выражение. - -Выражение для сэмплирования (использовать не обязательно) - произвольное выражение. Оно должно также присутствовать в первичном ключе. В примере используется хэширование по идентификатору посетителя, чтобы псевдослучайно перемешать данные в таблице для каждого CounterID и EventDate. То есть, при использовании секции SAMPLE в запросе, вы получите равномерно-псевдослучайную выборку данных для подмножества посетителей. - -Таблица реализована, как набор кусочков. Каждый кусочек сортирован по первичному ключу. Также, для каждого кусочка прописана минимальная и максимальная дата. При вставке в таблицу, создаётся новый сортированный кусочек. В фоне, периодически инициируется процесс слияния. При слиянии, выбирается несколько кусочков, обычно наименьших, и сливаются в один большой сортированный кусочек. - -То есть, при вставке в таблицу производится инкрементальная сортировка. Слияние реализовано таким образом, что таблица постоянно состоит из небольшого количества сортированных кусочков, а также само слияние делает не слишком много работы. - -При вставке, данные относящиеся к разным месяцам, разбиваются на разные кусочки. Кусочки, соответствующие разным месяцам, никогда не объединяются. Это сделано, чтобы обеспечить локальность модификаций данных (для упрощения бэкапов). - -Кусочки объединяются до некоторого предельного размера - чтобы не было слишком длительных слияний. - -Для каждого кусочка также пишется индексный файл. Индексный файл содержит значение первичного ключа для каждой index_granularity строки таблицы. То есть, это - разреженный индекс сортированных данных. - -Для столбцов также пишутся "засечки" каждую index_granularity строку, чтобы данные можно было читать в определённом диапазоне. - -При чтении из таблицы, запрос SELECT анализируется на предмет того, можно ли использовать индексы. -Индекс может использоваться, если в секции WHERE/PREWHERE, в качестве одного из элементов конъюнкции, или целиком, есть выражение, представляющее операции сравнения на равенства, неравенства, а также IN над столбцами, входящими в первичный ключ / дату, а также логические связки над ними. - -Таким образом, обеспечивается возможность быстро выполнять запросы по одному или многим диапазонам первичного ключа. 
Например, в указанном примере, будут быстро работать запросы для конкретного счётчика; для конкретного счётчика и диапазона дат; для конкретного счётчика и даты, для нескольких счётчиков и диапазона дат и т. п. - -%%SELECT count() FROM table WHERE EventDate = toDate(now()) AND CounterID = 34%% -%%SELECT count() FROM table WHERE EventDate = toDate(now()) AND (CounterID = 34 OR CounterID = 42)%% -%%SELECT count() FROM table WHERE ((EventDate >= toDate('2014-01-01') AND EventDate <= toDate('2014-01-31')) OR EventDate = toDate('2014-05-01')) AND CounterID IN (101500, 731962, 160656) AND (CounterID = 101500 OR EventDate != toDate('2014-05-01'))%% - -Во всех этих случаях будет использоваться индекс по дате и по первичному ключу. Видно, что индекс используется даже для достаточно сложных выражений. Чтение из таблицы организовано так, что использование индекса не может быть медленнее full scan-а. - -В этом примере, индекс не может использоваться: -%%SELECT count() FROM table WHERE CounterID = 34 OR URL LIKE '%upyachka%'%% - -Индекс по дате обеспечивает чтение только кусков, содержащих даты из нужного диапазона. При этом, кусок данных может содержать данные за многие даты (до целого месяца), а в пределах одного куска, данные лежат упорядоченными по первичному ключу, который может не содержать дату в качестве первого столбца. В связи с этим, при использовании запроса с указанием условия только на дату, но не на префикс первичного ключа, будет читаться данных больше, чем за одну дату. - -Для конкуррентного доступа к таблице, используется мульти-версионность. То есть, при одновременном чтении и обновлении таблицы, данные будут читаться из набора кусочков, актуального на момент запроса. Длинных блокировок нет. Вставки никак не мешают чтениям. - -Чтения из таблицы автоматически распараллеливаются. - -Поддерживается запрос OPTIMIZE, который вызывает один внеочередной шаг слияния. - -Вы можете использовать одну большую таблицу, постоянно добавляя в неё данные небольшими пачками - именно для этого предназначен движок MergeTree. - -Для всех типов таблиц семейства MergeTree возможна репликация данных - смотрите раздел "Репликация данных". - - -==CollapsingMergeTree== - -Движок достаточно специфичен для Яндекс.Метрики. -Отличается от MergeTree тем, что позволяет автоматически удалять - "схлопывать" некоторые пары строк при слиянии. - -В Яндекс.Метрике есть обычные логи (например, лог хитов) и логи изменений. Логи изменений используются, чтобы инкрементально считать статистику по постоянно меняющимся данным. Например - логи изменений визитов, логи изменений истории посетителей. Визиты в Яндекс.Метрике постоянно меняются - например, увеличивается количество хитов в визите. Изменением какого либо объекта будем называть пару (?старые значения, ?новые значения). Старые значения могут отсутствовать, если объект создался. Новые значения могут отсутствовать, если объект удалился. Если объект изменился, но был раньше и не удалился - присутствует оба значения. В лог изменений, для каждого изменения, пишется от одной до двух записей. Каждая запись содержит все те же атрибуты, что и сам объект, и ещё специальный атрибут, который позволяет отличить старые и новые значения. Видно, что при изменении объектов, в лог изменений лишь дописываются новые записи и не трогаются уже имеющиеся. - -Лог изменений позволяет инкрементально считать почти любую статистику. Для этого надо учитывать "новые" строки с положительным знаком, и "старые" строки с отрицательным знаком. 
То есть, возможно инкрементально считать все статистики, алгебраическая структура которых содержит операцию взятия обратного элемента. Большинство статистик именно такие. Также удаётся посчитать "идемпотентные" статистики, например, количество уникальных посетителей, так как при изменении визитов, уникальные посетители не удаляются. - -Это - основная идея, благодаря которой Яндекс.Метрика работает в реальном времени. - -CollapsingMergeTree принимает дополнительный параметр - имя столбца типа Int8, содержащего "знак" строки. Пример: - -%%CollapsingMergeTree(EventDate, (CounterID, EventDate, intHash32(UniqID), VisitID), 8192, Sign)%% - -Здесь Sign - столбец, содержащий -1 для "старых" значений и 1 для "новых" значений. - -При слиянии, для каждой группы идущих подряд одинаковых значений первичного ключа (столбцов, по которым сортируются данные), остаётся не более одной строки со значением столбца sign_column = -1 ("отрицательной строки") и не более одной строки со значением столбца sign_column = 1 ("положительной строки"). То есть - производится схлопывание записей из лога изменений. - -Если количество положительных и отрицательных строк совпадает - то пишет первую отрицательную и последнюю положительную строку. -Если положительных на 1 больше, чем отрицательных - то пишет только последнюю положительную строку. -Если отрицательных на 1 больше, чем положительных - то пишет только первую отрицательную строку. -Иначе - логическая ошибка, и ни одна из таких строк не пишется. (Логическая ошибка может возникать, если случайно один кусок лога был вставлен более одного раза. Поэтому, об ошибке всего лишь пишется в лог сервера, и слияние продолжает работать.) - -Как видно, от схлопывания не должны меняться результаты расчётов статистик. -Изменения постепенно схлопываются так что в конце-концов, для почти каждого объекта, остаются лишь его последние значения. -По сравнению с MergeTree, движок CollapsingMergeTree позволяет в несколько раз уменьшить объём данных. - -Существует несколько способов получения полностью "схлопнутых" данных из таблицы типа CollapsingMergeTree: -1. Написать запрос с GROUP BY и агрегатными функциями, учитывающими знак. Например, чтобы посчитать количество, надо вместо count() написать sum(Sign); чтобы посчитать сумму чего-либо, надо вместо sum(x) написать sum(Sign * x) и т. п., а также добавить HAVING sum(Sign) > 0. Не все величины можно посчитать подобным образом. Например, агрегатные функции min, max не могут быть переписаны. -2. Если необходимо вынимать данные без агрегации (например, проверить наличие строк, самые новые значения которых удовлетворяют некоторым условиям), можно использовать модификатор FINAL для секции FROM. Это вариант существенно менее эффективен. - - -==SummingMergeTree== - -Отличается от MergeTree тем, что суммирует данные при слиянии. - -%%SummingMergeTree(EventDate, (OrderID, EventDate, BannerID, ...), 8192)%% - -Столбцы для суммирования заданы неявно. При слиянии, для всех строчек с одинаковым значением первичного ключа (в примере - OrderID, EventDate, BannerID, ...), производится суммирование значений в числовых столбцах, не входящих в первичный ключ. - -%%SummingMergeTree(EventDate, (OrderID, EventDate, BannerID, ...), 8192, (Shows, Clicks, Cost, ...))%% - -Явно заданные столбцы для суммирования (последний параметр - Shows, Clicks, Cost, ...). При слиянии, для всех строчек с одинаковым значением первичного ключа, производится суммирование значений в указанных столбцах. Указанные столбцы также должны быть числовыми и не входить в первичный ключ. 
- -Если значения во всех таких столбцах оказались нулевыми, то строчка удаляется. (За исключением случаев, когда в куске данных не осталось бы ни одной строчки.) - -Для остальных столбцов, не входящих в первичный ключ, при слиянии выбирается первое попавшееся значение. - -При чтении, суммирование не делается само по себе. Если оно необходимо - напишите соответствующий GROUP BY. - -Дополнительно, таблица может иметь вложенные структуры данных, которые обрабатываются особым образом. -Если название вложенной таблицы заканчивается на Map и она содержит не менее двух столбцов, удовлетворяющих следующим критериям: -- первый столбец - числовой ((U)IntN, Date, DateTime), назовем его условно key, -- остальные столбцы - арифметические ((U)IntN, Float32/64), условно (values...), -то такая вложенная таблица воспринимается как отображение key => (values...) и при слиянии ее строк выполняется слияние элементов двух множеств по key со сложением соответствующих (values...). -Примеры: - -%% -[(1, 100)] + [(2, 150)] -> [(1, 100), (2, 150)] -[(1, 100)] + [(1, 150)] -> [(1, 250)] -[(1, 100)] + [(1, 150), (2, 150)] -> [(1, 250), (2, 150)] -[(1, 100), (2, 150)] + [(1, -100)] -> [(2, 150)] -%% - -Для вложенных структур данных не нужно указывать её столбцы в качестве списка столбцов для суммирования. - -Этот движок таблиц разработан по просьбе БК, и является мало полезным. Помните, что при хранении лишь предагрегированных данных, вы теряете часть преимуществ системы. - - -==AggregatingMergeTree== - -Отличается от MergeTree тем, что при слиянии, выполняет объединение состояний агрегатных функций, хранимых в таблице, для строчек с одинаковым значением первичного ключа. - -Чтобы это работало, используются: тип данных AggregateFunction, а также модификаторы -State и -Merge для агрегатных функций. Рассмотрим подробнее. - -Существует тип данных AggregateFunction. Это параметрический тип данных. В качестве параметров передаются: имя агрегатной функции, затем типы её аргументов. -Примеры: - -%%CREATE TABLE t -( - column1 AggregateFunction(uniq, UInt64), - column2 AggregateFunction(anyIf, String, UInt8), - column3 AggregateFunction(quantiles(0.5, 0.9), UInt64) -) ENGINE = ... -%% - -Столбец такого типа хранит состояние агрегатной функции. - -Чтобы получить значение такого типа, следует использовать агрегатные функции с суффиксом State. -Пример: uniqState(UserID), quantilesState(0.5, 0.9)(SendTiming) - в отличие от соответствующих функций uniq, quantiles, такие функции возвращают не готовое значение, а состояние. То есть, значение типа AggregateFunction. - -Значение типа AggregateFunction нельзя вывести в Pretty-форматах. В других форматах, значения такого типа выводятся в виде implementation-specific бинарных данных. То есть, значения типа AggregateFunction не предназначены для вывода, сохранения в дамп. - -Единственную полезную вещь, которую можно сделать со значениями типа AggregateFunction - это объединить состояния и получить результат, по сути - доагрегировать до конца. Для этого используются агрегатные функции с суффиксом Merge. -Пример: uniqMerge(UserIDState), где UserIDState имеет тип AggregateFunction. - -То есть, агрегатная функция с суффиксом Merge берёт множество состояний, объединяет их, и возвращает готовый результат. -Для примера, эти два запроса возвращают один и тот же результат: - -%%SELECT uniq(UserID) FROM table%% - -%%SELECT uniqMerge(state) FROM (SELECT uniqState(UserID) AS state FROM table GROUP BY RegionID)%% - -Существует движок AggregatingMergeTree. 
Он занимается тем, что при слияниях, выполняет объединение состояний агрегатных функций из разных строчек таблицы с одним значением первичного ключа. - -В таблицу, содержащую столбцы типа AggregateFunction невозможно вставить строчку обычным запросом INSERT, так как невозможно явно указать значение типа AggregateFunction. Вместо этого, для вставки данных, следует использовать INSERT SELECT с агрегатными функциями -State. - -При SELECT-е из таблицы AggregatingMergeTree, используйте GROUP BY и агрегатные функции с модификатором -Merge, чтобы доагрегировать данные. - -Таблицы типа AggregatingMergeTree могут использоваться для инкрементальной агрегации данных, в том числе, для агрегирующих материализованных представлений. - -Пример: -Создаём материализованное представление типа AggregatingMergeTree, следящее за таблицей test.visits: - -%% -CREATE MATERIALIZED VIEW test.basic -ENGINE = AggregatingMergeTree(StartDate, (CounterID, StartDate), 8192) -AS SELECT - CounterID, - StartDate, - sumState(Sign) AS Visits, - uniqState(UserID) AS Users -FROM test.visits -GROUP BY CounterID, StartDate; -%% - -Вставляем данные в таблицу test.visits. Данные будут также вставлены в представление, где они будут агрегированы: - -%% -INSERT INTO test.visits ... -%% - -Делаем SELECT из представления, используя GROUP BY, чтобы доагрегировать данные: - -%% -SELECT - StartDate, - sumMerge(Visits) AS Visits, - uniqMerge(Users) AS Users -FROM test.basic -GROUP BY StartDate -ORDER BY StartDate; -%% - -Вы можете создать такое материализованное представление и навесить на него обычное представление, выполняющее доагрегацию данных. - -Заметим, что в большинстве случаев, использование AggregatingMergeTree является неоправданным, так как можно достаточно эффективно выполнять запросы по неагрегированным данных. - - -==ReplacingMergeTree== - -Движок таблиц отличается от MergeTree тем, что выполняет удаление дублирующихся записей с одинаковым значением первичного ключа. - -Последний, необязательный параметр движка таблицы - столбец с "версией". При слиянии, для всех строк с одинаковым значением первичного ключа, оставляет только одну строку: если задан столбец версии - строку с максимальной версией, иначе - последнюю строку. - -Столбец с версией должен иметь тип из семейства UInt, либо Date или DateTime. - -%%ReplacingMergeTree(EventDate, (OrderID, EventDate, BannerID, ...), 8192, ver)%% - -Обратите внимание, что дедупликация данных производится лишь во время слияний. Слияние происходят в фоне, в неизвестный момент времени, на который вы не можете ориентироваться. Некоторая часть данных может так и остаться необработанной. Хотя вы можете вызвать внеочередное слияние с помощью запроса OPTIMIZE, на это не стоит рассчитывать, так как запрос OPTIMIZE приводит к чтению и записи большого объёма данных. - -Таким образом, ReplacingMergeTree подходит для фоновой чистки дублирующихся данных в целях экономии места, но не даёт гарантий отсутствия дубликатов. - -Движок не используется в Яндекс.Метрике, но нашёл своё применение в других отделах Яндекса. - - -==Null== - -При записи в таблицу типа Null, данные игнорируются. При чтении из таблицы типа Null, возвращается пустота. - -Тем не менее, есть возможность создать материализованное представление над таблицей типа Null. Тогда данные, записываемые в таблицу, будут попадать в представление. - - -==View== - -Используется для реализации представлений (подробнее см. запрос CREATE VIEW). Не хранит данные, а хранит только указанный запрос SELECT. 
При чтении из таблицы, выполняет его (с удалением из запроса всех ненужных столбцов). - - -==MaterializedView== - -Используется для реализации материализованных представлений (подробнее см. запрос CREATE MATERIALIZED VIEW). Для хранения данных, использует другой движок, который был указан при создании представления. При чтении из таблицы, просто использует этот движок. - - -==Set== - -Представляет собой множество, постоянно находящееся в оперативке. Предназначено для использования в правой части оператора IN (смотрите раздел "Операторы IN"). - -В таблицу можно вставлять данные INSERT-ом - будут добавлены новые элементы в множество, с игнорированием дубликатов. -Но из таблицы нельзя, непосредственно, делать SELECT. Единственная возможность чтения - использование в правой части оператора IN. - -Данные постоянно находятся в оперативке. При INSERT-е, в директорию таблицы на диске, также пишутся блоки вставленных данных. При запуске сервера, эти данные считываются в оперативку. То есть, после перезапуска, данные остаются на месте. - -При грубом перезапуске сервера, блок данных на диске может быть потерян или повреждён. В последнем случае, может потребоваться вручную удалить файл с повреждёнными данными. - - -==Join== - -Представляет собой подготовленную структуру данных для JOIN-а, постоянно находящуюся в оперативке. - -%%Join(ANY|ALL, LEFT|INNER, k1[, k2, ...])%% - -Параметры движка: ANY|ALL - строгость, LEFT|INNER - тип. Эти параметры (задаются без кавычек) должны соответствовать тому JOIN-у, для которого будет использоваться таблица. k1, k2, ... - ключевые столбцы из секции USING, по которым будет делаться соединение. - -Таблица не может использоваться для GLOBAL JOIN-ов. - -В таблицу можно вставлять данные INSERT-ом, аналогично движку Set. В случае ANY, данные для дублирующихся ключей будут проигнорированы; в случае ALL - будут учитываться. Из таблицы нельзя, непосредственно, делать SELECT. Единственная возможность чтения - использование в качестве "правой" таблицы для JOIN. - -Хранение данных на диске аналогично движку Set. - - -==Buffer== - -Буферизует записываемые данные в оперативке, периодически сбрасывая их в другую таблицу. При чтении, производится чтение данных одновременно из буфера и из другой таблицы. - -%%Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes)%% - -Параметры движка: -database, table - таблица, в которую сбрасывать данные. Вместо имени базы данных может использоваться константное выражение, возвращающее строку. -num_layers - уровень параллелизма. Физически таблица будет представлена в виде num_layers независимых буферов. Рекомендуемое значение - 16. -min_time, max_time, min_rows, max_rows, min_bytes, max_bytes - условия для сброса данных из буфера. - -Данные сбрасываются из буфера и записываются в таблицу назначения, если выполнены все min-условия или хотя бы одно max-условие. -min_time, max_time - условие на время в секундах от момента первой записи в буфер; -min_rows, max_rows - условие на количество строк в буфере; -min_bytes, max_bytes - условие на количество байт в буфере. - -При записи, данные вставляются в случайный из num_layers буферов. Или, если размер куска вставляемых данных достаточно большой (больше max_rows или max_bytes), то он записывается в таблицу назначения минуя буфер. - -Условия для сброса данных учитываются отдельно для каждого из num_layers буферов. Например, если num_layers = 16 и max_bytes = 100000000, то максимальный расход оперативки будет 1.6 GB. 
- -Пример: - -%%CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10, 100, 10000, 1000000, 10000000, 100000000)%% - -Создаём таблицу merge.hits_buffer такой же структуры как merge.hits и движком Buffer. При записи в эту таблицу, данные буферизуются в оперативке и, в дальнейшем, записываются в таблицу merge.hits. Создаётся 16 буферов. Данные, имеющиеся в каждом из них будут сбрасываться, если прошло сто секунд, или записан миллион строк, или записано сто мегабайт данных; или если одновременно прошло десять секунд и записано десять тысяч строк и записано десять мегабайт данных. Для примера, если записана всего лишь одна строка, то через сто секунд она будет сброшена в любом случае. А если записано много строк, то они будут сброшены раньше. - -При остановке сервера, при DROP TABLE или DETACH TABLE, данные из буфера тоже сбрасываются в таблицу назначения. - -В качестве имени базы данных и имени таблицы можно указать пустые строки в одинарных кавычках. Это обозначает отсутствие таблицы назначения. В таком случае, при достижении условий на сброс данных, буфер будет просто очищаться. Это может быть полезным, чтобы хранить в оперативке некоторое окно данных. - -При чтении из таблицы типа Buffer, будут обработаны данные, как находящиеся в буфере, так и данные из таблицы назначения (если такая есть). -Но следует иметь ввиду, что таблица Buffer не поддерживает индекс. То есть, данные в буфере будут просканированы полностью, что может быть медленно для буферов большого размера. (Для данных в подчинённой таблице, будет использоваться тот индекс, который она поддерживает.) - -Если множество столбцов таблицы Buffer не совпадает с множеством столбцов подчинённой таблицы, то будут вставлено подмножество столбцов, которое присутствует в обеих таблицах. - -Если у одного из столбцов таблицы Buffer и подчинённой таблицы не совпадает тип, то в лог сервера будет записано сообщение об ошибке и буфер будет очищен. -То же самое происходит, если подчинённая таблица не существует в момент сброса буфера. - -Если есть необходимость выполнить ALTER для подчинённой таблицы и для таблицы Buffer, то рекомендуется удалить таблицу Buffer, затем выполнить ALTER подчинённой таблицы, а затем создать таблицу Buffer заново. - -При нештатном перезапуске сервера, данные, находящиеся в буфере, будут потеряны. - -Для таблиц типа Buffer неправильно работают PREWHERE, FINAL и SAMPLE. Эти условия пробрасываются в таблицу назначения, но не используются для обработки данных в буфере. В связи с этим, рекомендуется использовать таблицу типа Buffer только для записи, а читать из таблицы назначения. - -При добавлении данных в Buffer, один из буферов блокируется. Это приводит к задержкам, если одновременно делается чтение из таблицы. - -Данные, вставляемые в таблицу Buffer, попадают в подчинённую таблицу в порядке, возможно отличающимся от порядка вставки, и блоками, возможно отличающимися от вставленных блоков. В связи с этим, трудно корректно использовать таблицу типа Buffer для записи в CollapsingMergeTree. Чтобы избежать проблемы, можно выставить num_layers в 1. - -Если таблица назначения является реплицируемой, то при записи в таблицу Buffer будут потеряны некоторые ожидаемые свойства реплицируемых таблиц. Из-за произвольного изменения порядка строк и размеров блоков данных, перестаёт работать дедупликация данных, в результате чего исчезает возможность надёжной exactly once записи в реплицируемые таблицы. 
- -В связи с этими недостатками, таблицы типа Buffer можно рекомендовать к применению лишь в очень редких случаях. - -Таблицы типа Buffer используются в тех случаях, когда от большого количества серверов поступает слишком много INSERT-ов в единицу времени, и нет возможности заранее самостоятельно буферизовать данные перед вставкой, в результате чего, INSERT-ы не успевают выполняться. - -Заметим, что даже для таблиц типа Buffer не имеет смысла вставлять данные по одной строке, так как таким образом будет достигнута скорость всего лишь в несколько тысяч строк в секунду, тогда как при вставке более крупными блоками, достижимо более миллиона строк в секунду (смотрите раздел "Производительность"). - - -==Репликация данных== - -===ReplicatedMergeTree=== -===ReplicatedCollapsingMergeTree=== -===ReplicatedAggregatingMergeTree=== -===ReplicatedSummingMergeTree=== - -Репликация поддерживается только для таблиц семейства MergeTree. Репликация работает на уровне отдельных таблиц, а не всего сервера. То есть, на сервере могут быть расположены одновременно реплицируемые и не реплицируемые таблицы. - -Реплицируются INSERT, ALTER (см. подробности в описании запроса ALTER). Реплицируются сжатые данные, а не тексты запросов. -Запросы CREATE, DROP, ATTACH, DETACH, RENAME не реплицируются - то есть, относятся к одному серверу. Запрос CREATE TABLE создаёт новую реплицируемую таблицу на том сервере, где выполняется запрос; а если на других серверах такая таблица уже есть - добавляет новую реплику. Запрос DROP TABLE удаляет реплику, расположенную на том сервере, где выполняется запрос. Запрос RENAME переименовывает таблицу на одной из реплик - то есть, реплицируемые таблицы на разных репликах могут называться по разному. - -Репликация никак не связана с шардированием. На каждом шарде репликация работает независимо. - -Репликация является опциональной возможностью. Для использования репликации, укажите в конфигурационном файле адреса ZooKeeper кластера. Пример: - -%% -<zookeeper> - <node index="1"> - <host>example1</host> - <port>2181</port> - </node> - <node index="2"> - <host>example2</host> - <port>2181</port> - </node> - <node index="3"> - <host>example3</host> - <port>2181</port> - </node> -</zookeeper> -%% - -Используйте версию ZooKeeper не старее 3.4.5. Для примера, в Ubuntu Precise слишком старая версия в пакете. - -Можно указать любой имеющийся у вас ZooKeeper-кластер - система будет использовать в нём одну директорию для своих данных (директория указывается при создании реплицируемой таблицы). - -Если в конфигурационном файле не настроен ZooKeeper, то вы не сможете создать реплицируемые таблицы, а уже имеющиеся реплицируемые таблицы будут доступны в режиме только на чтение. - -При запросах SELECT, ZooKeeper не используется. То есть, репликация никак не влияет на производительность SELECT-ов - запросы работают так же быстро, как и для нереплицируемых таблиц. - -При каждом запросе INSERT (точнее, на каждый вставляемый блок данных; запрос INSERT содержит один блок, или по блоку на каждые max_insert_block_size = 1048576 строк), делается около десятка записей в ZooKeeper в рамках нескольких транзакций. Это приводит к некоторому увеличению задержек при INSERT-е, по сравнению с нереплицируемыми таблицами. Но если придерживаться обычных рекомендаций - вставлять данные пачками не более одного INSERT-а в секунду, то это не составляет проблем. На всём кластере ClickHouse, использующим для координации один кластер ZooKeeper, может быть в совокупности несколько сотен INSERT-ов в секунду. 
Пропускная способность при вставке данных (количество строчек в секунду) такая же высокая, как для нереплицируемых таблиц. - -Для очень больших кластеров, можно использовать разные кластеры ZooKeeper для разных шардов. Впрочем, на кластере Яндекс.Метрики (примерно 300 серверов) такой необходимости не возникает. - -Репликация асинхронная, мульти-мастер. Запросы INSERT (а также ALTER) можно отправлять на любой доступный сервер. Данные вставятся на этот сервер, а затем приедут на остальные серверы. В связи с асинхронностью, только что вставленные данные, появляются на остальных репликах с небольшой задержкой. Если часть реплик недоступна - данные на них запишутся тогда, когда они станут доступны. Если реплика доступна, то задержка составляет столько времени, сколько требуется для передачи блока сжатых данных по сети. - -Кворумная запись отсутствует. То есть, вы не можете записать данные с подтверждением их получения более одной репликой. Если вы записали пачку данных на одну реплику, и данные ещё не успели разъехаться по остальным репликам, после чего сервер с этими данными перестал существовать, то эта пачка данных будет потеряна. - -Каждый блок данных записывается атомарно. Запрос INSERT разбивается на блоки данных размером до max_insert_block_size = 1048576 строк. То есть, если в запросе INSERT менее 1048576 строк, то он делается атомарно. - -Блоки данных дедуплицируются. При многократной записи одного и того же блока данных (блоков данных одинакового размера, содержащих одни и те же строчки в одном и том же порядке), блок будет записан только один раз. Это сделано для того, чтобы в случае сбоя в сети, когда клиентское приложение не может понять, были ли данные записаны в БД, можно было просто повторить запрос INSERT. При этом не имеет значения, на какую реплику будут отправлены INSERT-ы с одинаковыми данными. То есть, обеспечивается идемпотентность INSERT-ов. Это работает только для последних 100 вставленных в таблицу блоков. - -При репликации, по сети передаются только исходные вставляемые данные. Дальнейшие преобразования данных (слияния) координируются и делаются на всех репликах одинаковым образом. За счёт этого минимизируется использование сети, и благодаря этому, репликация хорошо работает при расположении реплик в разных датацентрах. (Стоит заметить, что дублирование данных в разных датацентрах, по сути, является основной задачей репликации). - -Количество реплик одних и тех же данных может быть произвольным. В Яндекс.Метрике в продакшене используется двухкратная репликация. На каждом сервере используется RAID-5 или RAID-6, в некоторых случаях RAID-10. Это является сравнительно надёжным и удобным для эксплуатации решением. - -Система следит за синхронностью данных на репликах и умеет восстанавливаться после сбоя. Восстановление после сбоя автоматическое (в случае небольших различий в данных) или полуавтоматическое (когда данные отличаются слишком сильно, что может свидетельствовать об ошибке конфигурации). - - -===Создание реплицируемых таблиц=== - -В начало имени движка таблицы добавляется Replicated. Например, ReplicatedMergeTree. - -Также добавляются два параметра в начало списка параметров - путь к таблице в ZooKeeper, имя реплики в ZooKeeper. - -Пример: -ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/hits', '{replica}', EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID), EventTime), 8192) - -Как видно в примере, эти параметры могут содержать подстановки в фигурных скобках. 
Подставляемые значения достаются из конфигурационного файла, из секции macros. Пример: - -%% -<macros> - <layer>05</layer> - <shard>02</shard> - <replica>example05-02-1.yandex.ru</replica> -</macros> -%% - -Путь к таблице в ZooKeeper должен быть разным для каждой реплицируемой таблицы. В том числе, для таблиц на разных шардах, должны быть разные пути. -В данном случае, путь состоит из следующих частей: - -%%/clickhouse/tables/%% - общий префикс. Рекомендуется использовать именно его. - -%%{layer}-{shard}%% - идентификатор шарда. В данном примере он состоит из двух частей, так как на кластере Яндекс.Метрики используется двухуровневое шардирование. Для большинства задач, оставьте только подстановку {shard}, которая будет раскрываться в идентификатор шарда. - -%%hits%% - имя узла для таблицы в ZooKeeper. Разумно делать его таким же, как имя таблицы. Оно указывается явно, так как, в отличие от имени таблицы, оно не меняется после запроса RENAME. - -Имя реплики - то, что идентифицирует разные реплики одной и той же таблицы. Можно использовать для него имя сервера, как показано в примере. Впрочем, достаточно, чтобы имя было уникально лишь в пределах каждого шарда. - -Можно не использовать подстановки, а прописать всё явно. Это может быть удобным для тестирования и при настройке маленьких кластеров, но менее удобным при работе с большими кластерами. - -Выполните запрос CREATE TABLE на каждой реплике. Запрос создаёт новую реплицируемую таблицу, или добавляет новую реплику к имеющимся. - -Если вы добавляете новую реплику после того, как таблица на других репликах уже содержит некоторые данные, то после выполнения запроса, данные на новую реплику будут скачаны с других реплик. То есть, новая реплика синхронизирует себя с остальными. - -Для удаления реплики, выполните запрос DROP TABLE. При этом, удаляется только одна реплика - расположенная на том сервере, где вы выполняете запрос. - - -===Восстановление после сбоя=== - -Если при старте сервера, недоступен ZooKeeper, реплицируемые таблицы переходят в режим только для чтения. Система будет пытаться периодически установить соединение с ZooKeeper. - -Если при INSERT-е недоступен ZooKeeper, или происходит ошибка при взаимодействии с ним, будет выкинуто исключение. - -При подключении к ZooKeeper, система проверяет соответствие между имеющимся в локальной файловой системе набором данных и ожидаемым набором данных (информация о котором хранится в ZooKeeper). Если имеются небольшие несоответствия, то система устраняет их, синхронизируя данные с реплик. - -Обнаруженные битые куски данных (с файлами несоответствующего размера) или неизвестные куски (куски, записанные в файловую систему, но информация о которых не была записана в ZooKeeper) переносятся в поддиректорию detached (не удаляются). Недостающие куски скачиваются с реплик. - -Стоит заметить, что ClickHouse не делает самостоятельно никаких деструктивных действий типа автоматического удаления большого количества данных. - -При старте сервера (или создании новой сессии с ZooKeeper), проверяется только количество и размеры всех файлов. Если у файлов совпадают размеры, но изменены байты где-то посередине, то это обнаруживается не сразу, а только при попытке их прочитать при каком-либо запросе SELECT - запрос кинет исключение о несоответствующей чексумме или размере сжатого блока. В этом случае, куски данных добавляются в очередь на проверку, и при необходимости, скачиваются с реплик. 
- -Если обнаруживается, что локальный набор данных слишком сильно отличается от ожидаемого, то срабатывает защитный механизм - сервер сообщает об этом в лог и отказывается запускаться. Это сделано, так как такой случай может свидетельствовать об ошибке конфигурации - например, если реплика одного шарда была случайно сконфигурирована, как реплика другого шарда. Тем не менее, пороги защитного механизма поставлены довольно низкими, и такая ситуация может возникнуть и при обычном восстановлении после сбоя. В этом случае, восстановление делается полуавтоматически - "по кнопке". - -Для запуска восстановления, создайте в ZooKeeper узел /path_to_table/replica_name/flags/force_restore_data с любым содержимым или выполните команду для восстановления всех реплицируемых таблиц: -%%sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data%% -Затем запустите сервер. При старте, сервер удалит эти флаги и запустит восстановление. - - -===Восстановление в случае потери всех данных=== - -Если на одном из серверов исчезли все данные и метаданные, восстановление делается следующим образом: - -1. Установите на сервер ClickHouse. Корректно пропишите подстановки в конфигурационном файле, отвечающие за идентификатор шарда и реплики, если вы их используете. - -2. Если у вас были нереплицируемые таблицы, которые должны быть вручную продублированы на серверах, скопируйте их данные (в директории /var/lib/clickhouse/data/db_name/table_name/) с реплики. - -3. Скопируйте с реплики определения таблиц, находящиеся в %%/var/lib/clickhouse/metadata/%%. Если в определениях таблиц, идентификатор шарда или реплики, прописаны в явном виде - исправьте их, чтобы они соответствовали данной реплике. -(Альтернативный вариант - запустить сервер и сделать самостоятельно все запросы ATTACH TABLE, которые должны были бы быть в соответствующих .sql файлах в %%/var/lib/clickhouse/metadata/%%.) - -4. Создайте в ZooKeeper узел /path_to_table/replica_name/flags/force_restore_data с любым содержимым или выполните команду для восстановления всех реплицируемых таблиц: -%%sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data%% -Затем запустите сервер (перезапустите, если уже запущен). Данные будут скачаны с реплик. - -В качестве альтернативного варианта восстановления, вы можете удалить из ZooKeeper информацию о потерянной реплике - /path_to_table/replica_name, и затем создать реплику заново, как написано в разделе "Создание реплицируемых таблиц". - -Отсутствует ограничение на использование сетевой полосы при восстановлении. Имейте это ввиду, если восстанавливаете сразу много реплик. - - -===Преобразование из MergeTree в ReplicatedMergeTree=== - -Здесь и далее, под MergeTree подразумеваются все движки таблиц семейства MergeTree, так же для ReplicatedMergeTree. - -Если у вас была таблица типа MergeTree, репликация которой делалась вручную, вы можете преобразовать её в реплицируемую таблицу. Это может понадобиться лишь в случаях, когда вы уже успели накопить большое количество данных в таблице типа MergeTree, а сейчас хотите включить репликацию. - -Если на разных репликах данные отличаются, то сначала синхронизируйте их, либо удалите эти данные на всех репликах кроме одной. - -Переименуйте имеющуюся MergeTree таблицу, затем создайте со старым именем таблицу типа ReplicatedMergeTree. -Перенесите данные из старой таблицы в поддиректорию detached в директории с данными новой таблицы (/var/lib/clickhouse/data/db_name/table_name/). 
-Затем добавьте эти куски данных в рабочий набор с помощью выполнения запросов ALTER TABLE ATTACH PART на одной из реплик. - -Если на остальных репликах есть точно такие же куски, они будут добавлены в рабочий набор на них. Если нет - куски будут скачаны с той реплики, где они есть. - - -===Преобразование из ReplicatedMergeTree в MergeTree=== - -Создайте таблицу типа MergeTree с другим именем. Перенесите в её директорию с данными все данные из директории с данными таблицы типа ReplicatedMergeTree. Затем удалите таблицу типа ReplicatedMergeTree и перезапустите сервер. - -Если вы хотите избавиться от таблицы ReplicatedMergeTree, не запуская сервер, то -- удалите соответствующий файл .sql в директории с метаданными (%%/var/lib/clickhouse/metadata/%%); -- удалите соответствующий путь в ZooKeeper (/path_to_table/replica_name); -После этого, вы можете запустить сервер, создать таблицу типа MergeTree, перенести данные в её директорию, и перезапустить сервер. - - -===Восстановление в случае потери или повреждения метаданных на ZooKeeper кластере=== - -Если данные в ZooKeeper оказались утеряны или повреждены, то вы можете сохранить данные, переместив их в нереплицируемую таблицу, как описано в пункте выше. - - -
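-
-Для шага "добавьте эти куски данных в рабочий набор" из описанного выше преобразования MergeTree в ReplicatedMergeTree, запрос может выглядеть примерно так (имена базы данных, таблицы и имя куска - условные):
-
-%%ALTER TABLE db_name.table_name ATTACH PART '20140317_20140323_2_2_0'%%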
    -
    -

    Системные таблицы

    -
    -
    - -Системные таблицы используются для реализации части функциональности системы, а также предоставляют доступ к информации о работе системы. -Вы не можете удалить системную таблицу (хотя можете сделать DETACH). -Для системных таблиц нет файлов с данными на диске и файлов с метаданными. Сервер создаёт все системные таблицы при старте. -В системные таблицы нельзя записывать данные - можно только читать. -Системные таблицы расположены в базе данных system. - -==system.one== - -Таблица содержит одну строку с одним столбцом dummy типа UInt8, содержащим значение 0. -Эта таблица используется, если в SELECT запросе не указана секция FROM. -То есть, это - аналог таблицы DUAL, которую можно найти в других СУБД. - -==system.numbers== - -Таблица содержит один столбец с именем number типа UInt64, содержащим почти все натуральные числа, начиная с нуля. -Эту таблицу можно использовать для тестов, а также если вам нужно сделать перебор. -Чтения из этой таблицы не распараллеливаются. - -==system.numbers_mt== - -То же самое, что и system.numbers, но чтение распараллеливается. Числа могут возвращаться в произвольном порядке. -Используется для тестов. - -==system.tables== - -Таблица содержит столбцы database, name, engine типа String и столбец metadata_modification_time типа DateTime. -Для каждой таблицы, о которой знает сервер, будет присутствовать соответствующая запись в таблице system.tables. -Недоработка: Движки таблиц (engine) указаны без параметров. -Эта системная таблица используется для реализации запросов SHOW TABLES. - -==system.databases== - -Таблица содержит один столбец name типа String - имя базы данных. -Для каждой базы данных, о которой знает сервер, будет присутствовать соответствующая запись в таблице. -Эта системная таблица используется для реализации запроса SHOW DATABASES. - -==system.processes== - -Эта системная таблица используется для реализации запроса SHOW PROCESSLIST. -Столбцы: - -%% -user String - имя пользователя, который задал запрос. При распределённой обработке запроса, относится к пользователю, с помощью которого сервер-инициатор запроса отправил запрос на данный сервер, а не к имени пользователя, который задал распределённый запрос на сервер-инициатор запроса. - -address String - IP-адрес, с которого задан запрос. При распределённой обработке запроса, аналогично. - -elapsed Float64 - время в секундах, прошедшее от начала выполнения запроса. - -rows_read UInt64 - количество прочитанных из таблиц строк. При распределённой обработке запроса, на сервере-инициаторе запроса, представляет собой сумму по всем удалённым серверам. - -bytes_read UInt64 - количество прочитанных из таблиц байт, в несжатом виде. При распределённой обработке запроса, на сервере-инициаторе запроса, представляет собой сумму по всем удалённым серверам. - -total_rows_approx UInt64 - приблизительная оценка общего количества строк, которые должны быть прочитаны. При распределённой обработке запроса, на сервере-инициаторе запроса, представляет собой сумму по всем удалённым серверам. Может обновляться в процессе выполнения запроса, когда становятся известны новые источники для обработки. - -memory_usage UInt64 - потребление памяти запросом. Может не учитывать некоторые виды выделенной памяти. - -query String - текст запроса. В случае INSERT - без данных для INSERT-а. - -query_id String - идентификатор запроса, если был задан. -%% - -==system.events== - -Содержит информацию о количестве произошедших в системе событий, для профилирования и мониторинга. -Пример: количество обработанных запросов типа SELECT. 
-Столбцы: event String - имя события, value UInt64 - количество. - -==system.metrics== -==system.asynchronous_metrics== - -Содержат метрики, используемые для профилирования и мониторинга. -Обычно отражают количество событий, происходящих в данный момент в системе, или ресурсов, суммарно потребляемых системой. -Пример: количество запросов типа SELECT, исполняемых в текущий момент; количество потребляемой памяти. -system.asynchronous_metrics и system.metrics отличаются набором и способом вычисления метрик. - -==system.clusters== - -Содержит информацию о доступных в конфигурационном файле кластерах и серверах, которые в них входят. -Столбцы: - -%% -cluster String - имя кластера -shard_num UInt32 - номер шарда в кластере, начиная с 1 -shard_weight UInt32 - относительный вес шарда при записи данных -replica_num UInt32 - номер реплики в шарде, начиная с 1 -host_name String - имя хоста, как прописано в конфиге -host_address String - IP-адрес хоста, полученный из DNS -port UInt16 - порт, на который обращаться для соединения с сервером -user String - имя пользователя, которого использовать для соединения с сервером -%% - -==system.columns== - -Содержит информацию о столбцах всех таблиц. -С помощью этой таблицы можно получить информацию аналогично запросу DESCRIBE TABLE, но для многих таблиц сразу. - -%% -database String - имя базы данных, в которой находится таблица -table String - имя таблицы -name String - имя столбца -type String - тип столбца -default_type String - тип (DEFAULT, MATERIALIZED, ALIAS) выражения для значения по умолчанию, или пустая строка, если оно не описано -default_expression String - выражение для значения по умолчанию, или пустая строка, если оно не описано -%% - -==system.dictionaries== - -Содержит информацию о внешних словарях. -Столбцы: - -%% -name String - имя словаря -type String - тип словаря: Flat, Hashed, Cache -origin String - путь к конфигурационному файлу, в котором описан словарь -attribute.names Array(String) - массив имён атрибутов, предоставляемых словарём -attribute.types Array(String) - соответствующий массив типов атрибутов, предоставляемых словарём -has_hierarchy UInt8 - является ли словарь иерархическим -bytes_allocated UInt64 - количество оперативной памяти, которое использует словарь -hit_rate Float64 - для cache-словарей - доля использований, для которых значение было в кэше -element_count UInt64 - количество хранящихся в словаре элементов -load_factor Float64 - доля заполненности словаря (для hashed словаря - доля заполнения хэш-таблицы) -creation_time DateTime - время создания или последней успешной перезагрузки словаря -last_exception String - текст ошибки, возникшей при создании или перезагрузке словаря, если словарь не удалось создать -source String - текст, описывающий источник данных для словаря -%% - -Заметим, что количество оперативной памяти, которое использует словарь, не является пропорциональным количеству элементов, хранящихся в словаре. Так, для flat и cached словарей, все ячейки памяти выделяются заранее, независимо от реальной заполненности словаря. - - -==system.functions== - -Содержит информацию об обычных и агрегатных функциях. -Столбцы: - -%% -name String - имя функции -is_aggregate UInt8 - является ли функция агрегатной -%% - -==system.merges== - -Содержит информацию о производящихся прямо сейчас слияниях для таблиц семейства MergeTree. 
-Столбцы: - -%% -database String - имя базы данных, в которой находится таблица -table String - имя таблицы -elapsed Float64 - время в секундах, прошедшее от начала выполнения слияния -progress Float64 - доля выполненной работы от 0 до 1 -num_parts UInt64 - количество сливаемых кусков -result_part_name String - имя куска, который будет образован в результате слияния -total_size_bytes_compressed UInt64 - суммарный размер сжатых данных сливаемых кусков -total_size_marks UInt64 - суммарное количество засечек в сливаемых кусках -bytes_read_uncompressed UInt64 - количество прочитанных байт, разжатых -rows_read UInt64 - количество прочитанных строк -bytes_written_uncompressed UInt64 - количество записанных байт, несжатых -rows_written UInt64 - количество записанных строк -%% - -==system.parts== - -Содержит информацию о кусках таблиц семейства MergeTree. -Столбцы: - -%% -database String - имя базы данных, в которой находится таблица, к которой относится кусок -table String - имя таблицы, к которой относится кусок -engine String - имя движка таблицы, без параметров -partition String - имя партиции - имеет формат YYYYMM -name String - имя куска -replicated UInt8 - относится ли кусок к реплицируемым данным -active UInt8 - используется ли кусок в таблице, или же он уже не нужен и скоро будет удалён - неактивные куски остаются после слияния -marks UInt64 - количество засечек - умножьте на гранулированность индекса (обычно 8192), чтобы получить примерное количество строк в куске -bytes UInt64 - количество байт в сжатом виде -modification_time DateTime - время модификации директории с куском - обычно соответствует времени создания куска -remove_time DateTime - только для неактивных кусков - время, когда кусок стал неактивным -refcount UInt32 - количество мест, в котором кусок используется - значение больше 2 говорит о том, что этот кусок участвует в запросах или в слияниях -%% - -==system.replicas== - -Содержит информацию и статус для реплицируемых таблиц, расположенных на локальном сервере. -Эту таблицу можно использовать для мониторинга. Таблица содержит по строчке для каждой Replicated*-таблицы. - -Пример: - -%% -SELECT * -FROM system.replicas -WHERE table = 'visits' -FORMAT Vertical - -Row 1: -────── -database: merge -table: visits -engine: ReplicatedCollapsingMergeTree -is_leader: 1 -is_readonly: 0 -is_session_expired: 0 -future_parts: 1 -parts_to_check: 0 -zookeeper_path: /clickhouse/tables/01-06/visits -replica_name: example01-06-1.yandex.ru -replica_path: /clickhouse/tables/01-06/visits/replicas/example01-06-1.yandex.ru -columns_version: 9 -queue_size: 1 -inserts_in_queue: 0 -merges_in_queue: 1 -log_max_index: 596273 -log_pointer: 596274 -total_replicas: 2 -active_replicas: 2 -%% - -Столбцы: - -%% -database: имя БД -table: имя таблицы -engine: имя движка таблицы - -is_leader: является ли реплика лидером -В один момент времени, не более одной из реплик является лидером. Лидер отвечает за выбор фоновых слияний, которые следует произвести. -Замечу, что запись можно осуществлять на любую реплику (доступную и имеющую сессию в ZK), независимо от лидерства. - -is_readonly: находится ли реплика в режиме "только для чтения" -Этот режим включается, если в конфиге нет секции с ZK; если при переинициализации сессии в ZK произошла неизвестная ошибка; во время переинициализации сессии с ZK. - -is_session_expired: истекла ли сессия с ZK. -В основном, то же самое, что и is_readonly. 
- -future_parts: количество кусков с данными, которые появятся в результате INSERT-ов или слияний, которых ещё предстоит сделать - -parts_to_check: количество кусков с данными в очереди на проверку -Кусок помещается в очередь на проверку, если есть подозрение, что он может быть битым. - -zookeeper_path: путь к данным таблицы в ZK -replica_name: имя реплики в ZK; разные реплики одной таблицы имеют разное имя -replica_path: путь к данным реплики в ZK. То же самое, что конкатенация zookeeper_path/replicas/replica_path. - -columns_version: номер версии структуры таблицы -Обозначает, сколько раз был сделан ALTER. Если на репликах разные версии, значит некоторые реплики сделали ещё не все ALTER-ы. - -queue_size: размер очереди действий, которых предстоит сделать -К действиям относятся вставки блоков данных, слияния, и некоторые другие действия. -Как правило, совпадает с future_parts. - -inserts_in_queue: количество вставок блоков данных, которых предстоит сделать -Обычно вставки должны быстро реплицироваться. Если величина большая - значит что-то не так. - -merges_in_queue: количество слияний, которых предстоит сделать -Бывают длинные слияния - то есть, это значение может быть больше нуля продолжительное время. - -Следующие 4 столбца имеют ненулевое значение только если активна сессия с ZK. - -log_max_index: максимальный номер записи в общем логе действий -log_pointer: максимальный номер записи из общего лога действий, которую реплика скопировала в свою очередь для выполнения, плюс единица -Если log_pointer сильно меньше log_max_index, значит что-то не так. - -total_replicas: общее число известных реплик этой таблицы -active_replicas: число реплик этой таблицы, имеющих сессию в ZK; то есть, число работающих реплик -%% - -Если запрашивать все столбцы, то таблица может работать слегка медленно, так как на каждую строчку делается несколько чтений из ZK. -Если не запрашивать последние 4 столбца (log_max_index, log_pointer, total_replicas, active_replicas), то таблица работает быстро. - -Например, так можно проверить, что всё хорошо: - -%% -SELECT - database, - table, - is_leader, - is_readonly, - is_session_expired, - future_parts, - parts_to_check, - columns_version, - queue_size, - inserts_in_queue, - merges_in_queue, - log_max_index, - log_pointer, - total_replicas, - active_replicas -FROM system.replicas -WHERE - is_readonly - OR is_session_expired - OR future_parts > 20 - OR parts_to_check > 10 - OR queue_size > 20 - OR inserts_in_queue > 10 - OR log_max_index - log_pointer > 10 - OR total_replicas < 2 - OR active_replicas < total_replicas -%% - -Если этот запрос ничего не возвращает - значит всё хорошо. - -==system.settings== - -Содержит информацию о настройках, используемых в данный момент. -То есть, используемых для выполнения запроса, с помощью которого вы читаете из таблицы system.settings. - -Столбцы: - -%% -name String - имя настройки -value String - значение настройки -changed UInt8 - была ли настройка явно задана в конфиге или изменена явным образом -%% - -Пример: - -%% -SELECT * -FROM system.settings -WHERE changed - -┌─name───────────────────┬─value───────┬─changed─┐ -│ max_threads │ 8 │ 1 │ -│ use_uncompressed_cache │ 0 │ 1 │ -│ load_balancing │ random │ 1 │ -│ max_memory_usage │ 10000000000 │ 1 │ -└────────────────────────┴─────────────┴─────────┘ -%% - - -==system.zookeeper== - -Позволяет читать данные из ZooKeeper кластера, описанного в конфигурации. 
-В запросе обязательно в секции WHERE должно присутствовать условие на равенство path - путь в ZooKeeper, для детей которого вы хотите получить данные. - -Запрос SELECT * FROM system.zookeeper WHERE path = '/clickhouse' выведет данные по всем детям узла /clickhouse. -Чтобы вывести данные по всем узлам в корне, напишите path = '/'. -Если узла, указанного в path не существует, то будет брошено исключение. - -Столбцы: - -%% -name String - имя узла -path String - путь к узлу -value String - значение узла -dataLength Int32 - размер значения -numChildren Int32 - количество детей -czxid Int64 - идентификатор транзакции, в которой узел был создан -mzxid Int64 - идентификатор транзакции, в которой узел был последний раз изменён -pzxid Int64 - идентификатор транзакции, последний раз удаливший или добавивший детей -ctime DateTime - время создания узла -mtime DateTime - время последней модификации узла -version Int32 - версия узла - количество раз, когда узел был изменён -cversion Int32 - количество добавлений или удалений детей -aversion Int32 - количество изменений ACL -ephemeralOwner Int64 - для эфемерных узлов - идентификатор сессии, которая владеет этим узлом -%% - -Пример: - -%% -SELECT * -FROM system.zookeeper -WHERE path = '/clickhouse/tables/01-08/visits/replicas' -FORMAT Vertical - -Row 1: -────── -name: example01-08-1.yandex.ru -value: -czxid: 932998691229 -mzxid: 932998691229 -ctime: 2015-03-27 16:49:51 -mtime: 2015-03-27 16:49:51 -version: 0 -cversion: 47 -aversion: 0 -ephemeralOwner: 0 -dataLength: 0 -numChildren: 7 -pzxid: 987021031383 -path: /clickhouse/tables/01-08/visits/replicas - -Row 2: -────── -name: example01-08-2.yandex.ru -value: -czxid: 933002738135 -mzxid: 933002738135 -ctime: 2015-03-27 16:57:01 -mtime: 2015-03-27 16:57:01 -version: 0 -cversion: 37 -aversion: 0 -ephemeralOwner: 0 -dataLength: 0 -numChildren: 7 -pzxid: 987021252247 -path: /clickhouse/tables/01-08/visits/replicas -%% - - - -
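-
-В качестве примера работы с описанными выше системными таблицами - примерный запрос к system.parts, показывающий количество активных кусков и их суммарный размер в сжатом виде по партициям одной таблицы (имя таблицы hits - условное):
-
-%%
-SELECT
-    partition,
-    count() AS parts,
-    sum(bytes) AS bytes_compressed
-FROM system.parts
-WHERE active AND table = 'hits' -- имя таблицы здесь условное
-GROUP BY partition
-ORDER BY partition
-%%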
    -
    -

    Табличные функции

    -
    -
    - -Табличные функции могут указываться в секции FROM вместо имени БД и таблицы. -Табличные функции можно использовать только если не выставлена настройка readonly. -Табличные функции не имеют отношения к другим функциям. - -==merge== - -%%merge(db_name, 'tables_regexp')%% - создаёт временную таблицу типа Merge. Подробнее смотрите раздел "Движки таблиц, Merge". -Структура таблицы берётся из первой попавшейся таблицы, подходящей под регулярное выражение. - -==remote== - -%%remote('addresses_expr', db, table[, 'user'[, 'password']])%% -или %%remote('addresses_expr', db.table[, 'user'[, 'password']])%% -- позволяет обратиться к удалённым серверам без создания таблицы типа Distributed. - -%%addresses_expr%% - выражение, генерирующее адреса удалённых серверов. - -Это может быть просто один адрес сервера. Адрес сервера - это хост:порт, или только хост. Хост может быть указан в виде имени сервера, или в виде IPv4 или IPv6 адреса. IPv6 адрес указывается в квадратных скобках. Порт - TCP-порт удалённого сервера. Если порт не указан, используется %%tcp_port%% из конфигурационного файла сервера (по умолчанию - 9000). - -Замечание: в качестве исключения, при указании IPv6-адреса, обязательно также указывать порт. - -Примеры: -%% -example01-01-1 -example01-01-1:9000 -localhost -127.0.0.1 -[::]:9000 -[2a02:6b8:0:1111::11]:9000%% - -Могут быть указаны адреса через запятую - в этом случае, запрос пойдёт на все указанные адреса (как на шарды с разными данными) и будет обработан распределённо. - -Пример: -%%example01-01-1,example01-02-1%% - -Часть выражения может быть указана в фигурных скобках. Предыдущий пример может быть записан следующим образом: -%%example01-0{1,2}-1%% - -В фигурных скобках может быть указан диапазон (неотрицательных целых) чисел через две точки. В этом случае, диапазон раскрывается в множество значений, генерирующих адреса шардов. Если запись первого числа начинается с нуля, то значения формируются с таким же выравниванием нулями. Предыдущий пример может быть записан следующим образом: -%%example01-{01..02}-1%% - -При наличии нескольких пар фигурных скобок, генерируется прямое произведение соответствующих множеств. - -Адреса или их фрагменты в фигурных скобках, могут быть указаны через символ |. В этом случае, соответствующие множества адресов понимаются как реплики - запрос будет отправлен на первую живую реплику. При этом, реплики перебираются в порядке, согласно текущей настройке load_balancing. Пример: - -%%example01-{01..02}-{1|2}%% - -В этом примере указано два шарда, в каждом из которых имеется две реплики. - -Количество генерируемых адресов ограничено некоторой константой - сейчас это 1000 штук. - -Использование табличной функции remote менее оптимально, чем создание таблицы типа Distributed, так как в этом случае, соединения с серверами устанавливаются заново при каждом запросе, в случае задания имён хостов, делается резолвинг имён, а также не ведётся подсчёт ошибок при работе с разными репликами. При обработке большого количества запросов, всегда создавайте Distributed таблицу заранее, не используйте табличную функцию remote. - -Табличная функция remote может быть полезна для следующих случаев: -- обращение на конкретный сервер в целях сравнения данных, отладки и тестирования; -- запросы между разными кластерами ClickHouse в целях исследований; -- нечастых распределённых запросов, задаваемых вручную; -- распределённых запросов, где набор серверов определяется каждый раз заново. - -Имя пользователя может быть не задано - тогда используется имя пользователя 'default'. 
-Пароль может быть не задан - тогда используется пустой пароль. - -
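To make the address patterns above concrete, here is a minimal sketch of a remote() call. The host pattern is the placeholder example from this section (not a real cluster), and test.hits is the sample table used elsewhere in this documentation; the query would be sent to both generated addresses and processed in a distributed way:

%%
SELECT count()
FROM remote('example01-0{1,2}-1', test, hits)
WHERE EventDate = today()
%%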
    -
    -

    Форматы

    -
    -
    - -Формат определяет, в каком виде данные отдаются вам (пишутся, форматируются сервером) при SELECT-е и в каком виде принимаются (читаются, парсятся сервером) при INSERT-е. - - -==Native== - -Самый эффективный формат. Данные пишутся и читаются блоками в бинарном виде. Для каждого блока пишется количество строк, количество столбцов, имена и типы столбцов, а затем кусочки столбцов этого блока, один за другим. То есть, этот формат является "столбцовым" - не преобразует столбцы в строки. Именно этот формат используется в родном интерфейсе - при межсерверном взаимодействии, при использовании клиента командной строки, при работе клиентов, написанных на C++. - -Вы можете использовать этот формат для быстрой генерации дампов, которые могут быть прочитаны только СУБД ClickHouse. Вряд ли имеет смысл работать с этим форматом самостоятельно. - - -==TabSeparated== - -В TabSeparated формате данные пишутся по строкам. Каждая строчка содержит значения, разделённые табами. После каждого значения идёт таб, кроме последнего значения в строке, после которого идёт перевод строки. Везде подразумеваются исключительно unix-переводы строк. Последняя строка также обязана содержать перевод строки на конце. Значения пишутся в текстовом виде, без обрамляющих кавычек, с экранированием служебных символов. - -Целые числа пишутся в десятичной форме. Числа могут содержать лишний символ "+" в начале (игнорируется при парсинге, а при форматировании не пишется). Неотрицательные числа не могут содержать знак отрицания. При чтении допустим парсинг пустой строки, как числа ноль, или (для знаковых типов) строки, состоящей из одного минуса, как числа ноль. Числа, не помещающиеся в соответствующий тип данных, могут парсится, как некоторое другое число, без сообщения об ошибке. - -Числа с плавающей запятой пишутся в десятичной форме. При этом, десятичный разделитель - точка. Поддерживается экспоненциальная запись, а также inf, +inf, -inf, nan. Запись числа с плавающей запятой может начинаться или заканчиваться на десятичную точку. -При форматировании возможна потеря точности чисел с плавающей запятой. -При парсинге, допустимо чтение не обязательно наиболее близкого к десятичной записи машинно-представимого числа. - -Даты выводятся в формате YYYY-MM-DD, парсятся в том же формате, но с любыми символами в качестве разделителей. -Даты-с-временем выводятся в формате YYYY-MM-DD hh:mm:ss, парсятся в том же формате, но с любыми символами в качестве разделителей. -Всё это происходит в системном часовом поясе на момент старта клиента (если клиент занимается форматированием данных) или сервера. Для дат-с-временем не указывается, действует ли daylight saving time. То есть, если в дампе есть времена во время перевода стрелок назад, то дамп не соответствует данным однозначно, и при парсинге будет выбрано какое-либо из двух времён. -При парсинге, некорректные даты и даты-с-временем могут парситься с естественным переполнением или как нулевые даты/даты-с-временем без сообщения об ошибке. - -В качестве исключения, поддерживается также парсинг даты-с-временем в формате unix timestamp, если он состоит ровно из 10 десятичных цифр. Результат не зависит от часового пояса. Различение форматов YYYY-MM-DD hh:mm:ss и NNNNNNNNNN делается автоматически. - -Строки выводятся с экранированием спец-символов с помощью обратного слеша. При выводе, используются следующие escape-последовательности: %%\b%%, %%\f%%, %%\r,%% %%\n%%, %%\t%%, %%\0%%, %%\'%%, %%\\%%. 
При парсинге, также поддерживаются последовательности %%\a%%, %%\v%%, а также \xHH (hex escape-последовательности) и любые последовательности вида \c, где c - любой символ - такие последовательности преобразуется в c. Таким образом, при чтении поддерживаются форматы, где перевод строки может быть записан как %%\n%% и как %%\%% и перевод строки. Например, строка Hello world, где между словами вместо пробела стоит перевод строки, может быть считана в любом из следующих вариантов: - -%%Hello\nworld%% - -%%Hello\ -world%% - -Второй вариант поддерживается, так как его использует MySQL при записи tab-separated дампа. - -Минимальный набор символов, которых вам необходимо экранировать при передаче в TabSeparated формате: таб, перевод строки (LF) и обратный слеш. - -Экранируется лишь небольшой набор символов. Вы можете легко наткнуться на строковое значение, которое испортит ваш терминал при выводе в него. - -Массивы форматируются в виде списка значений через запятую в квадратных скобках. Элементы массива - числа форматируются как обычно, а даты, даты-с-временем и строки - в одинарных кавычках с такими же правилами экранирования, как указано выше. - -Формат TabSeparated удобен для обработки данных произвольными программами и скриптами. Он используется по умолчанию в HTTP-интерфейсе, а также в batch-режиме клиента командной строки. Также формат позволяет переносить данные между разными СУБД. Например, вы можете получить дамп из MySQL и загрузить его в ClickHouse, или наоборот. - -Формат TabSeparated поддерживает вывод тотальных значений (при использовании WITH TOTALS) и экстремальных значений (при настройке extremes выставленной в 1). В этих случаях, после основных данных выводятся тотальные значения, и экстремальные значения. Основной результат, тотальные значения и экстремальные значения, отделяются друг от друга пустой строкой. Пример: - -%%SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT TabSeparated%% - -%% -2014-03-17 1406958 -2014-03-18 1383658 -2014-03-19 1405797 -2014-03-20 1353623 -2014-03-21 1245779 -2014-03-22 1031592 -2014-03-23 1046491 - -0000-00-00 8873898 - -2014-03-17 1031592 -2014-03-23 1406958 -%% - -Этот формат также доступен под именем %%TSV%%. - - -==TabSeparatedWithNames== - -Отличается от формата TabSeparated тем, что в первой строке пишутся имена столбцов. -При парсинге, первая строка полностью игнорируется: вы не можете использовать имена столбцов, чтобы указать их порядок расположения, или чтобы проверить их корректность. -(Поддержка обработки заголовка при парсинге может быть добавлена в будущем.) - -Этот формат также доступен под именем %%TSVWithNames%%. - - -==TabSeparatedWithNamesAndTypes== - -Отличается от формата TabSeparated тем, что в первой строке пишутся имена столбцов, а во второй - типы столбцов. -При парсинге, первая и вторая строка полностью игнорируется. - -Этот формат также доступен под именем %%TSVWithNamesAndTypes%%. - - -==TabSeparatedRaw== - -Отличается от формата TabSeparated тем, что строки выводятся без экранирования. -Этот формат подходит только для вывода результата выполнения запроса, но не для парсинга (приёма данных для вставки в таблицу). - -Этот формат также доступен под именем %%TSVRaw%%. - -==BlockTabSeparated== - -Данные пишутся не по строкам, а по столбцам, блоками. -Каждый блок состоит из кусочков столбцов, каждый из которых пишется на отдельной строке. -Значения разделены табами, после последнего значения кусочка столбца, вместо таба ставится перевод строки. 
-Блоки разделены двойным переводом строки. -Остальные правила такие же, как в формате TabSeparated. -Этот формат подходит только для вывода результата выполнения запроса, но не для парсинга (приёма данных для вставки в таблицу). - - -==CSV== - -Формат comma separated values (RFC). - -При форматировании, строки выводятся в двойных кавычках. Двойная кавычка внутри строки выводится как две двойные кавычки подряд. Других правил экранирования нет. Даты и даты-с-временем выводятся в двойных кавычках. Числа выводятся без кавычек. Значения разделяются запятыми. Строки разделяются unix переводом строки (LF). Массивы сериализуются в CSV следующим образом: сначала массив сериализуется в строку, как в формате TabSeparated, а затем полученная строка выводится в CSV в двойных кавычках. Кортежи в формате CSV сериализуются, как отдельные столбцы (то есть, теряется их вложенность в кортеж). - -При парсинге, все значения могут парситься как в кавычках, так и без кавычек. Поддерживаются как двойные, так и одинарные кавычки. В том числе, строки могут быть расположены без кавычек - тогда они парсятся до запятой или перевода строки (CR или LF). В нарушение RFC, в случае парсинга строк не в кавычках, начальные и конечные пробелы и табы игнорируются. В качестве перевода строки, поддерживаются как Unix (LF), так и Windows (CR LF) и Mac OS Classic (LF CR) варианты. - -Формат CSV поддерживает вывод totals и extremes аналогично TabSeparated. - - -==CSVWithNames== - -Выводит также заголовок, аналогично TabSeparatedWithNames. - - -==RowBinary== - -Форматирует и парсит данные по строкам, в бинарном виде. Строки и значения уложены подряд, без разделителей. -Формат менее эффективен, чем формат Native, так как является строковым. - -Числа представлены в little endian формате фиксированной длины. Для примера, UInt64 занимает 8 байт. -DateTime представлены как UInt32, содержащий unix timestamp в качестве значения. -Date представлены как UInt16, содержащий количество дней, прошедших с 1970-01-01 в качестве значения. -String представлены как длина в формате varint (unsigned LEB128), а затем байты строки. -FixedString представлены просто как последовательность байт. -Array представлены как длина в формате varint (unsigned LEB128), а затем элементы массива, подряд. - - -==Pretty== - -Выводит данные в виде Unicode-art табличек, также используя ANSI-escape последовательности для установки цветов в терминале. -Рисуется полная сетка таблицы и, таким образом, каждая строчка занимает две строки в терминале. -Каждый блок результата выводится в виде отдельной таблицы. Это нужно, чтобы можно было выводить блоки без буферизации результата (буферизация потребовалась бы, чтобы заранее вычислить видимую ширину всех значений.) -Для защиты от вываливания слишком большого количества данных в терминал, выводится только первые 10 000 строк. Если строк больше или равно 10 000, то будет написано "Showed first 10 000." -Этот формат подходит только для вывода результата выполнения запроса, но не для парсинга (приёма данных для вставки в таблицу). - -Формат Pretty поддерживает вывод тотальных значений (при использовании WITH TOTALS) и экстремальных значений (при настройке extremes выставленной в 1). В этих случаях, после основных данных выводятся тотальные значения, и экстремальные значения, в отдельных табличках. 
Пример (показан для формата PrettyCompact): - -%%SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT PrettyCompact%% - -%% -┌──EventDate─┬───────c─┐ -│ 2014-03-17 │ 1406958 │ -│ 2014-03-18 │ 1383658 │ -│ 2014-03-19 │ 1405797 │ -│ 2014-03-20 │ 1353623 │ -│ 2014-03-21 │ 1245779 │ -│ 2014-03-22 │ 1031592 │ -│ 2014-03-23 │ 1046491 │ -└────────────┴─────────┘ - -Totals: -┌──EventDate─┬───────c─┐ -│ 0000-00-00 │ 8873898 │ -└────────────┴─────────┘ - -Extremes: -┌──EventDate─┬───────c─┐ -│ 2014-03-17 │ 1031592 │ -│ 2014-03-23 │ 1406958 │ -└────────────┴─────────┘ -%% - -==PrettyCompact== - -Отличается от Pretty тем, что не рисуется сетка между строками - результат более компактный. -Этот формат используется по умолчанию в клиенте командной строки в интерактивном режиме. - - -==PrettyCompactMonoBlock== - -Отличается от PrettyCompact тем, что строки (до 10 000 штук) буферизуются и затем выводятся в виде одной таблицы, а не по блокам. - - -==PrettySpace== - -Отличается от PrettyCompact тем, что вместо сетки используется пустое пространство (пробелы). - - -==PrettyNoEscapes== - -Отличается от Pretty тем, что не используются ANSI-escape последовательности. Это нужно для отображения этого формата в браузере, а также при использовании утилиты командной строки watch. Пример: - -%%watch -n1 "clickhouse-client --query='SELECT * FROM system.events FORMAT PrettyCompactNoEscapes'"%% - -Для отображения в браузере, вы можете использовать HTTP интерфейс. - - -==PrettyCompactNoEscapes== - -Аналогично. - - -==PrettySpaceNoEscapes== - -Аналогично. - - -==Vertical== - -Выводит каждое значение на отдельной строке, с указанием имени столбца. Формат удобно использовать для вывода одной-нескольких строк, если каждая строка состоит из большого количества столбцов. -Этот формат подходит только для вывода результата выполнения запроса, но не для парсинга (приёма данных для вставки в таблицу). - - -==Values== - -Выводит каждую строку в скобках. Строки разделены запятыми. После последней строки запятой нет. Значения внутри скобок также разделены запятыми. Числа выводятся в десятичном виде без кавычек. Массивы выводятся в квадратных скобках. Строки, даты, даты-с-временем выводятся в кавычках. Правила экранирования и особенности парсинга аналогичны формату TabSeparated. При форматировании, лишние пробелы не ставятся, а при парсинге - допустимы и пропускаются (за исключением пробелов внутри значений типа массив, которые недопустимы). - -Минимальный набор символов, которых вам необходимо экранировать при передаче в Values формате: одинарная кавычка и обратный слеш. - -Именно этот формат используется в запросе INSERT INTO t VALUES ... -Но вы также можете использовать его для форматирования результатов запросов. - - -==JSON== - -Выводит данные в формате JSON. Кроме таблицы с данными, также выводятся имена и типы столбцов, и некоторая дополнительная информация - общее количество выведенных строк, а также количество строк, которое могло бы быть выведено, если бы не было LIMIT-а. 
Пример: - -%%SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase WITH TOTALS ORDER BY c DESC LIMIT 5 FORMAT JSON%% - -%% -{ - "meta": - [ - { - "name": "SearchPhrase", - "type": "String" - }, - { - "name": "c", - "type": "UInt64" - } - ], - - "data": - [ - { - "SearchPhrase": "", - "c": "8267016" - }, - { - "SearchPhrase": "интерьер ванной комнаты", - "c": "2166" - }, - { - "SearchPhrase": "яндекс", - "c": "1655" - }, - { - "SearchPhrase": "весна 2014 мода", - "c": "1549" - }, - { - "SearchPhrase": "фриформ фото", - "c": "1480" - } - ], - - "totals": - { - "SearchPhrase": "", - "c": "8873898" - }, - - "extremes": - { - "min": - { - "SearchPhrase": "", - "c": "1480" - }, - "max": - { - "SearchPhrase": "", - "c": "8267016" - } - }, - - "rows": 5, - - "rows_before_limit_at_least": 141137 -} -%% - -JSON совместим с JavaScript. Для этого, дополнительно экранируются некоторые символы: символ прямого слеша %%/%% экранируется в виде %%\/%%; альтернативные переводы строк %%U+2028%%, %%U+2029%%, на которых ломаются некоторые браузеры, экранируются в виде \uXXXX-последовательностей. Экранируются ASCII control characters: backspace, form feed, line feed, carriage return, horizontal tab в виде %%\b%%, %%\f%%, %%\n%%, %%\r%%, %%\t%% соответственно, а также остальные байты из диапазона 00-1F с помощью \uXXXX-последовательностей. Невалидные UTF-8 последовательности заменяются на replacement character %%�%% и, таким образом, выводимый текст будет состоять из валидных UTF-8 последовательностей. Числа типа UInt64 и Int64, для совместимости с JavaScript, по умолчанию выводятся в двойных кавычках, чтобы они выводились без кавычек можно установить конфигурационный параметр output_format_json_quote_64bit_integers равным 0. - -%%rows%% - общее количество выведенных строчек. -%%rows_before_limit_at_least%% - не менее скольких строчек получилось бы, если бы не было LIMIT-а. Выводится только если запрос содержит LIMIT. -В случае, если запрос содержит GROUP BY, %%rows_before_limit_at_least%% - точное число строк, которое получилось бы, если бы не было LIMIT-а. - -%%totals%% - тотальные значения (при использовании %%WITH TOTALS%%). -%%extremes%% - экстремальные значения (при настройке %%extremes%%, выставленной в 1). - -Этот формат подходит только для вывода результата выполнения запроса, но не для парсинга (приёма данных для вставки в таблицу). -Смотрите также формат JSONEachRow. - - -==JSONCompact== - -Отличается от JSON только тем, что строчки данных выводятся в массивах, а не в object-ах. Пример: - -%% -{ - "meta": - [ - { - "name": "SearchPhrase", - "type": "String" - }, - { - "name": "c", - "type": "UInt64" - } - ], - - "data": - [ - ["", "8267016"], - ["интерьер ванной комнаты", "2166"], - ["яндекс", "1655"], - ["весна 2014 мода", "1549"], - ["фриформ фото", "1480"] - ], - - "totals": ["","8873898"], - - "extremes": - { - "min": ["","1480"], - "max": ["","8267016"] - }, - - "rows": 5, - - "rows_before_limit_at_least": 141137 -} -%% - -Этот формат подходит только для вывода результата выполнения запроса, но не для парсинга (приёма данных для вставки в таблицу). -Смотрите также формат JSONEachRow. - - -==JSONEachRow== - -Выводит данные в виде отдельных JSON объектов для каждой строки (newline delimited JSON). 
- -%% -{"SearchPhrase":"","count()":"8267016"} -{"SearchPhrase":"интерьер ванной комнаты","count()":"2166"} -{"SearchPhrase":"яндекс","count()":"1655"} -{"SearchPhrase":"весна 2014 мода","count()":"1549"} -{"SearchPhrase":"фриформ фото","count()":"1480"} -{"SearchPhrase":"анджелина джоли","count()":"1245"} -{"SearchPhrase":"омск","count()":"1112"} -{"SearchPhrase":"фото собак разных пород","count()":"1091"} -{"SearchPhrase":"дизайн штор","count()":"1064"} -{"SearchPhrase":"баку","count()":"1000"} -%% - -В отличие от формата JSON, нет замены невалидных UTF-8 последовательностей. В строках может выводиться произвольный набор байт. Это сделано для того, чтобы данные форматировались без потери информации. Экранирование значений осуществляется аналогично формату JSON. - -При парсинге, поддерживается расположение значений разных столбцов в произвольном порядке. Допустимо отсутствие некоторых значений - тогда они воспринимаются как равные значениям по умолчанию. При этом, в качестве значений по умолчанию используются нули, пустые строки и не поддерживаются сложные значения по умолчанию, которые могут быть заданы в таблице. Пропускаются пробельные символы между элементами. После объектов может быть расположена запятая, которая игнорируется. Объекты не обязательно должны быть разделены переводами строк. - - -==TSKV== - -Похож на TabSeparated, но выводит значения в формате %%name=value%%. Имена экранируются так же, как строки в формате TabSeparated и, дополнительно, экранируется также символ %%=%%. - -%% -SearchPhrase= count()=8267016 -SearchPhrase=интерьер ванной комнаты count()=2166 -SearchPhrase=яндекс count()=1655 -SearchPhrase=весна 2014 мода count()=1549 -SearchPhrase=фриформ фото count()=1480 -SearchPhrase=анджелина джоли count()=1245 -SearchPhrase=омск count()=1112 -SearchPhrase=фото собак разных пород count()=1091 -SearchPhrase=дизайн штор count()=1064 -SearchPhrase=баку count()=1000 -%% - -При большом количестве маленьких столбцов, этот формат существенно неэффективен, и обычно нет причин его использовать. Он реализован, так как используется в некоторых отделах Яндекса. - -Поддерживается как вывод, так и парсинг данных в этом формате. При парсинге, поддерживается расположение значений разных столбцов в произвольном порядке. Допустимо отсутствие некоторых значений - тогда они воспринимаются как равные значениям по умолчанию. При этом, в качестве значений по умолчанию используются нули, пустые строки и не поддерживаются сложные значения по умолчанию, которые могут быть заданы в таблице. - -При парсинге, в качестве дополнительного поля, может присутствовать %%tskv%% без знака равенства и без значения. Это поле игнорируется. - - -==XML== - -Формат XML подходит только для вывода данных, не для парсинга. 
Пример: - -%% -<?xml version='1.0' encoding='UTF-8' ?> -<result> - <meta> - <columns> - <column> - <name>SearchPhrase</name> - <type>String</type> - </column> - <column> - <name>count()</name> - <type>UInt64</type> - </column> - </columns> - </meta> - <data> - <row> - <SearchPhrase></SearchPhrase> - <field>8267016</field> - </row> - <row> - <SearchPhrase>интерьер ванной комнаты</SearchPhrase> - <field>2166</field> - </row> - <row> - <SearchPhrase>яндекс</SearchPhrase> - <field>1655</field> - </row> - <row> - <SearchPhrase>весна 2014 мода</SearchPhrase> - <field>1549</field> - </row> - <row> - <SearchPhrase>фриформ фото</SearchPhrase> - <field>1480</field> - </row> - <row> - <SearchPhrase>анджелина джоли</SearchPhrase> - <field>1245</field> - </row> - <row> - <SearchPhrase>омск</SearchPhrase> - <field>1112</field> - </row> - <row> - <SearchPhrase>фото собак разных пород</SearchPhrase> - <field>1091</field> - </row> - <row> - <SearchPhrase>дизайн штор</SearchPhrase> - <field>1064</field> - </row> - <row> - <SearchPhrase>баку</SearchPhrase> - <field>1000</field> - </row> - </data> - <rows>10</rows> - <rows_before_limit_at_least>141137</rows_before_limit_at_least> -</result> -%% - -Если имя столбца не имеет некоторый допустимый вид, то в качестве имени элемента используется просто field. В остальном, структура XML повторяет структуру в формате JSON. -Как и для формата JSON, невалидные UTF-8 последовательности заменяются на replacement character � и, таким образом, выводимый текст будет состоять из валидных UTF-8 последовательностей. - -В строковых значениях, экранируются символы %%<%% и %%&%% как %%&lt;%% и %%&amp;%%. - -Массивы выводятся как %%<array><elem>Hello</elem><elem>World</elem>...</array>%%, -а кортежи как %%<tuple><elem>Hello</elem><elem>World</elem>...</tuple>%%. - -==Null== - -Ничего не выводит. При этом, запрос обрабатывается, а при использовании клиента командной строки, данные ещё и передаются на клиент. Используется для тестов, в том числе, тестов производительности. -Очевидно, формат подходит только для вывода, но не для парсинга. - -
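To tie the chapter together: the format name is given in the FORMAT clause of a query, both for output and for input. A minimal sketch, assuming test.hits as above and a hypothetical table test.phrases with columns SearchPhrase String and c UInt64:

%%
SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 3 FORMAT CSVWithNames

INSERT INTO test.phrases FORMAT JSONEachRow
{"SearchPhrase": "весна 2014 мода", "c": 1549}
%%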
    -
    -

    Типы данных

    -
    -
    - -==UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64== - -Целые числа фиксированной длины, без знака или со знаком. - - -==Float32, Float64== - -Числа с плавающей запятой, то же, что и float, double в языке C. -В отличие от стандартного SQL, числа с плавающей запятой поддерживают inf, -inf, а также nan-ы. -Смотрите замечание о сортировке nan-ов в разделе "Секция ORDER BY". -Не рекомендуется хранить числа с плавающей запятой в таблицах. - - -==String== - -Строки произвольной длины. Длина не ограничена. Значение может содержать произвольный набор байт, включая нулевые байты. -Таким образом, тип String заменяет типы VARCHAR, BLOB, CLOB и т. п. из других СУБД. - -===Кодировки=== - -В ClickHouse нет понятия кодировок. Строки могут содержать произвольный набор байт, который хранится и выводится, как есть. -Если вам нужно хранить тексты, рекомендуется использовать кодировку UTF-8. По крайней мере, если у вас терминал работает в кодировке UTF-8 (это рекомендуется), вы сможете читать и писать свои значения без каких-либо преобразований. -Также, некоторые функции по работе со строками, имеют отдельные варианты, которые работают при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. -Например, функция length вычисляет длину строки в байтах, а функция lengthUTF8 - длину строки в кодовых точках Unicode, при допущении, что значение в кодировке UTF-8. - - -==FixedString(N)== - -Строка фиксированной длины N байт (не символов, не кодовых точек). N должно быть строго положительным натуральным числом. -При чтении сервером строки (например, при парсинге данных для INSERT), содержащей меньшее число байт, строка дополняется до N байт дописыванием нулевых байт справа. -При чтении сервером строки, содержащей большее число байт, выдаётся сообщение об ошибке. -При записи сервером строки (например, при выводе результата запроса SELECT), нулевые байты с конца строки не вырезаются, а выводятся. -Обратите внимание, как это поведение отличается от поведения MySQL для типа CHAR (строки дополняются пробелами, пробелы перед выводом вырезаются). - -С типом FixedString(N) умеет работать меньше функций, чем с типом String - то есть, он менее удобен в использовании. - - -==Date== - -Дата. Хранится в двух байтах в виде (беззнакового) числа дней, прошедших от 1970-01-01. Позволяет хранить значения от чуть больше, чем начала unix-эпохи до верхнего порога, определяющегося константой на этапе компиляции (сейчас - до 2038 года, но может быть расширено до 2106 года). -Минимальное значение выводится как 0000-00-00. - -Дата хранится без учёта часового пояса. - - -==DateTime== - -Дата-с-временем. Хранится в 4 байтах, в виде (беззнакового) unix timestamp. Позволяет хранить значения в том же интервале, что и для типа Date. Минимальное значение выводится как 0000-00-00 00:00:00. -Время хранится с точностью до одной секунды (без учёта секунд координации). - -===Часовые пояса=== - -Дата-с-временем преобразуется из текстового (разбитого на составляющие) в бинарный вид и обратно, с использованием системного часового пояса на момент старта клиента или сервера. В текстовом виде, теряется информация о том, был ли произведён перевод стрелок. - -По умолчанию клиент переключается на часовой пояс сервера при подключении. Это поведение можно изменить, включив у клиента параметр командной строки --use_client_time_zone. 
- -Поддерживаются только часовые пояса, для которых для всего диапазона времён, с которым вы будете работать, не существовало моментов времени, в которые время отличалось от UTC на нецелое число часов (без учёта секунд координации). - -То есть, при работе с датой в виде текста (например, при сохранении текстовых дампов), следует иметь ввиду о проблемах с неоднозначностью во время перевода стрелок назад, и о проблемах с соответствием данных, при смене часового пояса. - - -==Enum== - -Enum8 или Enum16. Представляет собой конечное множество строковых значений, сохраняемых более эффективно, чем это делает тип данных %%String%%. Пример: - -%%Enum8('hello' = 1, 'world' = 2)%% -- тип данных с двумя возможными значениями - 'hello' и 'world'. - -Для каждого из значений прописывается число в диапазоне -128..127 для %%Enum8%% или в диапазоне -32768..32767 для %%Enum16%%. Все строки должны быть разными, числа - тоже. Разрешена пустая строка. При указании такого типа (в определении таблицы), числа могут идти не подряд и в произвольном порядке. При этом, порядок не имеет значения. - -В оперативке столбец такого типа представлен так же, как %%Int8%% или %%Int16%% соответствующими числовыми значениями. -При чтении в текстовом виде, парсит значение как строку и ищет соответствующую строку из множества значений Enum-а. Если не находит - кидается исключение. -При записи в текстовом виде, записывает значение как соответствующую строку. Если в данных столбца есть мусор - числа не из допустимого множества, то кидается исключение. При чтении и записи в бинарном виде, оно осуществляется так же, как для типов данных %%Int8%%, %%Int16%%. -Неявное значение по умолчанию - это значение с минимальным номером. - -При %%ORDER BY%%, %%GROUP BY%%, %%IN%%, %%DISTINCT%% и т. п., Enum-ы ведут себя так же, как соответствующие числа. Например, при %%ORDER BY%% они сортируются по числовым значениям. Функции сравнения на равенство и сравнения на отношение порядка двух Enum-ов работают с Enum-ами так же, как с числами. - -Сравнивать Enum с числом нельзя. Можно сравнивать Enum с константной строкой - при этом, для строки ищется соответствующее значение Enum-а; если не находится - кидается исключение. Поддерживается оператор IN, где слева стоит Enum, а справа - множество строк. В этом случае, строки рассматриваются как значения соответствующего Enum-а. - -Большинство операций с числами и со строками не имеет смысла и не работают для Enum-ов: например, к Enum-у нельзя прибавить число. -Для Enum-а естественным образом определяется функция %%toString%%, которая возвращает его строковое значение. - -Также для Enum-а определяются функции %%toT%%, где T - числовой тип. При совпадении T с типом столбца Enum-а, преобразование работает бесплатно. -При ALTER, есть возможность бесплатно изменить тип Enum-а, если меняется только множество значений. При этом, можно добавлять новые значения; можно удалять старые значения (это безопасно только если они ни разу не использовались, так как это не проверяется). В качестве "защиты от дурака", нельзя менять числовые значения у имеющихся строк - в этом случае, кидается исключение. - -При ALTER, есть возможность поменять Enum8 на Enum16 и обратно - так же, как можно поменять Int8 на Int16. - - -==Array(T)== - -Массив из элементов типа T. Типом T может быть любой тип, в том числе, массив. -Многомерные массивы не рекомендуется использовать, так как их поддержка довольно слабая (например, многомерные массивы нельзя сохранить в таблицы с движком семейства MergeTree). 
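A small sketch of how the Enum and Array types above behave in practice; the table name, the Memory engine and the values are arbitrary choices for the example:

%%
CREATE TABLE test.enum_demo (x Enum8('hello' = 1, 'world' = 2), tags Array(String)) ENGINE = Memory

INSERT INTO test.enum_demo VALUES ('hello', ['a', 'b']), ('world', [])

SELECT x, toInt8(x) AS num, toString(x) AS str, tags, length(tags) AS n FROM test.enum_demo
%%

Inserting a string outside the declared set would raise an exception, as described above.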
- - -==Tuple(T1, T2, ...)== - -Кортежи не могут быть записаны в таблицы (кроме таблиц типа Memory). Они используется для временной группировки столбцов. Столбцы могут группироваться при использовании выражения IN в запросе, а также для указания нескольких формальных параметров лямбда-функций. Подробнее смотрите раздел "Операторы IN", "Функции высшего порядка". - -Кортежи могут быть выведены в результате выполнения запроса. В этом случае, в текстовых форматах кроме JSON*, значения выводятся в круглых скобках через запятую. В форматах JSON*, кортежи выводятся в виде массивов (в квадратных скобках). - - -==Вложенные структуры данных== - -==Nested(Name1 Type1, Name2 Type2, ...)== - -Вложенная структура данных - это как будто вложенная таблица. Параметры вложенной структуры данных - имена и типы столбцов, указываются так же, как в запроса CREATE. Каждой строке таблицы может соответствовать произвольное количество строк вложенной структуры данных. - -Пример: - -%% -CREATE TABLE test.visits -( - CounterID UInt32, - StartDate Date, - Sign Int8, - IsNew UInt8, - VisitID UInt64, - UserID UInt64, - ... - Goals Nested - ( - ID UInt32, - Serial UInt32, - EventTime DateTime, - Price Int64, - OrderID String, - CurrencyID UInt32 - ), - ... -) ENGINE = CollapsingMergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192, Sign) -%% - -В этом примере объявлена вложенная структура данных Goals, содержащая данные о достижении целей. Каждой строке таблицы visits может соответствовать от нуля до произвольного количества достижений целей. - -Поддерживается только один уровень вложенности. Столбцы вложенных структур, содержащие массивы, эквивалентны многомерным массивам, поэтому их поддержка ограничена (не поддерживается хранение таких столбцов в таблицах с движком семейства MergeTree). - -В большинстве случаев, при работе с вложенной структурой данных, указываются отдельные её столбцы. Для этого, имена столбцов указываются через точку. Эти столбцы представляют собой массивы соответствующих типов. Все столбцы-массивы одной вложенной структуры данных имеют одинаковые длины. - -Пример: - -%% -SELECT - Goals.ID, - Goals.EventTime -FROM test.visits -WHERE CounterID = 101500 AND length(Goals.ID) < 5 -LIMIT 10 - -┌─Goals.ID───────────────────────┬─Goals.EventTime───────────────────────────────────────────────────────────────────────────┐ -│ [1073752,591325,591325] │ ['2014-03-17 16:38:10','2014-03-17 16:38:48','2014-03-17 16:42:27'] │ -│ [1073752] │ ['2014-03-17 00:28:25'] │ -│ [1073752] │ ['2014-03-17 10:46:20'] │ -│ [1073752,591325,591325,591325] │ ['2014-03-17 13:59:20','2014-03-17 22:17:55','2014-03-17 22:18:07','2014-03-17 22:18:51'] │ -│ [] │ [] │ -│ [1073752,591325,591325] │ ['2014-03-17 11:37:06','2014-03-17 14:07:47','2014-03-17 14:36:21'] │ -│ [] │ [] │ -│ [] │ [] │ -│ [591325,1073752] │ ['2014-03-17 00:46:05','2014-03-17 00:46:05'] │ -│ [1073752,591325,591325,591325] │ ['2014-03-17 13:28:33','2014-03-17 13:30:26','2014-03-17 18:51:21','2014-03-17 18:51:45'] │ -└────────────────────────────────┴───────────────────────────────────────────────────────────────────────────────────────────┘ -%% - -Проще всего понимать вложенную структуру данных, как набор из нескольких столбцов-массивов одинаковых длин. - -Единственное место, где в запросе SELECT можно указать имя целой вложенной структуры данных, а не отдельных столбцов - секция ARRAY JOIN. Подробнее см. раздел "Секция ARRAY JOIN". 
Пример: - -%% -SELECT - Goal.ID, - Goal.EventTime -FROM test.visits -ARRAY JOIN Goals AS Goal -WHERE CounterID = 101500 AND length(Goals.ID) < 5 -LIMIT 10 - -┌─Goal.ID─┬──────Goal.EventTime─┐ -│ 1073752 │ 2014-03-17 16:38:10 │ -│ 591325 │ 2014-03-17 16:38:48 │ -│ 591325 │ 2014-03-17 16:42:27 │ -│ 1073752 │ 2014-03-17 00:28:25 │ -│ 1073752 │ 2014-03-17 10:46:20 │ -│ 1073752 │ 2014-03-17 13:59:20 │ -│ 591325 │ 2014-03-17 22:17:55 │ -│ 591325 │ 2014-03-17 22:18:07 │ -│ 591325 │ 2014-03-17 22:18:51 │ -│ 1073752 │ 2014-03-17 11:37:06 │ -└─────────┴─────────────────────┘ -%% - -Вы не можете сделать SELECT целой вложенной структуры данных. Можно лишь явно перечислить отдельные столбцы - её составляющие. - -При запросе INSERT, вы должны передать все составляющие столбцы-массивы вложенной структуры данных по-отдельности (как если бы это были отдельные столбцы-массивы). При вставке проверяется, что они имеют одинаковые длины. - -При запросе DESCRIBE, столбцы вложенной структуры данных перечисляются так же по отдельности. - -Работоспособность запроса ALTER для элементов вложенных структур данных, является сильно ограниченной. - - -==AggregateFunction(name, types_of_arguments...)== - -Промежуточное состояние агрегатной функции. Чтобы его получить, используются агрегатные функции с суффиксом -State. Подробнее смотрите в разделе "AggregatingMergeTree". - - -==Служебные типы данных== - -Значения служебных типов данных не могут сохраняться в таблицу и выводиться в качестве результата, а возникают как промежуточный результат выполнения запроса. - -===Set=== - -Используется для представления правой части выражения IN. - -===Expression=== - -Используется для представления лямбда-выражений в функциях высшего порядка. - - -==Булевы значения== - -Отдельного типа для булевых значений нет. Для них используется тип UInt8, в котором используются только значения 0 и 1. - -
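The -State suffix mentioned above can be sketched without an AggregatingMergeTree table at all, by merging the intermediate states in an outer query. This assumes test.hits has a UserID column (it is not shown in this section):

%%
SELECT uniqMerge(u) AS total_uniq
FROM
(
    SELECT EventDate, uniqState(UserID) AS u
    FROM test.hits
    GROUP BY EventDate
)
%%

The inner query returns values of type AggregateFunction(uniq, ...), one per date; the outer query combines them into a single result.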
    -
    -

    Операторы

    -
    -
    - -Все операторы преобразуются в соответствующие функции на этапе парсинга запроса, с учётом их приоритетов и ассоциативности. -Далее будут перечислены группы операторов в порядке их приоритета (чем выше, тем раньше оператор связывается со своими аргументами). - -==Операторы доступа== - -%%a[N]%% - доступ к элементу массива, функция arrayElement(a, N). -%%a.N%% - доступ к элементу кортежа, функция tupleElement(a, N). - -==Оператор числового отрицания== - -%%-a%% - функция negate(a). - -==Операторы умножения и деления== - -%%a * b%% - функция multiply(a, b) -%%a / b%% - функция divide(a, b) -%%a % b%% - функция modulo(a, b) - -==Операторы сложения и вычитания== - -%%a + b%% - функция plus(a, b) -%%a - b%% - функция minus(a, b) - -==Операторы сравнения== - -%%a = b%% - функция equals(a, b) -%%a == b%% - функция equals(a, b) -%%a != b%% - функция notEquals(a, b) -%%a <> b%% - функция notEquals(a, b) -%%a <= b%% - функция lessOrEquals(a, b) -%%a >= b%% - функция greaterOrEquals(a, b) -%%a < b%% - функция less(a, b) -%%a > b%% - функция greater(a, b) -%%a LIKE s%% - функция like(a, b) -%%a NOT LIKE s%% - функция notLike(a, b) -%%a BETWEEN b AND c%% - равнозначно %%a >= b AND a <= c%% - -==Операторы для работы с множествами== - -Смотрите раздел "Операторы IN". - -%%a IN ...%% - функция in(a, b) -%%a NOT IN ...%% - функция notIn(a, b) -%%a GLOBAL IN ...%% - функция globalIn(a, b) -%%a GLOBAL NOT IN ...%% - функция globalNotIn(a, b) - -==Оператор логического отрицания== - -%%NOT a%% - функция not(a) - -==Оператор логического "И".== - -%%a AND b%% - функция and(a, b) - -==Оператор логического "ИЛИ".== - -%%a OR b%% - функция or(a, b) - -==Условный оператор== - -%%a ? b : c%% - функция if(a, b, c) - -==Условное выражение== -%% -CASE [x] - WHEN a THEN b - [WHEN ... THEN ...] - ELSE c -END -%% -В случае указания x - функция transform(x, [a, ...], [b, ...], c). -Иначе - multiIf(a, b, ..., c). - -==Оператор склеивания строк== -%%s1 || s2%% - функция concat(s1, s2) - -==Оператор создания лямбда-выражения== - -%%x -> expr%% - функция lambda(x, expr) - -Следующие операторы не имеют приоритета, так как представляют собой скобки: - -==Оператор создания массива== - -%%[x1, ...]%% - функция array(x1, ...) - -==Оператор создания кортежа== - -%%(x1, x2, ...)%% - функция tuple(x2, x2, ...) - - -==Ассоциативность== - -Все бинарные операторы имеют левую ассоциативность. Например, 1 + 2 + 3 преобразуется в plus(plus(1, 2), 3). -Иногда это работает не так, как ожидается. Например, SELECT 4 > 3 > 2 выдаст 0. - -Для эффективности, реализованы функции and и or, принимающие произвольное количество аргументов. Соответствующие цепочки операторов AND и OR, преобразуются в один вызов этих функций. - -
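A short sketch showing the operator-to-function mapping, precedence and left associativity described above in one query:

%%
SELECT 1 + 2 * 3 AS a, plus(1, multiply(2, 3)) AS b, 4 > 3 > 2 AS c, [10, 20, 30][2] AS d
%%

Per the rules above, a and b are both 7, c is 0 (it is parsed as greater(greater(4, 3), 2)), and d is 20, since arrayElement indexing starts from one.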
    -
    -

    Функции

    -
    -
    - -Функции бывают как минимум* двух видов - обычные функции (называются просто, функциями) и агрегатные функции. Это совершенно разные вещи. Обычные функции работают так, как будто применяются к каждой строке по отдельности (для каждой строки, результат вычисления функции не зависит от других строк). Агрегатные функции аккумулируют множество значений из разных строк (то есть, зависят от целого множества строк). - -В этом разделе речь пойдёт об обычных функциях. Для агрегатных функций, смотрите раздел "Агрегатные функции". -* - есть ещё третий вид функций, к которым относится функция arrayJoin; также можно отдельно иметь ввиду табличные функции. - -===Строгая типизация=== - -В ClickHouse, в отличие от стандартного SQL, типизация является строгой. То есть, не производится неявных преобразований между типами. Все функции работают для определённого набора типов. Это значит, что иногда вам придётся использовать функции преобразования типов. - -===Склейка одинаковых выражений=== - -Все выражения в запросе, имеющие одинаковые AST (одинаковую запись или одинаковый результат синтаксического разбора), считаются имеющими одинаковые значения. Такие выражения склеиваются и исполняются один раз. Одинаковые подзапросы тоже склеиваются. - -===Типы результата=== - -Все функции возвращают одно (не несколько, не ноль) значение в качестве результата. Тип результата обычно определяется только типами аргументов, но не значениями аргументов. Исключение - функция tupleElement (оператор a.N), а также функция toFixedString. - -===Константы=== - -Для простоты, некоторые функции могут работать только с константами в качестве некоторых аргументов. Например, правый аргумент оператора LIKE должен быть константой. -Почти все функции возвращают константу для константных аргументов. Исключение - функции генерации случайных чисел. -Функция now возвращает разные значения для запросов, выполненных в разное время, но результат считается константой, так как константность важна лишь в пределах одного запроса. -Константное выражение также считается константой (например, правую часть оператора LIKE можно сконструировать из нескольких констант). - -Функции могут быть по-разному реализованы для константных и не константных аргументов (выполняется разный код). Но результат работы для константы и полноценного столбца, содержащего только одно такое же значение, должен совпадать. - -===Неизменяемость=== - -Функции не могут поменять значения своих аргументов - любые изменения возвращаются в качестве результата. Соответственно, от порядка записи функций в запросе, результат вычислений отдельных функций не зависит. - -===Обработка ошибок=== - -Некоторые функции могут кидать исключения в случае ошибочных данных. В этом случае, выполнение запроса прерывается, и текст ошибки выводится клиенту. При распределённой обработке запроса, при возникновении исключения на одном из серверов, на другие серверы пытается отправиться просьба тоже прервать выполнение запроса. - -===Вычисление выражений-аргументов=== - -В почти всех языках программирования, для некоторых операторов может не вычисляться один из аргументов. Обычно - для операторов &&, ||, ?:. -Но в ClickHouse, аргументы функций (операторов) вычисляются всегда. Это связано с тем, что вычисления производятся не по отдельности для каждой строки, а сразу для целых кусочков столбцов. 
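Strict typing in practice: a string is not silently treated as a number or a date, so the conversion functions described later in this chapter are applied explicitly. A minimal sketch:

%%
SELECT toUInt32('123') + 1 AS x, toDate('2014-03-17') = today() AS is_that_day
%%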
- -===Выполнение функций при распределённой обработке запроса=== - -При распределённой обработке запроса, как можно большая часть стадий выполнения запроса производится на удалённых серверах, а оставшиеся стадии (слияние промежуточных результатов и всё, что дальше) - на сервере-инициаторе запроса. - -Это значит, что выполнение функций может производиться на разных серверах. -Например, в запросе SELECT f(sum(g(x))) FROM distributed_table GROUP BY h(y), -- если %%distributed_table%% имеет хотя бы два шарда, то функции %%g%% и %%h%% выполняются на удалённых серверах, а функция %%f%% - на сервере-инициаторе запроса; -- если %%distributed_table%% имеет только один шард, то все функции %%f%%, %%g%%, %%h%% выполняются на сервере этого шарда. - -Обычно результат выполнения функции не зависит от того, на каком сервере её выполнить. Но иногда это довольно важно. -Например, функции, работающие со словарями, будут использовать словарь, присутствующий на том сервере, на котором они выполняются. -Другой пример - функция %%hostName%% вернёт имя сервера, на котором она выполняется, и это можно использовать для служебных целей - чтобы в запросе SELECT сделать GROUP BY по серверам. - -Если функция в запросе выполняется на сервере-инициаторе запроса, а вам нужно, чтобы она выполнялась на удалённых серверах, вы можете обернуть её в агрегатную функцию any или добавить в ключ в GROUP BY. - - -==Арифметические функции== - -Для всех арифметических функций, тип результата вычисляется, как минимальный числовой тип, который может вместить результат, если такой тип есть. Минимум берётся одновременно по числу бит, знаковости и "плавучести". Если бит не хватает, то берётся тип максимальной битности. - -Пример: - -
    -:) SELECT toTypeName(0), toTypeName(0 + 0), toTypeName(0 + 0 + 0), toTypeName(0 + 0 + 0 + 0)
    -
    -┌─toTypeName(0)─┬─toTypeName(plus(0, 0))─┬─toTypeName(plus(plus(0, 0), 0))─┬─toTypeName(plus(plus(plus(0, 0), 0), 0))─┐
    -│ UInt8         │ UInt16                 │ UInt32                          │ UInt64                                   │
    -└───────────────┴────────────────────────┴─────────────────────────────────┴──────────────────────────────────────────┘
    -
    - -Арифметические функции работают для любой пары типов из UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64. - -Переполнение производится также, как в C++. - - -===plus(a, b), оператор a + b=== - -Вычисляет сумму чисел. - -Также можно складывать целые числа с датой и датой-с-временем. В случае даты, прибавление целого числа означает прибавление соответствующего количества дней. В случае даты-с-временем - прибавление соответствующего количества секунд. - -===minus(a, b), оператор a - b=== - -Вычисляет разность чисел. Результат всегда имеет знаковый тип. - -Также можно вычитать целые числа из даты и даты-с-временем. Смысл аналогичен - смотрите выше для plus. - -===multiply(a, b), оператор a * b=== - -Вычисляет произведение чисел. - -===divide(a, b), оператор a / b=== - -Вычисляет частное чисел. Тип результата всегда является типом с плавающей запятой. -То есть, деление не целочисленное. Для целочисленного деления, используйте функцию intDiv. -При делении на ноль получится inf, -inf или nan. - -===intDiv(a, b)=== - -Вычисляет частное чисел. Деление целочисленное, с округлением вниз (по абсолютному значению). -При делении на ноль или при делении минимального отрицательного числа на минус единицу, кидается исключение. - -===intDivOrZero(a, b)=== - -Отличается от intDiv тем, что при делении на ноль или при делении минимального отрицательного числа на минус единицу, возвращается ноль. - -===modulo(a, b), оператор a % b=== - -Вычисляет остаток от деления. -Если аргументы - числа с плавающей запятой, то они предварительно преобразуются в целые числа, путём отбрасывания дробной части. -Берётся остаток в том же смысле, как это делается в C++. По факту, для отрицательных чисел, используется truncated division. -При делении на ноль или при делении минимального отрицательного числа на минус единицу, кидается исключение. - -===negate(a), оператор -a=== - -Вычисляет число, обратное по знаку. Результат всегда имеет знаковый тип. - -===abs(a)=== - -Вычисляет абсолютное значение для числа a. То есть, если a < 0, то возвращает -a. -Для беззнаковых типов ничего не делает. Для чисел типа целых со знаком, возвращает число беззнакового типа. - -==Битовые функции== - -Битовые функции работают для любой пары типов из UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64. - -Тип результата - целое число, битность которого равна максимальной битности аргументов. Если хотя бы один аргумент знаковый, то результат - знаковое число. Если аргумент - число с плавающей запятой - оно приводится к Int64. - -===bitAnd(a, b)=== - -===bitOr(a, b)=== - -===bitXor(a, b)=== - -===bitNot(a)=== - -===bitShiftLeft(a, b)=== - -===bitShiftRight(a, b)=== - - -==Функции сравнения== - -Функции сравнения возвращают всегда 0 или 1 (UInt8). - -Сравнивать можно следующие типы: - - числа; - - строки и фиксированные строки; - - даты; - - даты-с-временем; -внутри каждой группы, но не из разных групп. - -Например, вы не можете сравнить дату со строкой. Надо использовать функцию преобразования строки в дату или наоборот. - -Строки сравниваются побайтово. Более короткая строка меньше всех строк, начинающихся с неё и содержащих ещё хотя бы один символ. - -Замечание. До версии 1.1.54134 сравнение знаковых и беззнаковых целых чисел производилось также, как в C++. То есть, вы могли получить неверный результат в таких случаях: SELECT 9223372036854775807 > -1. С версии 1.1.54134 поведение изменилось и стало математически корректным. 
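A small sketch of the comparison rules just described; all four expressions should return 1: strings compare bytewise, a shorter string is less than any string that extends it, dates compare with dates, and numbers compare across numeric types. The individual comparison functions are listed right below.

%%
SELECT 'abc' < 'abd' AS s, '' < 'a' AS shorter_is_less, toDate('2014-03-17') < today() AS d, 1 = 1.0 AS n
%%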
- - -===equals, оператор a = b и a == b=== - -===notEquals, оператор a != b и a <> b=== - -===less, оператор <=== - -===greater, оператор >=== - -===lessOrEquals, оператор <==== - -===greaterOrEquals, оператор >==== - - -==Логические функции== - -Логические функции принимают любые числовые типы, а возвращают число типа UInt8, равное 0 или 1. - -Ноль в качестве аргумента считается "ложью", а любое ненулевое значение - "истиной". - - -===and, оператор AND=== - -===or, оператор OR=== - -===not, оператор NOT=== - -===xor=== - - -==Функции преобразования типов== - -===toUInt8, toUInt16, toUInt32, toUInt64=== -===toInt8, toInt16, toInt32, toInt64=== -===toFloat32, toFloat64=== -===toUInt8OrZero, toUInt16OrZero, toUInt32OrZero, toUInt64OrZero, toInt8OrZero, toInt16OrZero, toInt32OrZero, toInt64OrZero, toFloat32OrZero, toFloat64OrZero=== -===toDate, toDateTime=== -===toString=== - -Функции преобразования между числами, строками (но не фиксированными строками), датами и датами-с-временем. -Все эти функции принимают один аргумент. - -При преобразовании в строку или из строки, производится форматирование или парсинг значения по тем же правилам, что и для формата TabSeparated (и почти всех остальных текстовых форматов). Если распарсить строку не удаётся - кидается исключение и выполнение запроса прерывается. - -При преобразовании даты в число или наоборот, дате соответствует число дней от начала unix эпохи. -При преобразовании даты-с-временем в число или наоборот, дате-с-временем соответствует число секунд от начала unix эпохи. - -Форматы даты и даты-с-временем для функций toDate/toDateTime определены следующим образом: -%% -YYYY-MM-DD -YYYY-MM-DD hh:mm:ss -%% - -В качестве исключения, если делается преобразование из числа типа UInt32, Int32, UInt64, Int64 в Date, и если число больше или равно 65536, то число рассматривается как unix timestamp (а не как число дней) и округляется до даты. Это позволяет поддержать распространённый случай, когда пишут toDate(unix_timestamp), что иначе было бы ошибкой и требовало бы написания более громоздкого toDate(toDateTime(unix_timestamp)) - -Преобразование между датой и датой-с-временем производится естественным образом: добавлением нулевого времени или отбрасыванием времени. - -Преобразование между числовыми типами производится по тем же правилам, что и присваивание между разными числовыми типами в C++. - -Дополнительно, функция toString от аргумента типа DateTime может принимать второй аргумент String - имя тайм-зоны. Пример: %%Asia/Yekaterinburg%% В этом случае, форматирование времени производится согласно указанной тайм-зоне. - -%% -SELECT - now() AS now_local, - toString(now(), 'Asia/Yekaterinburg') AS now_yekat - -┌───────────now_local─┬─now_yekat───────────┐ -│ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ -└─────────────────────┴─────────────────────┘ -%% - -Также смотрите функцию %%toUnixTimestamp%%. - - -===toFixedString(s, N)=== - -Преобразует аргумент типа String в тип FixedString(N) (строку фиксированной длины N). N должно быть константой. -Если строка имеет меньше байт, чем N, то она дополняется нулевыми байтами справа. Если строка имеет больше байт, чем N - кидается исключение. - -===toStringCutToZero(s)=== - -Принимает аргумент типа String или FixedString. Возвращает String, вырезая содержимое строки до первого найденного нулевого байта. 
- -Пример: -%% -:) SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut - -┌─s─────────────┬─s_cut─┐ -│ foo\0\0\0\0\0 │ foo │ -└───────────────┴───────┘ - -:) SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut - -┌─s──────────┬─s_cut─┐ -│ foo\0bar\0 │ foo │ -└────────────┴───────┘ -%% - -===reinterpretAsUInt8, reinterpretAsUInt16, reinterpretAsUInt32, reinterpretAsUInt64=== -===reinterpretAsInt8, reinterpretAsInt16, reinterpretAsInt32, reinterpretAsInt64=== -===reinterpretAsFloat32, reinterpretAsFloat64=== -===reinterpretAsDate, reinterpretAsDateTime=== - -Функции принимают строку и интерпретируют байты, расположенные в начале строки, как число в host order (little endian). Если строка имеет недостаточную длину, то функции работают так, как будто строка дополнена необходимым количеством нулевых байт. Если строка длиннее, чем нужно, то лишние байты игнорируются. Дата интерпретируется, как число дней с начала unix-эпохи, а дата-с-временем - как число секунд с начала unix-эпохи. - -===reinterpretAsString=== - -Функция принимает число или дату или дату-с-временем и возвращает строку, содержащую байты, представляющие соответствующее значение в host order (little endian). При этом, отбрасываются нулевые байты с конца. Например, значение 255 типа UInt32 будет строкой длины 1 байт. - -===CAST(x, t)=== - -Преобразует x в тип данных t. -Поддерживается также синтаксис %%CAST(x AS t)%%. - -Пример: -%% -SELECT - '2016-06-15 23:00:00' AS timestamp, - CAST(timestamp AS DateTime) AS datetime, - CAST(timestamp AS Date) AS date, - CAST(timestamp, 'String') AS string, - CAST(timestamp, 'FixedString(22)') AS fixed_string - -┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ -│ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ -└─────────────────────┴─────────────────────┴────────────┴─────────────────────┴───────────────────────────┘ -%% - -Преобразование в FixedString(N) работает только для аргументов типа String или FixedString(N). - - -==Функции для работы с датами и временем== - -===Поддержка часовых поясов=== -Все функции по работе с датой и временем, для которых это имеет смысл, могут принимать второй, необязательный аргумент - имя часового пояса. Пример: %%Asia/Yekaterinburg%%. В этом случае, они используют не локальный часовой пояс (по умолчанию), а указанный. - -%% -SELECT - toDateTime('2016-06-15 23:00:00') AS time, - toDate(time) AS date_local, - toDate(time, 'Asia/Yekaterinburg') AS date_yekat, - toString(time, 'US/Samoa') AS time_samoa - -┌────────────────time─┬─date_local─┬─date_yekat─┬─time_samoa──────────┐ -│ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-16 │ 2016-06-15 09:00:00 │ -└─────────────────────┴────────────┴────────────┴─────────────────────┘ -%% - -Поддерживаются только часовые пояса, отличающиеся от UTC на целое число часов. - - -===toYear=== -Переводит дату или дату-с-временем в число типа UInt16, содержащее номер года (AD). - -===toMonth=== -Переводит дату или дату-с-временем в число типа UInt8, содержащее номер месяца (1-12). - -===toDayOfMonth=== -Переводит дату или дату-с-временем в число типа UInt8, содержащее номер дня в месяце (1-31). - -===toDayOfWeek=== -Переводит дату или дату-с-временем в число типа UInt8, содержащее номер дня в неделе (понедельник - 1, воскресенье - 7). - -===toHour=== -Переводит дату-с-временем в число типа UInt8, содержащее номер часа в сутках (0-23). 
-Функция исходит из допущения, что перевод стрелок вперёд, если осуществляется, то на час, в два часа ночи, а перевод стрелок назад, если осуществляется, то на час, в три часа ночи (что, в общем, не верно - даже в Москве два раза перевод стрелок был осуществлён в другое время). - -===toMinute=== -Переводит дату-с-временем в число типа UInt8, содержащее номер минуты в часе (0-59). - -===toSecond=== -Переводит дату-с-временем в число типа UInt8, содержащее номер секунды в минуте (0-59). -Секунды координации не учитываются. - -===toMonday=== -Округляет дату или дату-с-временем вниз до ближайшего понедельника. -Возвращается дата. - -===toStartOfMonth=== -Округляет дату или дату-с-временем вниз до первого дня месяца. -Возвращается дата. - -===toStartOfQuarter=== -Округляет дату или дату-с-временем вниз до первого дня квартала. -Первый день квартала - это одно из 1 января, 1 апреля, 1 июля, 1 октября. -Возвращается дата. - -===toStartOfYear=== -Округляет дату или дату-с-временем вниз до первого дня года. -Возвращается дата. - -===toStartOfMinute=== -Округляет дату-с-временем вниз до начала минуты. - -===toStartOfFiveMinute=== -Округляет дату-с-временем вниз до начала пятиминутного интервала. - -Замечание: если вам нужно округлить дату-с-временем до какого-либо другого количества секунд, минут или часов, вы можете перевести её в число с помощью функции %%toUInt32%%, затем округлить число с помощью функции %%intDiv%% и умножения, а затем перевести обратно, с помощью функции %%toDateTime%%. - -===toStartOfHour=== -Округляет дату-с-временем вниз до начала часа. - -===toTime=== -Переводит дату-с-временем на некоторую фиксированную дату, сохраняя при этом время. - -===toRelativeYearNum=== -Переводит дату-с-временем или дату в номер года, начиная с некоторого фиксированного момента в прошлом. - -===toRelativeMonthNum=== -Переводит дату-с-временем или дату в номер месяца, начиная с некоторого фиксированного момента в прошлом. - -===toRelativeWeekNum=== -Переводит дату-с-временем или дату в номер недели, начиная с некоторого фиксированного момента в прошлом. - -===toRelativeDayNum=== -Переводит дату-с-временем или дату в номер дня, начиная с некоторого фиксированного момента в прошлом. - -===toRelativeHourNum=== -Переводит дату-с-временем в номер часа, начиная с некоторого фиксированного момента в прошлом. - -===toRelativeMinuteNum=== -Переводит дату-с-временем в номер минуты, начиная с некоторого фиксированного момента в прошлом. - -===toRelativeSecondNum=== -Переводит дату-с-временем в номер секунды, начиная с некоторого фиксированного момента в прошлом. - -===now=== -Принимает ноль аргументов и возвращает текущее время на один из моментов выполнения запроса. -Функция возвращает константу, даже если запрос выполнялся долго. - -===today=== -Принимает ноль аргументов и возвращает текущую дату на один из моментов выполнения запроса. -То же самое, что toDate(now()) - -===yesterday=== -Принимает ноль аргументов и возвращает вчерашнюю дату на один из моментов выполнения запроса. -Делает то же самое, что today() - 1. - -===timeSlot=== -Округляет время до получаса. -Эта функция является специфичной для Яндекс.Метрики, так как пол часа - минимальное время, для которого, если соседние по времени хиты одного посетителя на одном счётчике отстоят друг от друга строго более, чем на это время, визит может быть разбит на два визита. То есть, кортежи (номер счётчика, идентификатор посетителя, тайм-слот) могут использоваться для поиска хитов, входящий в соответствующий визит. 
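Putting a few of the rounding functions together, including the toUInt32 / intDiv / toDateTime trick mentioned above for rounding to an arbitrary interval (here, 600-second buckets); the timestamp is an arbitrary constant:

%%
SELECT
    toDateTime('2014-03-17 16:38:10') AS t,
    toMonday(t) AS monday,
    toStartOfHour(t) AS hour_start,
    timeSlot(t) AS half_hour,
    toDateTime(intDiv(toUInt32(t), 600) * 600) AS ten_minute_slot
%%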
- -===timeSlots(StartTime, Duration)=== -Для интервала времени, начинающегося в StartTime и продолжающегося Duration секунд, возвращает массив моментов времени, состоящий из округлений вниз до получаса точек из этого интервала. -Например, %%timeSlots(toDateTime('2012-01-01 12:20:00'), toUInt32(600)) = [toDateTime('2012-01-01 12:00:00'), toDateTime('2012-01-01 12:30:00')]%%. -Это нужно для поиска хитов, входящих в соответствующий визит. - - -==Функции для работы со строками== - -===empty=== -Возвращает 1 для пустой строки, и 0 для непустой строки. -Тип результата - UInt8. -Строка считается непустой, если содержит хотя бы один байт, пусть даже это пробел или нулевой байт. -Функция также работает для массивов. - -===notEmpty=== -Возвращает 0 для пустой строки, и 1 для непустой строки. -Тип результата - UInt8. -Функция также работает для массивов. - -===length=== -Возвращает длину строки в байтах (не символах, не кодовых точках). -Тип результата - UInt64. -Функция также работает для массивов. - -===lengthUTF8=== -Возвращает длину строки в кодовых точках Unicode (не символах), при допущении, что строка содержит набор байт, являющийся текстом в кодировке UTF-8. Если допущение не выполнено - то возвращает какой-нибудь результат (не кидает исключение). -Тип результата - UInt64. - -===lower=== -Переводит ASCII-символы латиницы в строке в нижний регистр. - -===upper=== -Переводит ASCII-символы латиницы в строке в верхний регистр. - -===lowerUTF8=== -Переводит строку в нижний регистр, при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. -Не учитывает язык. То есть, для турецкого языка, результат может быть не совсем верным. -Если длина UTF-8 последовательности байт различна для верхнего и нижнего регистра кодовой точки, то для этой кодовой точки, результат работы может быть некорректным. -Если строка содержит набор байт, не являющийся UTF-8, то поведение не определено. - -===upperUTF8=== -Переводит строку в верхний регистр, при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. -Не учитывает язык. То есть, для турецкого языка, результат может быть не совсем верным. -Если длина UTF-8 последовательности байт различна для верхнего и нижнего регистра кодовой точки, то для этой кодовой точки, результат работы может быть некорректным. -Если строка содержит набор байт, не являющийся UTF-8, то поведение не определено. - -===reverse=== -Разворачивает строку (как последовательность байт). - -===reverseUTF8=== -Разворачивает последовательность кодовых точек Unicode, при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. Иначе - что-то делает (не кидает исключение). - -===concat(s1, s2, ...)=== -Склеивает строки, перечисленные в аргументах, без разделителей. - -===substring(s, offset, length)=== -Возвращает подстроку, начиная с байта по индексу offset, длины length байт. Индексация символов - начиная с единицы (как в стандартном SQL). Аргументы offset и length должны быть константами. - -===substringUTF8(s, offset, length)=== -Так же, как substring, но для кодовых точек Unicode. Работает при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. Если допущение не выполнено - то возвращает какой-нибудь результат (не кидает исключение). - -===appendTrailingCharIfAbsent(s, c)=== -Если строка %%s%% непустая и не содержит символ %%c%% на конце, то добавляет символ %%c%% в конец. - -===convertCharset(s, from, to)=== -Возвращает сконвертированную из кодировки from в кодировку to строку s. 
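A combined sketch of several of the string functions above; the values are arbitrary, and length counts bytes while lengthUTF8 counts Unicode code points, as described:

%%
SELECT
    concat('Hello', ', ', 'world') AS s,
    length(s) AS bytes,
    lengthUTF8('привет') AS code_points,
    lower('ClickHouse') AS lowered,
    substring(s, 1, 5) AS first_five,
    appendTrailingCharIfAbsent('path/to/dir', '/') AS with_slash
%%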
- -==Функции поиска в строках== - -Во всех функциях, поиск регистрозависимый. -Во всех функциях, подстрока для поиска или регулярное выражение, должно быть константой. - -===position(haystack, needle)=== -Поиск подстроки needle в строке haystack. -Возвращает позицию (в байтах) найденной подстроки, начиная с 1, или 0, если подстрока не найдена. -Есть также функция positionCaseInsensitive. - -===positionUTF8(haystack, needle)=== -Так же, как position, но позиция возвращается в кодовых точках Unicode. Работает при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. Если допущение не выполнено - то возвращает какой-нибудь результат (не кидает исключение). -Есть также функция positionCaseInsensitiveUTF8. - -===match(haystack, pattern)=== -Проверка строки на соответствие регулярному выражению pattern. Регулярное выражение re2. -Возвращает 0 (если не соответствует) или 1 (если соответствует). - -Обратите внимание, что для экранирования в регулярном выражении, используется символ %%\%% (обратный слеш). Этот же символ используется для экранирования в строковых литералах. Поэтому, чтобы экранировать символ в регулярном выражении, необходимо написать в строковом литерале %%\\%% (два обратных слеша). - -Регулярное выражение работает со строкой как с набором байт. Регулярное выражение не может содержать нулевые байты. -Для шаблонов на поиск подстроки в строке, лучше используйте LIKE или position, так как они работают существенно быстрее. - -===extract(haystack, pattern)=== -Извлечение фрагмента строки по регулярному выражению. Если haystack не соответствует регулярному выражению pattern, то возвращается пустая строка. Если регулярное выражение не содержит subpattern-ов, то вынимается фрагмент, который подпадает под всё регулярное выражение. Иначе вынимается фрагмент, который подпадает под первый subpattern. - -===extractAll(haystack, pattern)=== -Извлечение всех фрагментов строки по регулярному выражению. Если haystack не соответствует регулярному выражению pattern, то возвращается пустая строка. Возвращается массив строк, состоящий из всех соответствий регулярному выражению. В остальном, поведение аналогично функции extract (по прежнему, вынимается первый subpattern, или всё выражение, если subpattern-а нет). - -===like(haystack, pattern), оператор haystack LIKE pattern=== -Проверка строки на соответствие простому регулярному выражению. -Регулярное выражение может содержать метасимволы %%%%% и %%_%%. -%%%%% обозначает любое количество любых байт (в том числе, нулевое количество символов). -%%_%% обозначает один любой байт. - -Для экранирования метасимволов, используется символ %%\%% (обратный слеш). Смотрите замечание об экранировании в описании функции match. - -Для регулярных выражений вида %%%needle%%% действует более оптимальный код, который работает также быстро, как функция position. -Для остальных регулярных выражений, код аналогичен функции match. - -===notLike(haystack, pattern), оператор haystack NOT LIKE pattern=== -То же, что like, но с отрицанием. - - -==Функции поиска и замены в строках== - -===replaceOne(haystack, pattern, replacement)=== -Замена первого вхождения, если такое есть, подстроки pattern в haystack на подстроку replacement. -Здесь и далее, pattern и replacement должны быть константами. - -===replaceAll(haystack, pattern, replacement)=== -Замена всех вхождений подстроки pattern в haystack на подстроку replacement. - -===replaceRegexpOne(haystack, pattern, replacement)=== -Замена по регулярному выражению pattern. Регулярное выражение re2. 
-Заменяется только первое вхождение, если есть. -В качестве replacement может быть указан шаблон для замен. Этот шаблон может включать в себя подстановки \0-\9. -Подстановка \0 - вхождение регулярного выражения целиком. Подстановки \1-\9 - соответствующие по номеру subpattern-ы. -Для указания символа \ в шаблоне, он должен быть экранирован с помощью символа \. -Также помните о том, что строковый литерал требует ещё одно экранирование. - -Пример 1. Переведём дату в американский формат: - -%% -SELECT DISTINCT - EventDate, - replaceRegexpOne(toString(EventDate), '(\\d{4})-(\\d{2})-(\\d{2})', '\\2/\\3/\\1') AS res -FROM test.hits -LIMIT 7 -FORMAT TabSeparated - -2014-03-17 03/17/2014 -2014-03-18 03/18/2014 -2014-03-19 03/19/2014 -2014-03-20 03/20/2014 -2014-03-21 03/21/2014 -2014-03-22 03/22/2014 -2014-03-23 03/23/2014 -%% - -Пример 2. Размножить строку десять раз: - -%% -SELECT replaceRegexpOne('Hello, World!', '.*', '\\0\\0\\0\\0\\0\\0\\0\\0\\0\\0') AS res - -┌─res────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World!Hello, World! │ -└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -%% - -===replaceRegexpAll(haystack, pattern, replacement)=== -То же самое, но делается замена всех вхождений. Пример: - -%% -SELECT replaceRegexpAll('Hello, World!', '.', '\\0\\0') AS res - -┌─res────────────────────────┐ -│ HHeelllloo,, WWoorrlldd!! │ -└────────────────────────────┘ -%% - -В качестве исключения, если регулярное выражение сработало на пустой подстроке, то замена делается не более одного раза. Пример: - -%% -SELECT replaceRegexpAll('Hello, World!', '^', 'here: ') AS res - -┌─res─────────────────┐ -│ here: Hello, World! │ -└─────────────────────┘ -%% - -==Функции по работе с массивами== - -===empty=== -Возвращает 1 для пустого массива, и 0 для непустого массива. -Тип результата - UInt8. -Функция также работает для строк. - -===notEmpty=== -Возвращает 0 для пустого массива, и 1 для непустого массива. -Тип результата - UInt8. -Функция также работает для строк. - -===length=== -Возвращает количество элементов в массиве. -Тип результата - UInt64. -Функция также работает для строк. - -===emptyArrayUInt8, emptyArrayUInt16, emptyArrayUInt32, emptyArrayUInt64=== -===emptyArrayInt8, emptyArrayInt16, emptyArrayInt32, emptyArrayInt64=== -===emptyArrayFloat32, emptyArrayFloat64=== -===emptyArrayDate, emptyArrayDateTime=== -===emptyArrayString=== -Принимает ноль аргументов и возвращает пустой массив соответствующего типа. - -===emptyArrayToSingle=== -Принимает пустой массив и возвращает массив из одного элемента, равного значению по умолчанию. - -===range(N)=== -Возвращает массив чисел от 0 до N-1. -На всякий случай, если на блок данных, создаются массивы суммарной длины больше 100 000 000 элементов, то кидается исключение. - -===array(x1, ...), оператор [x1, ...]=== -Создаёт массив из аргументов функции. -Аргументы должны быть константами и иметь типы, для которых есть наименьший общий тип. Должен быть передан хотя бы один аргумент, так как иначе непонятно, какого типа создавать массив. То есть, с помощью этой функции невозможно создать пустой массив (для этого используйте функции emptyArray*, описанные выше). -Возвращает результат типа Array(T), где T - наименьший общий тип от переданных аргументов. 
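-
-Иллюстративный пример (набросок): %%array(1, 2.5)%% (или, эквивалентно, %%[1, 2.5]%%) вернёт массив %%[1,2.5]%% типа Array(Float64), так как наименьшим общим типом для UInt8 и Float64 является Float64.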
- -===arrayElement(arr, n), оператор arr[n]=== -Достаёт элемент с индексом n из массива arr. -n должен быть любым целочисленным типом. -Индексы в массиве начинаются с единицы. -Поддерживаются отрицательные индексы - в этом случае, будет выбран соответствующий по номеру элемент с конца. Например, arr[-1] - последний элемент массива. - -Если индекс выходит за границы массива, то -- если оба аргумента - константы, то кидается исключение; -- иначе, возвращается некоторое значение по умолчанию (0 для чисел, пустая строка для строк и т. п.). - -===has(arr, elem)=== -Проверяет наличие элемента elem в массиве arr. -Возвращает 0, если элемента в массиве нет, или 1, если есть. -elem должен быть константой. - -===indexOf(arr, x)=== -Возвращает индекс элемента x (начиная с 1), если он есть в массиве, или 0, если его нет. - -===countEqual(arr, x)=== -Возвращает количество элементов массива, равных x. Эквивалентно arrayCount(elem -> elem = x, arr). - -===arrayEnumerate(arr)=== -Возвращает массив %%[1, 2, 3, ..., length(arr)]%% - -Эта функция обычно используется совместно с ARRAY JOIN. Она позволяет, после применения ARRAY JOIN, посчитать что-либо только один раз для каждого массива. Пример: - -%% -SELECT - count() AS Reaches, - countIf(num = 1) AS Hits -FROM test.hits -ARRAY JOIN - GoalsReached, - arrayEnumerate(GoalsReached) AS num -WHERE CounterID = 160656 -LIMIT 10 - -┌─Reaches─┬──Hits─┐ -│ 95606 │ 31406 │ -└─────────┴───────┘ -%% - -В этом примере, Reaches - число достижений целей (строк, получившихся после применения ARRAY JOIN), а Hits - число хитов (строк, которые были до ARRAY JOIN). В данном случае, тот же результат можно получить проще: - -%% -SELECT - sum(length(GoalsReached)) AS Reaches, - count() AS Hits -FROM test.hits -WHERE (CounterID = 160656) AND notEmpty(GoalsReached) - -┌─Reaches─┬──Hits─┐ -│ 95606 │ 31406 │ -└─────────┴───────┘ -%% - -Также эта функция может быть использована в функциях высшего порядка. Например, с её помощью можно достать индексы массива для элементов, удовлетворяющих некоторому условию. - -===arrayEnumerateUniq(arr, ...)=== -Возвращает массив, такого же размера, как исходный, где для каждого элемента указано, какой он по счету среди элементов с таким же значением. -Например: %%arrayEnumerateUniq([10, 20, 10, 30]) = [1, 1, 2, 1]%%. - -Эта функция полезна при использовании ARRAY JOIN и агрегации по элементам массива. Пример: - -%% -SELECT - Goals.ID AS GoalID, - sum(Sign) AS Reaches, - sumIf(Sign, num = 1) AS Visits -FROM test.visits -ARRAY JOIN - Goals, - arrayEnumerateUniq(Goals.ID) AS num -WHERE CounterID = 160656 -GROUP BY GoalID -ORDER BY Reaches DESC -LIMIT 10 - -┌──GoalID─┬─Reaches─┬─Visits─┐ -│ 53225 │ 3214 │ 1097 │ -│ 2825062 │ 3188 │ 1097 │ -│ 56600 │ 2803 │ 488 │ -│ 1989037 │ 2401 │ 365 │ -│ 2830064 │ 2396 │ 910 │ -│ 1113562 │ 2372 │ 373 │ -│ 3270895 │ 2262 │ 812 │ -│ 1084657 │ 2262 │ 345 │ -│ 56599 │ 2260 │ 799 │ -│ 3271094 │ 2256 │ 812 │ -└─────────┴─────────┴────────┘ -%% - -В этом примере, для каждого идентификатора цели, посчитано количество достижений целей (каждый элемент вложенной структуры данных Goals является достижением целей) и количество визитов. Если бы не было ARRAY JOIN, мы бы считали количество визитов как %%sum(Sign)%%. Но в данном случае, строчки были размножены по вложенной структуре Goals, и чтобы после этого учесть каждый визит один раз, мы поставили условие на значение функции %%arrayEnumerateUniq(Goals.ID)%%. - -Функция arrayEnumerateUniq может принимать несколько аргументов - массивов одинаковых размеров. 
В этом случае, уникальность считается для кортежей элементов на одинаковых позициях всех массивов. - -%% -SELECT arrayEnumerateUniq([1, 1, 1, 2, 2, 2], [1, 1, 2, 1, 1, 2]) AS res - -┌─res───────────┐ -│ [1,2,1,1,2,1] │ -└───────────────┘ -%% - -Это нужно при использовании ARRAY JOIN с вложенной структурой данных и затем агрегации по нескольким элементам этой структуры. - - -===arrayUniq(arr, ...)=== - -Если передан один аргумент, считает количество разных элементов в массиве. -Если передано несколько аргументов, считает количество разных кортежей из элементов на соответствующих позициях в нескольких массивах. - -Если необходимо получить список уникальных элементов массива, можно воспользоваться %%arrayReduce('groupUniqArray', arr)%%. - - -===arrayJoin(arr)=== -Особенная функция. Смотрите раздел "Функция arrayJoin". - - -==Функции высшего порядка== - -===Оператор %%->%%, функция lambda(params, expr)=== -Позволяет описать лямбда-функцию для передачи в функцию высшего порядка. Слева от стрелочки стоит формальный параметр - произвольный идентификатор, или несколько формальных параметров - произвольные идентификаторы в кортеже. Справа от стрелочки стоит выражение, в котором могут использоваться эти формальные параметры, а также любые столбцы таблицы. - -Примеры: %%x -> 2 * x%%, %%str -> str != Referer%%. - -Функции высшего порядка, в качестве своего функционального аргумента могут принимать только лямбда-функции. - -В функции высшего порядка может быть передана лямбда-функция, принимающая несколько аргументов. В этом случае, в функцию высшего порядка передаётся несколько массивов одинаковых длин, которым эти аргументы будут соответствовать. - -Для всех функций кроме arrayMap, arrayFilter, первый аргумент (лямбда-функция) может отсутствовать. В этом случае, подразумевается тождественное отображение. - -===arrayMap(func, arr1, ...)=== -Вернуть массив, полученный из исходного применением функции func к каждому элементу массива arr. - -===arrayFilter(func, arr1, ...)=== -Вернуть массив, содержащий только те элементы массива arr1, для которых функция func возвращает не 0. - -Примеры: - -%% -SELECT arrayFilter(x -> x LIKE '%World%', ['Hello', 'abc World']) AS res - -┌─res───────────┐ -│ ['abc World'] │ -└───────────────┘ - -SELECT - arrayFilter( - (i, x) -> x LIKE '%World%', - arrayEnumerate(arr), - ['Hello', 'abc World'] AS arr) - AS res - -┌─res─┐ -│ [2] │ -└─────┘ -%% - -===arrayCount([func,] arr1, ...)=== -Вернуть количество элементов массива arr, для которых функция func возвращает не 0. Если func не указана - вернуть количество ненулевых элементов массива. - -===arrayExists([func,] arr1, ...)=== -Вернуть 1, если существует хотя бы один элемент массива arr, для которого функция func возвращает не 0. Иначе вернуть 0. - -===arrayAll([func,] arr1, ...)=== -Вернуть 1, если для всех элементов массива arr, функция func возвращает не 0. Иначе вернуть 0. - -===arraySum([func,] arr1, ...)=== -Вернуть сумму значений функции func. Если функция не указана - просто вернуть сумму элементов массива. - -===arrayFirst(func, arr1, ...)=== -Вернуть первый элемент массива arr1, для которого функция func возвращает не 0. - -===arrayFirstIndex(func, arr1, ...)=== -Вернуть индекс первого элемента массива arr1, для которого функция func возвращает не 0. - - -==Функции разбиения и слияния строк и массивов== - -===splitByChar(separator, s)=== -Разбивает строку на подстроки, используя в качестве разделителя separator. -separator должен быть константной строкой из ровно одного символа. 
-Возвращается массив выделенных подстрок. Могут выделяться пустые подстроки, если разделитель идёт в начале или в конце строки, или если идёт более одного разделителя подряд. - -===splitByString(separator, s)=== -То же самое, но использует строку из нескольких символов в качестве разделителя. Строка должна быть непустой. - -===arrayStringConcat(arr[, separator])=== -Склеивает строки, перечисленные в массиве, с разделителем separator. -separator - необязательный параметр, константная строка, по умолчанию равен пустой строке. -Возвращается строка. - -===alphaTokens(s)=== -Выделяет подстроки из подряд идущих байт из диапазонов a-z и A-Z. -Возвращается массив выделенных подстрок. - - -==Функции для работы с URL== - -Все функции работают не по RFC - то есть, максимально упрощены ради производительности. - -===Функции, извлекающие часть URL-а.=== - -Если в URL-е нет ничего похожего, то возвращается пустая строка. - -

-====protocol====

    -Возвращает протокол. Примеры: http, ftp, mailto, magnet... - -

-====domain====

    -Возвращает домен. - -

-====domainWithoutWWW====

    -Возвращает домен, удалив не более одного 'www.' с начала, если есть. - -

-====topLevelDomain====

    -Возвращает домен верхнего уровня. Пример: .ru. - -

-====firstSignificantSubdomain====

    -Возвращает "первый существенный поддомен". Это понятие является нестандартным и специфично для Яндекс.Метрики. -Первый существенный поддомен - это домен второго уровня, если он не равен одному из com, net, org, co, или домен третьего уровня, иначе. -Например, firstSignificantSubdomain('https://news.yandex.ru/') = 'yandex', firstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex'. -Список "несущественных" доменов второго уровня и другие детали реализации могут изменяться в будущем. - -

-====cutToFirstSignificantSubdomain====

    -Возвращает часть домена, включающую поддомены верхнего уровня до "первого существенного поддомена" (см. выше). -Например, cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'. - -

-====path====

    -Возвращает путь. Пример: /top/news.html -Путь не включает в себя query string. - -

-====pathFull====

    -То же самое, но включая query string и fragment. Пример: /top/news.html?page=2#comments - -

-====queryString====

    -Возвращает query-string. Пример: page=1&lr=213. -query-string не включает в себя начальный знак вопроса, а также # и всё, что после #. - -

-====fragment====

    -Возвращает fragment identifier. -fragment не включает в себя начальный символ решётки. - -

-====queryStringAndFragment====

    -Возвращает query string и fragment identifier. Пример: страница=1#29390. - -
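-Иллюстративный набросок для перечисленных выше функций (URL условный):
-%%protocol('https://example.com/top/news.html?page=2#comments') = 'https'%%
-%%domain('https://example.com/top/news.html?page=2#comments') = 'example.com'%%
-%%path('https://example.com/top/news.html?page=2#comments') = '/top/news.html'%%
-%%queryString('https://example.com/top/news.html?page=2#comments') = 'page=2'%%
-%%fragment('https://example.com/top/news.html?page=2#comments') = 'comments'%%
-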

-====extractURLParameter(URL, name)====

    -Возвращает значение параметра name в URL, если такой есть; или пустую строку, иначе; если параметров с таким именем много - вернуть первый попавшийся. Функция работает при допущении, что имя параметра закодировано в URL в точности таким же образом, что и в переданном аргументе. - -

-====extractURLParameters(URL)====

    -Возвращает массив строк вида name=value, соответствующих параметрам URL. Значения никак не декодируются. - -

-====extractURLParameterNames(URL)====

    -Возвращает массив строк вида name, соответствующих именам параметров URL. Значения никак не декодируются. - -
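-Иллюстративный набросок для функций extractURLParameter* (URL условный):
-%%extractURLParameter('http://example.com/?page=1&lr=213', 'lr') = '213'%%
-%%extractURLParameters('http://example.com/?page=1&lr=213') = ['page=1','lr=213']%%
-%%extractURLParameterNames('http://example.com/?page=1&lr=213') = ['page','lr']%%
-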

-====URLHierarchy(URL)====

    -Возвращает массив, содержащий URL, обрезанный с конца по символам %%/%%, %%?%% в пути и query-string. Подряд идущие символы-разделители считаются за один. Резка производится в позиции после всех подряд идущих символов-разделителей. Пример: - -
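-(набросок, приведён по аналогии с примером для URLPathHierarchy ниже)
-%%
-URLHierarchy('https://example.com/browse/CONV-6788') =
-[
-    'https://example.com/',
-    'https://example.com/browse/',
-    'https://example.com/browse/CONV-6788'
-]
-%%
-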

-====URLPathHierarchy(URL)====

    -То же самое, но без протокола и хоста в результате. Элемент / (корень) не включается. Пример: - -Функция используется для реализации древовидных отчётов по URL в Яндекс.Метрике. - -%% -URLPathHierarchy('https://example.com/browse/CONV-6788') = -[ - '/browse/', - '/browse/CONV-6788' -] -%% - -

-====decodeURLComponent(URL)====

    -Возвращает декодированный URL. - -Пример: -%% -:) SELECT decodeURLComponent('http://127.0.0.1:8123/?query=SELECT%201%3B') AS DecodedURL; - -┌─DecodedURL─────────────────────────────┐ -│ http://127.0.0.1:8123/?query=SELECT 1; │ -└────────────────────────────────────────┘ -%% - -===Функции, удаляющие часть из URL-а.=== - -Если в URL-е нет ничего похожего, то URL остаётся без изменений. - -

-====cutWWW====

    -Удаляет не более одного 'www.' с начала домена URL-а, если есть. - -

-====cutQueryString====

    -Удаляет query string. Знак вопроса тоже удаляется. - -

-====cutFragment====

    -Удаляет fragment identifier. Символ решётки тоже удаляется. - -

-====cutQueryStringAndFragment====

    -Удаляет query string и fragment identifier. Знак вопроса и символ решётки тоже удаляются. - -
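-Иллюстративный набросок для функций cut* (URL условный):
-%%cutWWW('http://www.example.com/path?page=1#frag') = 'http://example.com/path?page=1#frag'%%
-%%cutQueryString('http://example.com/path?page=1#frag') = 'http://example.com/path#frag'%%
-%%cutFragment('http://example.com/path?page=1#frag') = 'http://example.com/path?page=1'%%
-%%cutQueryStringAndFragment('http://example.com/path?page=1#frag') = 'http://example.com/path'%%
-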

-====cutURLParameter(URL, name)====

    -Удаляет параметр URL с именем name, если такой есть. Функция работает при допущении, что имя параметра закодировано в URL в точности таким же образом, что и в переданном аргументе. - - -==Функции для работы с IP-адресами== - -===IPv4NumToString(num)=== -Принимает число типа UInt32. Интерпретирует его, как IPv4-адрес в big endian. Возвращает строку, содержащую соответствующий IPv4-адрес в формате A.B.C.D (числа в десятичной форме через точки). - -===IPv4StringToNum(s)=== -Функция, обратная к IPv4NumToString. Если IPv4 адрес в неправильном формате, то возвращает 0. - -===IPv4NumToStringClassC(num)=== -Похоже на IPv4NumToString, но вместо последнего октета используется %%xxx%%. Пример: - -%% -SELECT - IPv4NumToStringClassC(ClientIP) AS k, - count() AS c -FROM test.hits -GROUP BY k -ORDER BY c DESC -LIMIT 10 - -┌─k──────────────┬─────c─┐ -│ 83.149.9.xxx │ 26238 │ -│ 217.118.81.xxx │ 26074 │ -│ 213.87.129.xxx │ 25481 │ -│ 83.149.8.xxx │ 24984 │ -│ 217.118.83.xxx │ 22797 │ -│ 78.25.120.xxx │ 22354 │ -│ 213.87.131.xxx │ 21285 │ -│ 78.25.121.xxx │ 20887 │ -│ 188.162.65.xxx │ 19694 │ -│ 83.149.48.xxx │ 17406 │ -└────────────────┴───────┘ -%% - -В связи с тем, что использование xxx весьма необычно, это может быть изменено в дальнейшем, и вам не следует полагаться на конкретный вид этого фрагмента. - -===IPv6NumToString(x)=== -Принимает значение типа FixedString(16), содержащее IPv6-адрес в бинарном виде. Возвращает строку, содержащую этот адрес в текстовом виде. -IPv6-mapped IPv4 адреса выводится в формате %%::ffff:111.222.33.44%%. Примеры: - -%% -SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr - -┌─addr─────────┐ -│ 2a02:6b8::11 │ -└──────────────┘ -%% - -%% -SELECT - IPv6NumToString(ClientIP6 AS k), - count() AS c -FROM hits_all -WHERE EventDate = today() AND substring(ClientIP6, 1, 12) != unhex('00000000000000000000FFFF') -GROUP BY k -ORDER BY c DESC -LIMIT 10 - -┌─IPv6NumToString(ClientIP6)──────────────┬─────c─┐ -│ 2a02:2168:aaa:bbbb::2 │ 24695 │ -│ 2a02:2698:abcd:abcd:abcd:abcd:8888:5555 │ 22408 │ -│ 2a02:6b8:0:fff::ff │ 16389 │ -│ 2a01:4f8:111:6666::2 │ 16016 │ -│ 2a02:2168:888:222::1 │ 15896 │ -│ 2a01:7e00::ffff:ffff:ffff:222 │ 14774 │ -│ 2a02:8109:eee:ee:eeee:eeee:eeee:eeee │ 14443 │ -│ 2a02:810b:8888:888:8888:8888:8888:8888 │ 14345 │ -│ 2a02:6b8:0:444:4444:4444:4444:4444 │ 14279 │ -│ 2a01:7e00::ffff:ffff:ffff:ffff │ 13880 │ -└─────────────────────────────────────────┴───────┘ -%% - -%% -SELECT - IPv6NumToString(ClientIP6 AS k), - count() AS c -FROM hits_all -WHERE EventDate = today() -GROUP BY k -ORDER BY c DESC -LIMIT 10 - -┌─IPv6NumToString(ClientIP6)─┬──────c─┐ -│ ::ffff:94.26.111.111 │ 747440 │ -│ ::ffff:37.143.222.4 │ 529483 │ -│ ::ffff:5.166.111.99 │ 317707 │ -│ ::ffff:46.38.11.77 │ 263086 │ -│ ::ffff:79.105.111.111 │ 186611 │ -│ ::ffff:93.92.111.88 │ 176773 │ -│ ::ffff:84.53.111.33 │ 158709 │ -│ ::ffff:217.118.11.22 │ 154004 │ -│ ::ffff:217.118.11.33 │ 148449 │ -│ ::ffff:217.118.11.44 │ 148243 │ -└────────────────────────────┴────────┘ -%% - -===IPv6StringToNum(s)=== -Функция, обратная к IPv6NumToString. Если IPv6 адрес в неправильном формате, то возвращает строку из нулевых байт. -HEX может быть в любом регистре. - - -==Функции генерации псевдослучайных чисел== - -Используются некриптографические генераторы псевдослучайных чисел. - -Все функции принимают ноль аргументов или один аргумент. -В случае, если передан аргумент - он может быть любого типа, и его значение никак не используется. 
-Этот аргумент нужен только для того, чтобы предотвратить склейку одинаковых выражений - чтобы две разные записи одной функции возвращали разные столбцы, с разными случайными числами. - -===rand=== -Возвращает псевдослучайное число типа UInt32, равномерно распределённое среди всех чисел типа UInt32. -Используется linear congruential generator. - -===rand64=== -Возвращает псевдослучайное число типа UInt64, равномерно распределённое среди всех чисел типа UInt64. -Используется linear congruential generator. - - -==Функции хэширования== - -Функции хэширования могут использоваться для детерминированного псевдослучайного разбрасывания элементов. - -===halfMD5=== -Вычисляет MD5 от строки. Затем берёт первые 8 байт от хэша и интерпретирует их как UInt64 в big endian. -Принимает аргумент типа String. Возвращает UInt64. -Функция работает достаточно медленно (5 миллионов коротких строк в секунду на одном процессорном ядре). -Если вам не нужен конкретно MD5, то используйте вместо этого функцию sipHash64. - -===MD5=== -Вычисляет MD5 от строки и возвращает полученный набор байт в виде FixedString(16). -Если вам не нужен конкретно MD5, а нужен неплохой криптографический 128-битный хэш, то используйте вместо этого функцию sipHash128. -Если вы хотите получить такой же результат, как выдаёт утилита md5sum, напишите %%lower(hex(MD5(s)))%%. - -===sipHash64=== -Вычисляет SipHash от строки. -Принимает аргумент типа String. Возвращает UInt64. -SipHash - криптографическая хэш-функция. Работает быстрее чем MD5 не менее чем в 3 раза. -Подробнее смотрите по ссылке: https://131002.net/siphash/ - -===sipHash128=== -Вычисляет SipHash от строки. -Принимает аргумент типа String. Возвращает FixedString(16). -Отличается от sipHash64 тем, что финальный xor-folding состояния делается только до 128 бит. - -===cityHash64=== -Вычисляет CityHash64 от строки или похожую хэш-функцию для произвольного количества аргументов произвольного типа. -Если аргумент имеет тип String, то используется CityHash. Это быстрая некриптографическая хэш-функция неплохого качества для строк. -Если аргумент имеет другой тип, то используется implementation specific быстрая некриптографическая хэш-функция неплохого качества. -Если передано несколько аргументов, то функция вычисляется по тем же правилам, с помощью комбинации по цепочке с использованием комбинатора из CityHash. -Например, так вы можете вычислить чексумму всей таблицы с точностью до порядка строк: %%SELECT sum(cityHash64(*)) FROM table%%. - -===intHash32=== -Вычисляет 32-битный хэш-код от целого числа любого типа. -Это сравнительно быстрая некриптографическая хэш-функция среднего качества для чисел. - -===intHash64=== -Вычисляет 64-битный хэш-код от целого числа любого типа. -Работает быстрее, чем intHash32. Качество среднее. - -===SHA1=== -===SHA224=== -===SHA256=== -Вычисляет SHA-1, SHA-224, SHA-256 от строки и возвращает полученный набор байт в виде FixedString(20), FixedString(28), FixedString(32). -Функция работает достаточно медленно (SHA-1 - примерно 5 миллионов коротких строк в секунду на одном процессорном ядре, SHA-224 и SHA-256 - примерно 2.2 миллионов). -Рекомендуется использовать эти функции лишь в тех случаях, когда вам нужна конкретная хэш-функция и вы не можете её выбрать. -Даже в этих случаях, рекомендуется применять функцию оффлайн - заранее вычисляя значения при вставке в таблицу, вместо того, чтобы применять её при SELECT-ах. - -===URLHash(url[, N])=== -Быстрая некриптографическая хэш-функция неплохого качества для строки, полученной из URL путём некоторой нормализации. 
-URLHash(s) - вычислить хэш от строки без одного завершающего символа /, ? или # на конце, если такой там есть. -URLHash(s, N) - вычислить хэш от строки до N-го уровня в иерархии URL, без одного завершающего символа /, ? или # на конце, если такой там есть. -Уровни аналогичные URLHierarchy. Функция специфична для Яндекс.Метрики. - -==Функции кодирования== - -===hex=== -Принимает строку, число, дату или дату-с-временем. Возвращает строку, содержащую шестнадцатеричное представление аргумента. Используются заглавные буквы A-F. Не используются префиксы %%0x%% и суффиксы %%h%%. Для строк просто все байты кодируются в виде двух шестнадцатеричных цифр. Числа выводятся в big endian ("человеческом") формате. Для чисел вырезаются старшие нули, но только по целым байтам. Например, %%hex(1) = '01'%%. Даты кодируются как число дней с начала unix-эпохи. Даты-с-временем кодируются как число секунд с начала unix-эпохи. - -===unhex(str)=== -Принимает строку, содержащую произвольное количество шестнадцатеричных цифр, и возвращает строку, содержащую соответствующие байты. Поддерживаются как строчные, так и заглавные буквы A-F. Число шестнадцатеричных цифр не обязано быть чётным. Если оно нечётное - последняя цифра интерпретируется как младшая половинка байта 00-0F. Если строка-аргумент содержит что-либо кроме шестнадцатеричных цифр, то будет возвращён какой-либо implementation-defined результат (не кидается исключение). -Если вы хотите преобразовать результат в число, то вы можете использовать функции reverse и reinterpretAsType. - -===UUIDStringToNum(str)=== -Принимает строку, содержащую 36 символов в формате %%123e4567-e89b-12d3-a456-426655440000%%, и возвращает в виде набора байт в FixedString(16). - -===UUIDNumToString(str)=== -Принимает значение типа FixedString(16). Возвращает строку из 36 символов в текстовом виде. - -===bitmaskToList(num)=== -Принимает целое число. Возвращает строку, содержащую список степеней двойки, в сумме дающих исходное число; по возрастанию, в текстовом виде, через запятую, без пробелов. - -===bitmaskToArray(num)=== -Принимает целое число. Возвращает массив чисел типа UInt64, содержащий степени двойки, в сумме дающих исходное число; числа в массиве идут по возрастанию. - - -==Функции округления== - -===floor(x[, N])=== -Возвращает наибольшее круглое число, которое меньше или равно, чем x. -Круглым называется число, кратное 1 / 10N или ближайшее к нему число соответствующего типа данных, если 1 / 10N не представимо точно. -N - целочисленная константа, не обязательный параметр. По умолчанию - ноль, что означает - округлять до целого числа. -N может быть отрицательным. -Примеры: %%floor(123.45, 1) = 123.4%%, %%floor(123.45, -1) = 120%%. -x - любой числовой тип. Результат - число того же типа. -Для целочисленных аргументов имеет смысл округление с отрицательным значением N (для неотрицательных N, функция ничего не делает). -В случае переполнения при округлении (например, %%floor(-128, -1)%%), возвращается implementation specific результат. - -===ceil(x[, N])=== -Возвращает наименьшее круглое число, которое больше или равно, чем x. -В остальном, аналогично функции floor, см. выше. - -===round(x[, N])=== -Возвращает ближайшее к num круглое число, которое может быть меньше или больше или равно x. -Если x находится посередине от ближайших круглых чисел, то возвращается какое-либо одно из них (implementation specific). -Число -0. может считаться или не считаться круглым (implementation specific). -В остальном, аналогично функциям floor и ceil, см. выше. 
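-
-Иллюстративный набросок для round и ceil: %%round(3.14159, 2) = 3.14%%, %%round(123.45, -1) = 120%%, %%ceil(123.45, -1) = 130%%.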
- -===roundToExp2(num)=== -Принимает число. Если число меньше единицы - возвращает 0. Иначе округляет число вниз до ближайшей (целой неотрицательной) степени двух. - -===roundDuration(num)=== -Принимает число. Если число меньше единицы - возвращает 0. Иначе округляет число вниз до чисел из набора: 1, 10, 30, 60, 120, 180, 240, 300, 600, 1200, 1800, 3600, 7200, 18000, 36000. Эта функция специфична для Яндекс.Метрики и предназначена для реализации отчёта по длительности визита. - -===roundAge(num)=== -Принимает число. Если число меньше 18 - возвращает 0. Иначе округляет число вниз до чисел из набора: 18, 25, 35, 45. Эта функция специфична для Яндекс.Метрики и предназначена для реализации отчёта по возрасту посетителей. - - - -==Условные функции== - -===if(cond, then, else), оператор cond ? then : else=== - -Возвращает then, если cond != 0 или else, если cond = 0. -cond должно иметь тип UInt8, а then и else должны иметь тип, для которого есть наименьший общий тип. - - -==Математические функции== - -Все функции возвращают число типа Float64. Точность результата близка к максимально возможной, но результат может не совпадать с наиболее близким к соответствующему вещественному числу машинно представимым числом. - -===e()=== -Принимает ноль аргументов, возвращает число типа Float64, близкое к числу e. - -===pi()=== -Принимает ноль аргументов, возвращает число типа Float64, близкое к числу π. - -===exp(x)=== -Принимает числовой аргумент, возвращает число типа Float64, близкое к экспоненте от аргумента. - -===log(x)=== -Принимает числовой аргумент, возвращает число типа Float64, близкое к натуральному логарифму от аргумента. - -===exp2(x)=== -Принимает числовой аргумент, возвращает число типа Float64, близкое к 2x. - -===log2(x)=== -Принимает числовой аргумент, возвращает число типа Float64, близкое к двоичному логарифму от аргумента. - -===exp10(x)=== -Принимает числовой аргумент, возвращает число типа Float64, близкое к 10x. - -===log10(x)=== -Принимает числовой аргумент, возвращает число типа Float64, близкое к десятичному логарифму от аргумента. - -===sqrt(x)=== -Принимает числовой аргумент, возвращает число типа Float64, близкое к квадратному корню от аргумента. - -===cbrt(x)=== -Принимает числовой аргумент, возвращает число типа Float64, близкое к кубическому корню от аргумента. - -===erf(x)=== - -Если x неотрицательно, то %%erf(x / σ√2)%% - вероятность того, что случайная величина, имеющая нормальное распределение со среднеквадратичным отклонением σ, принимает значение, отстоящее от мат. ожидания больше чем на x. - -Пример (правило трёх сигм): - -%% -SELECT erf(3 / sqrt(2)) - -┌─erf(divide(3, sqrt(2)))─┐ -│ 0.9973002039367398 │ -└─────────────────────────┘ -%% - -===erfc(x)=== -Принимает числовой аргумент, возвращает число типа Float64, близкое к 1 - erf(x), но без потери точности для больших x. - -===lgamma(x)=== -Логарифм от гамма функции. - -===tgamma(x)=== -Гамма функция. - -===sin(x)=== -Синус. - -===cos(x)=== -Косинус. - -===tan(x)=== -Тангенс. - -===asin(x)=== -Арксинус. - -===acos(x)=== -Арккосинус. - -===atan(x)=== -Арктангенс. - -===pow(x, y)=== -xy. - -==Функции для работы со словарями Яндекс.Метрики== - -Чтобы указанные ниже функции работали, в конфиге сервера должны быть указаны пути и адреса для получения всех словарей Яндекс.Метрики. Словари загружаются при первом вызове любой из этих функций. Если справочники не удаётся загрузить - будет выкинуто исключение. - -О том, как создать справочники, смотрите в разделе "Словари". 
- -===Множественные геобазы=== - -ClickHouse поддерживает работу одновременно с несколькими альтернативными геобазами (иерархиями регионов), для того чтобы можно было поддержать разные точки зрения о принадлежности регионов странам. - -В конфиге clickhouse-server указывается файл с иерархией регионов: -%%<path_to_regions_hierarchy_file>/opt/geo/regions_hierarchy.txt</path_to_regions_hierarchy_file>%% - -Кроме указанного файла, рядом ищутся файлы, к имени которых (до расширения) добавлен символ _ и какой угодно суффикс. -Например, также найдётся файл %%/opt/geo/regions_hierarchy_ua.txt%%, если такой есть. - -%%ua%% называется ключом словаря. Для словаря без суффикса, ключ является пустой строкой. - -Все словари перезагружаются в рантайме (раз в количество секунд, заданное в конфигурационном параметре builtin_dictionaries_reload_interval, по умолчанию - раз в час), но перечень доступных словарей определяется один раз, при старте сервера. - -Во все функции по работе с регионами, в конце добавлен один необязательный аргумент - ключ словаря. Далее он обозначен как geobase. -Пример: -%% -regionToCountry(RegionID) - использует словарь по умолчанию: /opt/geo/regions_hierarchy.txt; -regionToCountry(RegionID, '') - использует словарь по умолчанию: /opt/geo/regions_hierarchy.txt; -regionToCountry(RegionID, 'ua') - использует словарь для ключа ua: /opt/geo/regions_hierarchy_ua.txt; -%% - -===regionToCity(id[, geobase])=== - -Принимает число типа UInt32 - идентификатор региона из геобазы Яндекса. Если регион является городом или входит в некоторый город, то возвращает идентификатор региона - соответствующего города. Иначе возвращает 0. - -===regionToArea(id[, geobase])=== - -Переводит регион в область (тип в геобазе - 5). В остальном, аналогично функции regionToCity. - -%% -SELECT DISTINCT regionToName(regionToArea(toUInt32(number), 'ua')) -FROM system.numbers -LIMIT 15 - -┌─regionToName(regionToArea(toUInt32(number), \'ua\'))─┐ -│ │ -│ Москва и Московская область │ -│ Санкт-Петербург и Ленинградская область │ -│ Белгородская область │ -│ Ивановская область │ -│ Калужская область │ -│ Костромская область │ -│ Курская область │ -│ Липецкая область │ -│ Орловская область │ -│ Рязанская область │ -│ Смоленская область │ -│ Тамбовская область │ -│ Тверская область │ -│ Тульская область │ -└──────────────────────────────────────────────────────┘ -%% - -===regionToDistrict(id[, geobase])=== - -Переводит регион в федеральный округ (тип в геобазе - 4). В остальном, аналогично функции regionToCity. - -%% -SELECT DISTINCT regionToName(regionToDistrict(toUInt32(number), 'ua')) -FROM system.numbers -LIMIT 15 - -┌─regionToName(regionToDistrict(toUInt32(number), \'ua\'))─┐ -│ │ -│ Центральный федеральный округ │ -│ Северо-Западный федеральный округ │ -│ Южный федеральный округ │ -│ Северо-Кавказский федеральный округ │ -│ Приволжский федеральный округ │ -│ Уральский федеральный округ │ -│ Сибирский федеральный округ │ -│ Дальневосточный федеральный округ │ -│ Шотландия │ -│ Фарерские острова │ -│ Фламандский регион │ -│ Брюссельский столичный регион │ -│ Валлония │ -│ Федерация Боснии и Герцеговины │ -└──────────────────────────────────────────────────────────┘ -%% - -===regionToCountry(id[, geobase])=== - -Переводит регион в страну. В остальном, аналогично функции regionToCity. -Пример: %%regionToCountry(toUInt32(213)) = 225%% - преобразовали Москву (213) в Россию (225). - -===regionToContinent(id[, geobase])=== - -Переводит регион в континент. В остальном, аналогично функции regionToCity. 
-Пример: %%regionToContinent(toUInt32(213)) = 10001%% - преобразовали Москву (213) в Евразию (10001). - -===regionToPopulation(id[, geobase])=== - -Получает население для региона. -Население может быть прописано в файлах с геобазой. Смотрите в разделе "Встроенные словари". -Если для региона не прописано население, возвращается 0. -В геобазе Яндекса, население может быть прописано для дочерних регионов, но не прописано для родительских. - -===regionIn(lhs, rhs[, geobase])=== - -Проверяет принадлежность региона lhs региону rhs. Возвращает число типа UInt8, равное 1, если принадлежит и 0, если не принадлежит. -Отношение рефлексивное - любой регион принадлежит также самому себе. - -===regionHierarchy(id[, geobase])=== - -Принимает число типа UInt32 - идентификатор региона из геобазы Яндекса. Возвращает массив идентификаторов регионов, состоящий из переданного региона и всех родителей по цепочке. -Пример: %%regionHierarchy(toUInt32(213)) = [213,1,3,225,10001,10000]%%. - -===regionToName(id[, lang])=== - -Принимает число типа UInt32 - идентификатор региона из геобазы Яндекса. Вторым аргументом может быть передана строка - название языка. Поддерживаются языки ru, en, ua, uk, by, kz, tr. Если второй аргумент отсутствует - используется язык ru. Если язык не поддерживается - кидается исключение. Возвращает строку - название региона на соответствующем языке. Если региона с указанным идентификатором не существует - возвращается пустая строка. - -ua и uk обозначают одно и то же - украинский язык. - - -==Функции для работы с внешними словарями== - -Подробнее смотрите в разделе "Внешние словари". - -===dictGetUInt8, dictGetUInt16, dictGetUInt32, dictGetUInt64=== -===dictGetInt8, dictGetInt16, dictGetInt32, dictGetInt64=== -===dictGetFloat32, dictGetFloat64=== -===dictGetDate, dictGetDateTime=== -===dictGetString=== - -%%dictGetT('dict_name', 'attr_name', id)%% -- получить из словаря dict_name значение атрибута attr_name по ключу id. -dict_name и attr_name - константные строки. -id должен иметь тип UInt64. -Если ключа id нет в словаре - вернуть значение по умолчанию, заданное в описании словаря. - -===dictGetTOrDefault=== -%%dictGetT('dict_name', 'attr_name', id, default)%% -Аналогично функциям dictGetT, но значение по умолчанию берётся из последнего аргумента функции. - -===dictIsIn=== -%%dictIsIn('dict_name', child_id, ancestor_id)%% -- для иерархического словаря dict_name - узнать, находится ли ключ child_id внутри ancestor_id (или совпадает с ancestor_id). Возвращает UInt8. - -===dictGetHierarchy=== -%%dictGetHierarchy('dict_name', id)%% -- для иерархического словаря dict_name - вернуть массив ключей словаря, начиная с id и продолжая цепочкой родительских элементов. Возвращает Array(UInt64). - -===dictHas=== -%%dictHas('dict_name', id)%% -- проверить наличие ключа в словаре. Возвращает значение типа UInt8, равное 0, если ключа нет и 1, если ключ есть. - - -==Функции для работы с JSON.== - -В Яндекс.Метрике пользователями передаётся JSON в качестве параметров визитов. Для работы с таким JSON-ом, реализованы некоторые функции. (Хотя в большинстве случаев, JSON-ы дополнительно обрабатываются заранее, и полученные значения кладутся в отдельные столбцы в уже обработанном виде.) Все эти функции исходят из сильных допущений о том, каким может быть JSON, и при этом стараются почти ничего не делать. - -Делаются следующие допущения: - -1. Имя поля (аргумент функции) должно быть константой; -2. Считается, что имя поля в JSON-е закодировано некоторым каноническим образом. 
Например, -%%visitParamHas('{"abc":"def"}', 'abc') = 1%% -, но -%%visitParamHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0%% -3. Поля ищутся на любом уровне вложенности, без разбора. Если есть несколько подходящих полей - берётся первое. -4. В JSON-е нет пробельных символов вне строковых литералов. - - -===visitParamHas(params, name)=== - -Проверить наличие поля с именем name. - -===visitParamExtractUInt(params, name)=== - -Распарсить UInt64 из значения поля с именем name. Если поле строковое - попытаться распарсить число из начала строки. Если такого поля нет, или если оно есть, но содержит не число, то вернуть 0. - -===visitParamExtractInt(params, name)=== - -Аналогично для Int64. - -===visitParamExtractFloat(params, name)=== - -Аналогично для Float64. - -===visitParamExtractBool(params, name)=== - -Распарсить значение true/false. Результат - UInt8. - -===visitParamExtractRaw(params, name)=== - -Вернуть значение поля, включая разделители. Примеры: -%%visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"'%% -%%visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}'%% - -===visitParamExtractString(params, name)=== - -Распарсить строку в двойных кавычках. У значения убирается экранирование. Если убрать экранированные символы не удалось, то возвращается пустая строка. Примеры: -%%visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0'%% -%%visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺'%% -%%visitParamExtractString('{"abc":"\\u263"}', 'abc') = ''%% -%%visitParamExtractString('{"abc":"hello}', 'abc') = ''%% -На данный момент, не поддерживаются записанные в формате \uXXXX\uYYYY кодовые точки не из basic multilingual plane (они переводятся не в UTF-8, а в CESU-8). - - -==Функции для реализации оператора IN.== - -===in, notIn, globalIn, globalNotIn=== - -Смотрите раздел "Операторы IN". - - -===tuple(x, y, ...), оператор (x, y, ...)=== -Функция, позволяющая сгруппировать несколько столбцов. -Для столбцов, имеющих типы T1, T2, ... возвращает кортеж типа Tuple(T1, T2, ...), содержащий эти столбцы. Выполнение функции ничего не стоит. -Кортежи обычно используются как промежуточное значение в качестве аргумента операторов IN, или для создания списка формальных параметров лямбда-функций. Кортежи не могут быть записаны в таблицу. - -===tupleElement(tuple, n), оператор x.N=== -Функция, позволяющая достать столбец из кортежа. -N - индекс столбца начиная с 1. N должно быть константой. N должно быть целым строго положительным числом не большим размера кортежа. -Выполнение функции ничего не стоит. - - -==Прочие функции== - -===hostName()=== -Возвращает строку - имя хоста, на котором эта функция была выполнена. При распределённой обработке запроса, это будет имя хоста удалённого сервера, если функция выполняется на удалённом сервере. - -===visibleWidth(x)=== -Вычисляет приблизительную ширину при выводе значения в текстовом (tab-separated) виде на консоль. -Функция используется системой для реализации Pretty форматов. - -===toTypeName(x)=== -Возвращает строку, содержащую имя типа переданного аргумента. - -===blockSize()=== -Получить размер блока. -В ClickHouse выполнение запроса всегда идёт по блокам (наборам кусочков столбцов). Функция позволяет получить размер блока, для которого её вызвали. - -===materialize(x)=== -Превращает константу в полноценный столбец, содержащий только одно значение. -В ClickHouse полноценные столбцы и константы представлены в памяти по-разному. 
Функции по-разному работают для аргументов-констант и обычных аргументов (выполняется разный код), хотя результат почти всегда должен быть одинаковым. Эта функция предназначена для отладки такого поведения. - -===ignore(...)=== -Принимает любые аргументы, всегда возвращает 0. -При этом, аргумент всё равно вычисляется. Это может использоваться для бенчмарков. - -===sleep(seconds)=== -Спит seconds секунд на каждый блок данных. Можно указать как целое число, так и число с плавающей запятой. - -===currentDatabase()=== -Возвращает имя текущей базы данных. -Эта функция может использоваться в параметрах движка таблицы в запросе CREATE TABLE там, где нужно указать базу данных. - -===isFinite(x)=== -Принимает Float32 или Float64 и возвращает UInt8, равный 1, если аргумент не бесконечный и не NaN, иначе 0. - -===isInfinite(x)=== -Принимает Float32 или Float64 и возвращает UInt8, равный 1, если аргумент бесконечный, иначе 0. Отметим, что в случае NaN возвращается 0. - -===isNaN(x)=== -Принимает Float32 или Float64 и возвращает UInt8, равный 1, если аргумент является NaN, иначе 0. - -===hasColumnInTable('database', 'table', 'column')=== -Принимает константные строки - имя базы данных, имя таблицы и название столбца. Возвращает константное выражение типа UInt8, равное 1, -если есть столбец, иначе 0. -Функция кидает исключение, если таблица не существует. -Для элементов вложенной структуры данных функция проверяет существование столбца. Для самой же вложенной структуры данных функция возвращает 0. - -===bar=== -Позволяет построить unicode-art диаграмму. - -bar(x, min, max, width) - рисует полосу ширины пропорциональной (x - min) и равной width символов при x == max. -min, max - целочисленные константы, значение должно помещаться в Int64. -width - константа, положительное число, может быть дробным. - -Полоса рисуется с точностью до одной восьмой символа. Пример: - -%% -SELECT - toHour(EventTime) AS h, - count() AS c, - bar(c, 0, 600000, 20) AS bar -FROM test.hits -GROUP BY h -ORDER BY h ASC - -┌──h─┬──────c─┬─bar────────────────┐ -│ 0 │ 292907 │ █████████▋ │ -│ 1 │ 180563 │ ██████ │ -│ 2 │ 114861 │ ███▋ │ -│ 3 │ 85069 │ ██▋ │ -│ 4 │ 68543 │ ██▎ │ -│ 5 │ 78116 │ ██▌ │ -│ 6 │ 113474 │ ███▋ │ -│ 7 │ 170678 │ █████▋ │ -│ 8 │ 278380 │ █████████▎ │ -│ 9 │ 391053 │ █████████████ │ -│ 10 │ 457681 │ ███████████████▎ │ -│ 11 │ 493667 │ ████████████████▍ │ -│ 12 │ 509641 │ ████████████████▊ │ -│ 13 │ 522947 │ █████████████████▍ │ -│ 14 │ 539954 │ █████████████████▊ │ -│ 15 │ 528460 │ █████████████████▌ │ -│ 16 │ 539201 │ █████████████████▊ │ -│ 17 │ 523539 │ █████████████████▍ │ -│ 18 │ 506467 │ ████████████████▊ │ -│ 19 │ 520915 │ █████████████████▎ │ -│ 20 │ 521665 │ █████████████████▍ │ -│ 21 │ 542078 │ ██████████████████ │ -│ 22 │ 493642 │ ████████████████▍ │ -│ 23 │ 400397 │ █████████████▎ │ -└────┴────────┴────────────────────┘ -%% - -===transform=== -Преобразовать значение согласно явно указанному отображению одних элементов на другие. -Имеется два варианта функции: - -1. %%transform(x, array_from, array_to, default)%% - -%%x%% - что преобразовывать. -%%array_from%% - константный массив значений для преобразования. -%%array_to%% - константный массив значений, в которые должны быть преобразованы значения из from. -%%default%% - какое значение использовать, если x не равен ни одному из значений во from. - -array_from и array_to - массивы одинаковых размеров. - -Типы: -transform(T, Array(T), Array(U), U) -> U - -T и U - могут быть числовыми, строковыми, или Date или DateTime типами. 
-При этом, где обозначена одна и та же буква (T или U), могут быть, в случае числовых типов, не совпадающие типы, а типы, для которых есть общий тип. -Например, первый аргумент может иметь тип Int64, а второй - Array(UInt16). - -Если значение x равно одному из элементов массива array_from, то возвращает соответствующий (такой же по номеру) элемент массива array_to; иначе возвращает default. Если имеется несколько совпадающих элементов в array_from, то возвращает какой-нибудь из соответствующих. - -Пример: - -%% - -SELECT - transform(SearchEngineID, [2, 3], ['Яндекс', 'Google'], 'Остальные') AS title, - count() AS c -FROM test.hits -WHERE SearchEngineID != 0 -GROUP BY title -ORDER BY c DESC - -┌─title─────┬──────c─┐ -│ Яндекс │ 498635 │ -│ Google │ 229872 │ -│ Остальные │ 104472 │ -└───────────┴────────┘ -%% - -2. %%transform(x, array_from, array_to)%% - -Отличается от первого варианта отсутствующим аргументом default. -Если значение x равно одному из элементов массива array_from, то возвращает соответствующий (такой же по номеру) элемент массива array_to; иначе возвращает x. - -Типы: -transform(T, Array(T), Array(T)) -> T - -Пример: - -%% - -SELECT - transform(domain(Referer), ['yandex.ru', 'google.ru', 'vk.com'], ['www.yandex', 'ввв.яндекс.рф', 'example.com']) AS s, - count() AS c -FROM test.hits -GROUP BY domain(Referer) -ORDER BY count() DESC -LIMIT 10 - -┌─s──────────────┬───────c─┐ -│ │ 2906259 │ -│ www.yandex │ 867767 │ -│ ███████.ru │ 313599 │ -│ mail.yandex.ru │ 107147 │ -│ ввв.яндекс.рф │ 105668 │ -│ ██████.ru │ 100355 │ -│ █████████.ru │ 65040 │ -│ news.yandex.ru │ 64515 │ -│ ██████.net │ 59141 │ -│ example.com │ 57316 │ -└────────────────┴─────────┘ -%% - -===formatReadableSize(x)=== - -Принимает размер (число байт). Возвращает округленный размер с суффиксом (KiB, MiB и т.д.) в виде строки. - -Пример: - -%% -SELECT - arrayJoin([1, 1024, 1024*1024, 192851925]) AS filesize_bytes, - formatReadableSize(filesize_bytes) AS filesize - -┌─filesize_bytes─┬─filesize───┐ -│ 1 │ 1.00 B │ -│ 1024 │ 1.00 KiB │ -│ 1048576 │ 1.00 MiB │ -│ 192851925 │ 183.92 MiB │ -└────────────────┴────────────┘ -%% - -===least(a, b)=== - -Возвращает наименьшее значение из a и b. - -===greatest(a, b)=== - -Возвращает наибольшее значение из a и b. - -===uptime()=== - -Возвращает аптайм сервера в секундах. - -===version()=== - -Возвращает версию сервера в виде строки. - -===rowNumberInAllBlocks()=== - -Возвращает порядковый номер строки в блоке данных. Функция учитывает только задействованные блоки данных. - -===runningDifference(x)=== - -Считает разницу между последовательными значениями строк в блоке данных. -Возвращает 0 для первой строки и разницу с предыдущей строкой для каждой последующей строки. - -Результат функции зависит от затронутых блоков данных и порядка данных в блоке. -Если сделать подзапрос с ORDER BY и вызывать функцию извне подзапроса, можно будет получить ожидаемый результат. - -Пример: -%% -SELECT - EventID, - EventTime, - runningDifference(EventTime) AS delta -FROM -( - SELECT - EventID, - EventTime - FROM events - WHERE EventDate = '2016-11-24' - ORDER BY EventTime ASC - LIMIT 5 -) - -┌─EventID─┬───────────EventTime─┬─delta─┐ -│ 1106 │ 2016-11-24 00:00:04 │ 0 │ -│ 1107 │ 2016-11-24 00:00:05 │ 1 │ -│ 1108 │ 2016-11-24 00:00:05 │ 0 │ -│ 1109 │ 2016-11-24 00:00:09 │ 4 │ -│ 1110 │ 2016-11-24 00:00:10 │ 1 │ -└─────────┴─────────────────────┴───────┘ -%% - - -==Функция arrayJoin== - -Это совсем необычная функция. 
- -Обычные функции не изменяют множество строк, а лишь изменяют значения в каждой строке (map). -Агрегатные функции выполняют свёртку множества строк (fold, reduce). -Функция arrayJoin выполняет размножение каждой строки в множество строк (unfold). - -Функция принимает в качестве аргумента массив, и размножает исходную строку в несколько строк - по числу элементов массива. -Все значения в столбцах просто копируются, кроме значения в столбце с применением этой функции - он заменяется на соответствующее значение массива. - -В запросе может быть использовано несколько функций arrayJoin. В этом случае, соответствующее преобразование делается несколько раз. - -Обратите внимание на синтаксис ARRAY JOIN в запросе SELECT, который предоставляет более широкие возможности. - -Пример: - -%% -:) SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src - -SELECT - arrayJoin([1, 2, 3] AS src) AS dst, - 'Hello', - src - -┌─dst─┬─\'Hello\'─┬─src─────┐ -│ 1 │ Hello │ [1,2,3] │ -│ 2 │ Hello │ [1,2,3] │ -│ 3 │ Hello │ [1,2,3] │ -└─────┴───────────┴─────────┘ -%% - -
-
-
-=Агрегатные функции=
-
-
    - -==count()== - -Считает количество строк. Принимает ноль аргументов, возвращает UInt64. -Не поддерживается синтаксис COUNT(DISTINCT x) - для этого есть отдельная агрегатная функция uniq. - -Запрос вида SELECT count() FROM table не оптимизируется, так как количество записей в таблице нигде не хранится отдельно - из таблицы будет выбран какой-нибудь достаточно маленький столбец, и будет посчитано количество значений в нём. - - -==any(x)== - -Выбирает первое попавшееся значение. -Порядок выполнения запроса может быть произвольным и даже каждый раз разным, поэтому результат данной функции недетерминирован. -Для получения детерминированного результата, можно использовать функции min или max вместо any. - -В некоторых случаях, вы всё-таки можете рассчитывать на порядок выполнения запроса. Это - случаи, когда SELECT идёт из подзапроса, в котором используется ORDER BY. - -При наличии в запросе SELECT секции GROUP BY или хотя бы одной агрегатной функции, ClickHouse (в отличие от MySQL) требует, чтобы все выражения в секциях SELECT, HAVING, ORDER BY вычислялись из ключей или из агрегатных функций. То есть, каждый выбираемый из таблицы столбец, должен использоваться либо в ключах, либо внутри агрегатных функций. Чтобы получить поведение, как в MySQL, вы можете поместить остальные столбцы в агрегатную функцию any. - - -==anyLast(x)== - -Выбирает последнее попавшееся значение. -Результат так же недетерминирован, как и для функции any. - - -==min(x)== - -Вычисляет минимум. - - -==max(x)== - -Вычисляет максимум. - - -==argMin(arg, val)== - -Вычисляет значение arg при минимальном значении val. Если есть несколько разных значений arg для минимальных значений val, то выдаётся первое попавшееся из таких значений. - - -==argMax(arg, val)== - -Вычисляет значение arg при максимальном значении val. Если есть несколько разных значений arg для максимальных значений val, то выдаётся первое попавшееся из таких значений. - - -==sum(x)== - -Вычисляет сумму. -Работает только для чисел. - - -==avg(x)== - -Вычисляет среднее. -Работает только для чисел. -Результат всегда - Float64. - - -==uniq(x)== - -Приближённо вычисляет количество различных значений аргумента. Работает для чисел, строк, дат, дат-с-временем, для нескольких аргументов и аргументов-кортежей. - -Используется алгоритм типа adaptive sampling: в качестве состояния вычислений используется выборка значений хэшей элементов, размером до 65 536. -Алгоритм является очень точным для множеств небольшой кардинальности (до 65 536) и очень эффективным по CPU (при расчёте не слишком большого количества таких функций, использование uniq почти так же быстро, как использование других агрегатных функций). - -Результат детерминирован (не зависит от порядка выполнения запроса). - - -==uniqCombined(x)== - -Приближённо вычисляет количество различных значений аргумента. Работает для чисел, строк, дат, дат-с-временем, для нескольких аргументов и аргументов-кортежей. - -Используется комбинация трёх алгоритмов: массив, хэш-таблица и HyperLogLog с таблицей коррекции погрешности. Расход памяти в несколько раз меньше, чем у функции uniq, а точность в несколько раз выше. Скорость работы чуть ниже, чем у функции uniq, но иногда может быть даже выше - в случае распределённых запросов, в которых по сети передаётся большое количество состояний агрегации. Максимальный размер состояния составляет 96 KiB (HyperLogLog из 217 6-битовых ячеек). - -Результат детерминирован (не зависит от порядка выполнения запроса). 
- -Функция uniqCombined является хорошим выбором по умолчанию для подсчёта количества различных значений. - - -==uniqHLL12(x)== - -Приближённо вычисляет количество различных значений аргумента, используя алгоритм HyperLogLog. -Используется 212 5-битовых ячеек. Размер состояния чуть больше 2.5 КБ. - -Результат детерминирован (не зависит от порядка выполнения запроса). - -В большинстве случаев, используйте функцию uniq или uniqCombined. - - -==uniqExact(x)== - -Вычисляет количество различных значений аргумента, точно. -Не стоит бояться приближённых расчётов. Поэтому, используйте лучше функцию uniq. -Функцию uniqExact следует использовать, если вам точно нужен точный результат. - -Функция uniqExact расходует больше оперативки, чем функция uniq, так как размер состояния неограниченно растёт по мере роста количества различных значений. - - -==groupArray(x)== - -Составляет массив из значений аргумента. -Значения в массив могут быть добавлены в любом (недетерминированном) порядке. - -В некоторых случаях, вы всё-таки можете рассчитывать на порядок выполнения запроса. Это - случаи, когда SELECT идёт из подзапроса, в котором используется ORDER BY. - - -==groupUniqArray(x)== - -Составляет массив из различных значений аргумента. Расход оперативки такой же, как у функции uniqExact. - - -==quantile(level)(x)== - -Приближённо вычисляет квантиль уровня level. level - константа, число с плавающей запятой от 0 до 1. -Рекомендуется использовать значения level в диапазоне 0.01 .. 0.99. -Не используйте значения level, равные 0 или 1 - для таких случаев есть функции min и max. - -В этой функции, равно как и во всех функциях для расчёта квантилей, параметр level может быть не указан. В таком случае, он принимается равным 0.5 - то есть, функция будет вычислять медиану. - -Работает для чисел, дат, дат-с-временем. -Для чисел возвращает Float64, для дат - дату, для дат-с-временем - дату-с-временем. - -Используется reservoir sampling с размером резервуара до 8192. -При необходимости, результат выдаётся с линейной аппроксимацией из двух соседних значений. -Этот алгоритм обеспечивает весьма низкую точность расчёта. Смотрите также функции quantileTiming, quantileTDigest, quantileExact. - -Результат зависит от порядка выполнения запроса, и является недетерминированным. - -При использовании нескольких функций quantile (и аналогичных) с разными уровнями в запросе, внутренние состояния не объединяются (то есть, запрос работает менее эффективно, чем мог бы). В этом случае, используйте функцию quantiles (и аналогичные). - - -==quantileDeterministic(level)(x, determinator)== - -Работает аналогично функции quantile, но, в отличие от неё, результат является детерминированным и не зависит от порядка выполнения запроса. - -Для этого, функция принимает второй аргумент - "детерминатор". Это некоторое число, хэш от которого используется вместо генератора случайных чисел в алгоритме reservoir sampling. Для правильной работы функции, одно и то же значение детерминатора не должно встречаться слишком часто. В качестве детерминатора вы можете использовать идентификатор события, идентификатор посетителя и т. п. - -Не используйте эту функцию для рассчёта таймингов. Для этого есть более подходящая функции - quantileTiming. - - -==quantileTiming(level)(x)== - -Вычисляет квантиль уровня level с фиксированной точностью. -Работает для чисел. Предназначена для расчёта квантилей от времени загрузки страницы в миллисекундах. - -Если значение больше 30 000 (соответствует времени загрузки страницы большем 30 секундам.) 
- результат приравнивается к 30 000. - -Если всего значений не больше примерно 5670, то вычисление точное. -Иначе: -- если время меньше 1024 мс., то вычисление точное. -- иначе вычисление идёт с округлением до числа, кратного 16 мс. - -При передаче в функцию отрицательных значений, поведение не определено. - -Возвращаемое значение имеет тип Float32. Когда в функцию не было передано ни одного значения (при использовании quantileTimingIf), возвращается nan. Это сделано, чтобы отличать такие случаи от нулей. Смотрите замечание о сортировке NaN-ов в разделе "Секция ORDER BY". - -Результат детерминирован (не зависит от порядка выполнения запроса). - -Для своей задачи (расчёт квантилей времени загрузки страниц), использование этой функции эффективнее и результат точнее, чем для функции quantile. - - -==quantileTimingWeighted(level)(x, weight)== - -Отличается от функции medianTiming наличием второго аргумента - "веса". Вес - неотрицательное целое число. -Результат считается так же, как если бы в функцию medianTiming значение x было передано weight количество раз. - - -==quantileExact(level)(x)== - -Вычисляет квантиль уровня level точно. Для этого, все переданные значения складываются в массив, который затем частично сортируется. Поэтому, функция потребляет O(n) памяти, где n - количество переданных значений. Впрочем, для случая маленького количества значений, функция весьма эффективна. - - -==quantileExactWeighted(level)(x, weight)== - -Вычисляет квантиль уровня level точно. При этом, каждое значение учитывается с весом weight - как будто оно присутствует weight раз. Аргументы функции можно рассматривать как гистограммы, где значению x соответствует "столбик" гистограммы высоты weight, а саму функцию можно рассматривать как суммирование гистограмм. - -В качестве алгоритма используется хэш-таблица. Из-за этого, в случае, если передаваемые значения часто повторяются, функция потребляет меньше оперативки, чем quantileExact. Вы можете использовать эту функцию вместо quantileExact, указав в качестве веса число 1. - - -==quantileTDigest(level)(x)== - -Вычисляет квантиль уровня level приближённо, с использованием алгоритма t-digest. Максимальная погрешность составляет 1%. Расход памяти на состояние пропорционален логарифму от количества переданных значений. - -Производительность функции ниже quantile, quantileTiming. По соотношению размера состояния и точности, функция существенно лучше, чем quantile. - -Результат зависит от порядка выполнения запроса, и является недетерминированным. - - -==median== - -Для всех quantile-функций, также присутствуют соответствующие median-функции: %%median%%, %%medianDeterministic%%, %%medianTiming%%, %%medianTimingWeighted%%, %%medianExact%%, %%medianExactWeighted%%, %%medianTDigest%%. Они являются синонимами и их поведение ничем не отличается. - - -==quantiles(level1, level2, ...)(x)== - -Для всех quantile-функций, также присутствуют соответствующие quantiles-функции: %%quantiles%%, %%quantilesDeterministic%%, %%quantilesTiming%%, %%quantilesTimingWeighted%%, %%quantilesExact%%, %%quantilesExactWeighted%%, %%quantilesTDigest%%. Эти функции за один проход вычисляют все квантили перечисленных уровней и возвращают массив вычисленных значений. - - -==varSamp(x)== - -Вычисляет величину Σ((x - x̅)2) / (n - 1), где n - размер выборки, x̅ - среднее значение x. - -Она представляет собой несмещённую оценку дисперсии случайной величины, если переданные в функцию значения являются выборкой этой случайной величины. - -Возвращает Float64. В случае, когда n <= 1, возвращается +∞. 
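-
-Для проверки, ту же величину Σ((x - x̅)^2) / (n - 1) можно вычислить вручную через эквивалентную форму (Σx^2 - (Σx)^2 / n) / (n - 1) и сравнить с varSamp. Набросок (имена таблицы t и столбца x условные):
-
-%%
-SELECT
-    varSamp(x),
-    (sum(x * x) - sum(x) * sum(x) / count()) / (count() - 1)
-FROM t
-%%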
- - -==varPop(x)== - -Вычисляет величину Σ((x - x̅)2) / n, где n - размер выборки, x̅ - среднее значение x. - -То есть, дисперсию для множества значений. Возвращает Float64. - - -==stddevSamp(x)== - -Результат равен квадратному корню от varSamp(x). - - -==stddevPop(x)== - -Результат равен квадратному корню от varPop(x). - - -==covarSamp(x, y)== - -Вычисляет величину %%Σ((x - x̅)(y - y̅)) / (n - 1)%%. - -Возвращает Float64. В случае, когда n <= 1, возвращается +∞. - - -==covarPop(x, y)== - -Вычисляет величину %%Σ((x - x̅)(y - y̅)) / n%%. - - -==corr(x, y)== - -Вычисляет коэффициент корреляции Пирсона: Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)2) * Σ((y - y̅)2)). - - -==Параметрические агрегатные функции== - -Некоторые агрегатные функции могут принимать не только столбцы-аргументы (по которым производится свёртка), но и набор параметров - констант для инициализации. Синтаксис - две пары круглых скобок вместо одной. Первая - для параметров, вторая - для аргументов. - - -==sequenceMatch(pattern)(time, cond1, cond2, ...)== - -Сопоставление с образцом для цепочки событий. - -pattern - строка, содержащая шаблон для сопоставления. Шаблон похож на регулярное выражение. -time - время события, тип DateTime -cond1, cond2 ... - от одного до 32 аргументов типа UInt8 - признаков, было ли выполнено некоторое условие для события. - -Функция собирает в оперативке последовательность событий. Затем производит проверку на соответствие этой последовательности шаблону. -Возвращает UInt8 - 0, если шаблон не подходит и 1, если шаблон подходит. - -Пример: %%sequenceMatch('(?1).*(?2)')(EventTime, URL LIKE '%company%', URL LIKE '%cart%')%% -- была ли цепочка событий, в которой посещение страницы с адресом, содержащим %%company%% было раньше по времени посещения страницы с адресом, содержащим %%cart%%. - -Это вырожденный пример. Его можно записать с помощью других агрегатных функций: -%%minIf(EventTime, URL LIKE '%company%') < maxIf(EventTime, URL LIKE '%cart%')%%. -Но в более сложных случаях, такого решения нет. - -Синтаксис шаблонов: -%%(?1)%% - ссылка на условие (вместо 1 - любой номер); -%%.*%% - произвольное количество любых событий; -%%(?t>=1800)%% - условие на время; -за указанное время допускается любое количество любых событий; -вместо >= могут использоваться операторы <, >, <=; -вместо 1800 может быть любое число; - -События, произошедшие в одну секунду, могут оказаться в цепочке в произвольном порядке. От этого может зависеть результат работы функции. - - -==sequenceCount(pattern)(time, cond1, cond2, ...)== - -Аналогично функции sequenceMatch, но возвращает не факт наличия цепочки событий, а UInt64 - количество найденных цепочек. -Цепочки ищутся без перекрытия. То есть, следующая цепочка может начаться только после окончания предыдущей. - - -==uniqUpTo(N)(x)== - -Вычисляет количество различных значений аргумента, если оно меньше или равно N. -В случае, если количество различных значений аргумента больше N, возвращает N + 1. - -Рекомендуется использовать для маленьких N - до 10. Максимальное значение N - 100. - -Для состояния агрегатной функции используется количество оперативки равное 1 + N * размер одного значения байт. -Для строк запоминается некриптографический хэш, имеющий размер 8 байт. То есть, для строк вычисление приближённое. - -Функция также работает для нескольких аргументов. - -Работает максимально быстро за исключением патологических случаев, когда используется большое значение N и количество уникальных значений чуть меньше N. 
- -Пример применения: -Задача: показывать в отчёте только поисковые фразы, по которым было хотя бы 5 уникальных посетителей. -Решение: пишем в запросе %%GROUP BY SearchPhrase HAVING uniqUpTo(4)(UserID) >= 5%% - -==topK(N)(x)== - -Returns the K most frequent argument values as an array sorted by their relative frequency. - -Recommended for use with small Ns, up to 10. The maximum N value is 65536. - -For the state of an aggregate function, it uses approximately the amount of memory equal to K * (the size of the key + 16) for counters, and 48 * N bytes for alpha value map. - -Usage example: -Problem: Generate a report that shows top 5 frequent queries. -Solution: Write in the query SELECT topK(5)(SearchPhrase) - -==Комбинаторы агрегатных функций== - -К имени агрегатной функции может быть приписан некоторый суффикс. При этом, работа агрегатной функции некоторым образом модифицируется. -Существуют комбинаторы %%If%% и %%Array%%. Смотрите разделы ниже. - - -==Комбинатор -If. Условные агрегатные функции== - -К имени любой агрегатной функции может быть приписан суффикс -%%If%%. В этом случае, агрегатная функция принимает ещё один дополнительный аргумент - условие (типа UInt8). Агрегатная функция будет обрабатывать только те строки, для которых условие сработало. Если условие ни разу не сработало - возвращается некоторое значение по умолчанию (обычно - нули, пустые строки). - -Примеры: %%sumIf(column, cond)%%, %%countIf(cond)%%, %%avgIf(x, cond)%%, %%quantilesTimingIf(level1, level2)(x, cond)%%, %%argMinIf(arg, val, cond)%% и т. п. - -С помощью условных агрегатных функций, вы можете вычислить агрегаты сразу для нескольких условий, не используя подзапросы и JOIN-ы. -Например, в Яндекс.Метрике, условные агрегатные функции используются для реализации функциональности сравнения сегментов. - - -==Комбинатор -Array. Агрегатные функции для аргументов-массивов== - -К имени любой агрегатной функции может быть приписан суффикс -%%Array%%. В этом случае, агрегатная функция вместо аргументов типов T принимает аргументы типов Array(T) (массивы). Если агрегатная функция принимает несколько аргументов, то это должны быть массивы одинаковых длин. При обработке массивов, агрегатная функция работает, как исходная агрегатная функция по всем элементам массивов. - -Пример 1: %%sumArray(arr)%% - просуммировать все элементы всех массивов arr. В данном примере можно было бы написать проще: %%sum(arraySum(arr))%%. -Пример 2: %%uniqArray(arr)%% - посчитать количество уникальных элементов всех массивов arr. Это можно было бы сделать проще: %%uniq(arrayJoin(arr))%%, но не всегда есть возможность добавить arrayJoin в запрос. - -Комбинаторы -%%If%% и -%%Array%% можно сочетать. При этом, должен сначала идти Array, а потом If. Примеры: %%uniqArrayIf(arr, cond)%%, %%quantilesTimingArrayIf(level1, level2)(arr, cond)%%. Из-за такого порядка получается, что аргумент cond не должен быть массивом. - - -==Комбинатор -State.== - -В случае применения этого комбинатора, агрегатная функция возвращает не готовое значение (например, в случае функции uniq - количество уникальных значений), а промежуточное состояние агрегации (например, в случае функции uniq - хэш-таблицу для рассчёта количества уникальных значений), которое имеет тип %%AggregateFunction(...)%% и может использоваться для дальнейшей обработки или может быть сохранено в таблицу для последующей доагрегации - смотрите разделы "AggregatingMergeTree" и "функции для работы с промежуточными состояниями агрегации". 
- - -==Комбинатор -Merge.== - -В случае применения этого комбинатора, агрегатная функция будет принимать в качестве аргумента промежуточное состояние агрегации, доагрегировать (объединять вместе) эти состояния, и возвращать готовое значение. - - -==Комбинатор -MergeState.== - -Выполняет слияние промежуточных состояний агрегации, аналогично комбинатору -Merge, но возвращает не готовое значение, а промежуточное состояние агрегации, аналогично комбинатору -State. - - -
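-Набросок, показывающий, как комбинаторы -State и -Merge работают в паре (имена таблицы hits и столбцов условные; предполагается, что UserID имеет тип UInt64):
-
-%%
-SELECT uniqMerge(u)
-FROM
-(
-    SELECT
-        RegionID,
-        uniqState(UserID) AS u
-    FROM hits
-    GROUP BY RegionID
-)
-%%
-
-Внутренний запрос возвращает промежуточные состояния типа AggregateFunction(uniq, UInt64), а внешний доагрегирует их и возвращает итоговое количество уникальных значений.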
    -
    -

    Словари

    -
    -
    - -Словарь - это отображение (ключ -> атрибуты), которое можно использовать в запросе в виде функций. -Это можно рассматривать как более удобный и максимально эффективный вариант JOIN-а с таблицами-справочниками (dimension tables). - -Существуют встроенные и подключаемые (внешние) словари. - -==Встроенные словари== - -ClickHouse содержит встроенную возможность работы с геобазой. - -Это позволяет: -- для идентификатора региона получить его имя на нужном языке; -- по идентификатору региона получить идентификатор города, области, федерального округа, страны, континента; -- проверить, что один регион входит в другой; -- получить цепочку родительских регионов. - -Все функции поддерживают "транслокальность", то есть возможность использовать одновременно разные точки зрения на принадлежность регионов. Подробнее смотрите в разделе "Функции для работы со словарями Яндекс.Метрики". - -В пакете по умолчанию, встроенные словари выключены. -Для включения, раскомментируйте параметры path_to_regions_hierarchy_file и path_to_regions_names_files в конфигурационном файле сервера. - -Геобаза загружается из текстовых файлов. -Если вы работаете в Яндексе, то для их создания вы можете воспользоваться инструкцией: -https://github.yandex-team.ru/raw/Metrika/ClickHouse_private/master/doc/create_embedded_geobase_dictionaries.txt - -Положите файлы regions_hierarchy*.txt в директорию path_to_regions_hierarchy_file. Этот конфигурационный параметр должен содержать путь к файлу regions_hierarchy.txt (иерархия регионов по умолчанию), а другие файлы (regions_hierarchy_ua.txt) должны находиться рядом в той же директории. - -Положите файлы regions_names_*.txt в директорию path_to_regions_names_files. - -Также вы можете создать эти файлы самостоятельно. Формат файлов такой: - -regions_hierarchy*.txt: TabSeparated (без заголовка), столбцы: -- идентификатор региона (UInt32); -- идентификатор родительского региона (UInt32); -- тип региона (UInt8): 1 - континент, 3 - страна, 4 - федеральный округ, 5 - область, 6 - город; остальные типы не имеют значения; -- население (UInt32) - не обязательный столбец. - -regions_names_*.txt: TabSeparated (без заголовка), столбцы: -- идентификатор региона (UInt32); -- имя региона (String) - не может содержать табы или переводы строк, даже экранированные. - -Для хранения в оперативке используется плоский массив. Поэтому, идентификаторы не должны быть больше миллиона. - -Словари могут обновляться без перезапуска сервера. Но набор доступных словарей не обновляется. -Для обновления проверяется время модификации файлов; если файл изменился, то словарь будет обновлён. -Периодичность проверки настраивается конфигурационным параметром builtin_dictionaries_reload_interval. -Обновление словарей (кроме загрузки при первом использовании) не блокирует запросы - во время обновления запросы используют старую версию словарей. Если при обновлении возникнет ошибка, то ошибка пишется в лог сервера, а запросы продолжат использовать старую версию словарей. - -Рекомендуется периодически обновлять словари с геобазой. При обновлении, генерируйте новые файлы, записывая их в отдельное место, а только когда всё готово - переименовывайте в файлы, которые использует сервер. - -Также имеются функции для работы с идентификаторами операционных систем и поисковых систем Яндекс.Метрики, пользоваться которыми не нужно. - - -==Внешние словари== - -Существует возможность подключать свои собственные словари из различных источников данных. 
-Источником данных для словаря может быть файл на локальной файловой системе, сервер ClickHouse, сервер MySQL, MongoDB или любой ODBC источник. -Словарь может полностью храниться в оперативке и периодически обновляться, или быть частично закэшированным в оперативке и динамически подгружать отсутствующие значения. - -Конфигурация внешних словарей находится в отдельном файле или файлах, указанных в конфигурационном параметре dictionaries_config. -Этот параметр содержит абсолютный или относительный путь к файлу с конфигурацией словарей. Относительный путь - относительно директории с конфигурационным файлом сервера. Путь может содержать wildcard-ы * и ? - тогда рассматриваются все подходящие файлы. Пример: dictionaries/*.xml. - -Конфигурация словарей, а также множество файлов с конфигурацией, может обновляться без перезапуска сервера. Сервер проверяет обновления каждые 5 секунд. То есть, словари могут подключаться динамически. - -Создание словарей может производиться при старте сервера или при первом использовании. Это определяется конфигурационном параметром dictionaries_lazy_load (в основном конфигурационном файле сервера). Параметр не обязателен, по умолчанию - true. Если true, то каждый словарь создаётся при первом использовании; если словарь не удалось создать - вызов функции, использующей словарь, кидает исключение. Если false, то все словари создаются при старте сервера, и в случае ошибки, сервер завершает работу. - -Конфигурационный файл словарей имеет вид: - -%% -<dictionaries> - <comment>Не обязательный элемент с любым содержимым; полностью игнорируется.</comment> - - <!-- Можно задать произвольное количество разных словарей. --> - <dictionary> - <!-- Имя словаря. Под этим именем словарь будет доступен для использования. --> - <name>os</name> - - <!-- Источник данных. --> - <source> - - <!-- Источник - файл на локальной файловой системе. --> - <file> - <!-- Путь на локальной файловой системе. --> - <path>/opt/dictionaries/os.tsv</path> - <!-- С помощью какого формата понимать файл. --> - <format>TabSeparated</format> - </file> - - <!-- или источник - таблица на сервере MySQL. - <mysql> - <!- - Эти параметры могут быть указаны как снаружи (общие для всех реплик), так и внутри конкретной реплики - -> - <port>3306</port> - <user>clickhouse</user> - <password>qwerty</password> - <!- - Можно указать от одной до произвольного количества реплик для отказоустойчивости. - -> - <replica> - <host>example01-1</host> - <priority>1</priority> <!- - Меньше значение - больше приоритет. - -> - </replica> - <replica> - <host>example01-2</host> - <priority>1</priority> - </replica> - <db>conv_main</db> - <table>counters</table> - </mysql> - --> - - <!-- или источник - таблица на сервере ClickHouse. - <clickhouse> - <host>example01-01-1</host> - <port>9000</port> - <user>default</user> - <password></password> - <db>default</db> - <table>counters</table> - </clickhouse> - <!- - Если адрес похож на localhost, то запрос будет идти без сетевого взаимодействия. - Для отказоустойчивости, вы можете создать Distributed таблицу на localhost и прописать её. - -> - --> - - <!-- или источник - исполняемый файл. Если layout.cache - список нужных ключей будет записан в поток STDIN программы --> - <executable> - <!-- Путь или имя программы (если директория есть в переменной окружения PATH) и параметры --> - <command>cat /opt/dictionaries/os.tsv</command> - <!-- С помощью какого формата понимать вывод и формировать список ключей. --> - <format>TabSeparated</format> - </executable> - - <!-- или источник - http сервер. 
Если layout.cache - список нужных ключей будет послан как POST запрос --> - <http> - <url>http://[::1]/os.tsv</url> - <!-- С помощью какого формата понимать ответ и формировать список ключей. --> - <format>TabSeparated</format> - </http> - - </source> - - <!-- Периодичность обновления для полностью загружаемых словарей. 0 - никогда не обновлять. --> - <lifetime> - <min>300</min> - <max>360</max> - <!-- Периодичность обновления выбирается равномерно-случайно между min и max, - чтобы размазать по времени нагрузку при обновлении словарей на большом количестве серверов. --> - </lifetime> - - <!-- или - <!- - Периодичность обновления для полностью загружаемых словарей или время инвалидации для кэшируемых словарей. - 0 - никогда не обновлять. - -> - <lifetime>300</lifetime> - --> - - <layout> <!-- Способ размещения в памяти. --> - <flat /> - <!-- или - <hashed /> - или - <cache> - <!- - Размер кэша в количестве ячеек; округляется вверх до степени двух. - -> - <size_in_cells>1000000000</size_in_cells> - </cache> - --> - </layout> - - <!-- Структура. --> - <structure> - <!-- Описание столбца, являющегося идентификатором (ключом) словаря. --> - <id> - <!-- Имя столбца с идентификатором. --> - <name>Id</name> - </id> - - <attribute> <!-- id уже входит в атрибуты и дополнительно указывать его здесь не нужно. --> - <!-- Имя столбца. --> - <name>Name</name> - <!-- Тип столбца. (Как столбец понимается при загрузке. - В случае MySQL, в таблице может быть TEXT, VARCHAR, BLOB, но загружается всё как String) --> - <type>String</type> - <!-- Какое значение использовать для несуществующего элемента. В примере - пустая строка. --> - <null_value></null_value> - </attribute> - - <!-- Может быть указано произвольное количество атрибутов. --> - <attribute> - <name>ParentID</name> - <type>UInt64</type> - <null_value>0</null_value> - <!-- Определяет ли иерархию - отображение в идентификатор родителя (по умолчанию, false). --> - <hierarchical>true</hierarchical> - <!-- Можно считать отображение id -> attribute инъективным, чтобы оптимизировать GROUP BY. (по умолчанию, false) --> - <injective>true</injective> - </attribute> - </structure> - </dictionary> -</dictionaries> -%% - -Идентификатор (ключевой атрибут) словаря должен быть числом, помещающимся в UInt64. Также есть возможность задавать произвольные составные ключи (см. раздел "Словари с составными ключами"). Замечание: составной ключ может состоять и из одного элемента, что даёт возможность использовать в качестве ключа, например, строку. - -Существует шесть способов размещения словаря в памяти. - -===1. flat=== -В виде плоских массивов. Самый эффективный способ. Он подходит, если все ключи меньше 500 000. Если при создании словаря обнаружен ключ больше, то кидается исключение и словарь не создаётся. Словарь загружается в оперативку целиком. Словарь использует количество оперативки, пропорциональное максимальному значению ключа. Ввиду ограничения на 500 000, потребление оперативки вряд ли может быть большим. -Поддерживаются все виды источников. При обновлении, данные (из файла, из таблицы) читаются целиком. - -===2. hashed=== -В виде хэш-таблиц. Слегка менее эффективный способ. Словарь тоже загружается в оперативку целиком, и может содержать произвольное количество элементов с произвольными идентификаторами. На практике, имеет смысл использовать до десятков миллионов элементов, пока хватает оперативки. -Поддерживаются все виды источников. При обновлении, данные (из файла, из таблицы) читаются целиком. - -===3. cache=== -Наименее эффективный способ. 
Подходит, если словарь не помещается в оперативку. Представляет собой кэш из фиксированного количества ячеек, в которых могут быть расположены часто используемые данные. Поддерживается источник MySQL, ClickHouse, executable, http; источник-файл не поддерживается. При поиске в словаре, сначала просматривается кэш. На каждый блок данных, все не найденные в кэше ключи (или устаревшие ключи) собираются в пачку, и с этой пачкой делается запрос к источнику вида %%SELECT attrs... FROM db.table WHERE id IN (k1, k2, ...)%%. Затем полученные данные записываются в кэш. - -===4. range_hashed=== - -В таблице прописаны какие-то данные для диапазонов дат, для каждого ключа. Дать возможность доставать эти данные для заданного ключа, для заданной даты. - -Пример: в таблице записаны скидки для каждого рекламодателя в виде: -%% -id рекламодателя дата начала действия скидки дата конца величина -123 2015-01-01 2015-01-15 0.15 -123 2015-01-16 2015-01-31 0.25 -456 2015-01-01 2015-01-15 0.05 -%% - -Добавляем layout = range_hashed. -При использовании такого layout, в structure должны быть элементы range_min, range_max. - -Пример: - -%% -<structure> - <id> - <name>Id</name> - </id> - <range_min> - <name>first</name> - </range_min> - <range_max> - <name>last</name> - </range_max> - ... -%% - -Эти столбцы должны иметь тип Date. Другие типы пока не поддерживаем. -Столбцы обозначают закрытый диапазон дат. - -Для работы с такими словарями, функции dictGetT должны принимать ещё один аргумент - дату: - -dictGetT('dict_name', 'attr_name', id, date) - -Функция достаёт значение для данного id и для диапазона дат, в который входит переданная дата. Если не найден id или для найденного id не найден диапазон, то возвращается значение по умолчанию для словаря. - -Если есть перекрывающиеся диапазоны, то можно использовать любой подходящий. - -Если граница диапазона является NULL или является некорректной датой (1900-01-01, 2039-01-01), то диапазон следует считать открытым. Диапазон может быть открытым с обеих сторон. - -В оперативке данные представлены в виде хэш-таблицы со значением в виде упорядоченного массива диапазонов и соответствующих им значений. - -Пример словаря по диапазонам: - -%% -<dictionaries> - <dictionary> - <name>xxx</name> - <source> - <mysql> - <password>xxx</password> - <port>3306</port> - <user>xxx</user> - <replica> - <host>xxx</host> - <priority>1</priority> - </replica> - <db>dicts</db> - <table>xxx</table> - </mysql> - </source> - <lifetime> - <min>300</min> - <max>360</max> - </lifetime> - <layout> - <range_hashed /> - </layout> - <structure> - <id> - <name>Abcdef</name> - </id> - <range_min> - <name>StartDate</name> - </range_min> - <range_max> - <name>EndDate</name> - </range_max> - <attribute> - <name>XXXType</name> - <type>String</type> - <null_value /> - </attribute> - </structure> - </dictionary> -</dictionaries> -%% - -===5. complex_key_hashed=== - -Для использования с составными ключами. Аналогичен hashed. - -===6. complex_key_cache=== - -Для использования с составными ключами. Аналогичен cache. - -===Примечания=== - -Рекомендуется использовать способ flat, если возможно, или hashed, complex_key_hashed. Скорость работы словарей с таким размещением в памяти является безупречной. - -Способы cache и complex_key_cache следует использовать лишь если это неизбежно. Скорость работы кэша очень сильно зависит от правильности настройки и сценария использования. Словарь типа cache нормально работает лишь при достаточно больших hit rate-ах (рекомендуется 99% и выше). 
Посмотреть средний hit rate можно в таблице system.dictionaries. Укажите достаточно большой размер кэша. Количество ячеек следует подобрать экспериментальным путём - выставить некоторое значение, с помощью запроса добиться полной заполненности кэша, посмотреть на потребление оперативки (эта информация находится в таблице system.dictionaries); затем пропорционально увеличить количество ячеек так, чтобы расходовалось разумное количество оперативки. В качестве источника для кэша рекомендуется MySQL, MongoDB, так как ClickHouse плохо обрабатывает запросы со случайными чтениями. - -Во всех случаях, производительность будет выше, если вызывать функцию для работы со словарём после GROUP BY, или если доставаемый атрибут помечен как инъективный. Для cache словарей, производительность будет лучше, если вызывать функцию после LIMIT-а - для этого можно использовать подзапрос с LIMIT-ом, и снаружи вызывать функцию со словарём. - -Атрибут называется инъективным, если разным ключам соответствуют разные значения атрибута. Тогда при использовании в GROUP BY функции, достающей значение атрибута по ключу, эта функция автоматически выносится из GROUP BY. - -При обновлении словарей из файла, сначала проверяется время модификации файла, и загрузка производится только если файл изменился. -При обновлении из MySQL, для flat и hashed словарей, сначала делается запрос SHOW TABLE STATUS и смотрится время обновления таблицы. И если оно не NULL, то оно сравнивается с запомненным временем. Это работает для MyISAM таблиц, а для InnoDB таблиц время обновления неизвестно, поэтому загрузка из InnoDB делается при каждом обновлении. - -Для cache-словарей может быть задано время устаревания (lifetime) данных в кэше. Если от загрузки данных в ячейке прошло больше времени, чем lifetime, то значение не используется, и будет запрошено заново при следующей необходимости его использовать. - -Если словарь не удалось ни разу загрузить, то при попытке его использования, будет брошено исключение. -Если при запросе к источнику cached словаря возникла ошибка, то будет брошено исключение. -Обновление словарей (кроме загрузки при первом использовании) не блокирует запросы - во время обновления используется старая версия словаря. Если при обновлении возникнет ошибка, то ошибка пишется в лог сервера, а запросы продолжат использовать старую версию словарей. - -Список внешних словарей и их статус можно посмотреть в таблице system.dictionaries. - -Для использования внешних словарей, смотрите раздел "Функции для работы с внешними словарями". - -Обратите внимание, что вы можете преобразовать значения по небольшому словарю, указав всё содержимое словаря прямо в запросе SELECT - смотрите раздел "Функция transform". Эта функциональность никак не связана с внешними словарями. - -===Словари с составными ключами=== - -В качестве ключа может выступать кортеж (tuple) из полей произвольных типов. Параметр layout в этом случае должен быть равен %%complex_key_hashed%% или %%complex_key_cache%%. - -Структура ключа задаётся не в элементе %%<id>%%, а в элементе %%<key>%%. Поля ключа задаются в том же формате, что и атрибуты словаря. Пример: - -%% -<structure> - <key> - <attribute> - <name>field1</name> - <type>String</type> - </attribute> - <attribute> - <name>field2</name> - <type>UInt32</type> - </attribute> - ... - </key> - ... -%% - -При использовании такого словаря, в функции dictGet* в качестве ключа передаётся Tuple со значениями полей. Пример: %%dictGetString('dict_name', 'attr_name', tuple('field1', 123))%%. - -
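-Набросок типичного использования внешнего словаря в запросе SELECT (словарь os взят из примера конфигурации выше; таблица hits и столбец OSID - условные):
-
-%%
-SELECT
-    dictGetString('os', 'Name', toUInt64(OSID)) AS os_name,
-    count() AS c
-FROM hits
-GROUP BY OSID
-ORDER BY c DESC
-%%
-
-Агрегация здесь идёт по числовому ключу OSID, а имя достаётся из словаря уже после GROUP BY - в соответствии с рекомендацией выше.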
    -
    -

    Настройки

    -
    -
    - -Здесь будут рассмотрены настройки, которые можно задать с помощью запроса SET или в конфигурационном файле. Напомню, что эти настройки могут быть выставлены в пределах сессии или глобально. Настройки, которые можно задать только в конфигурационном файле сервера, здесь рассмотрены не будут. - - -==max_block_size== - -Данные в ClickHouse обрабатываются по блокам (наборам кусочков столбцов). Внутренние циклы обработки одного блока достаточно эффективны, но при этом существуют заметные издержки на каждый блок. max_block_size - это рекомендация, какого размера блоки (в количестве строк) загружать из таблицы. Размер блока должен быть не слишком маленьким, чтобы издержки на каждый блок оставались незаметными, и не слишком большим, чтобы запрос с LIMIT-ом, который завершается уже после первого блока, выполнялся быстро; чтобы не использовалось слишком много оперативки при вынимании большого количества столбцов в несколько потоков; чтобы оставалась хоть какая-нибудь кэш-локальность. - -По умолчанию - 65 536. - -Из таблицы не всегда загружаются блоки размера %%max_block_size%%. Если ясно, что нужно прочитать меньше данных, то будет считан блок меньшего размера. - -==preferred_block_size_bytes== -Служит для тех же целей что и %%max_block_size%%, но задает реккомедуемый размер блоков в байтах, выбирая адаптивное количество строк в блоке. -При этом размер блока не может быть более %%max_block_size%% строк. -По-умолчанию выключен (равен 0), работает только при чтении из MergeTree-движков. - -==max_insert_block_size== - -Формировать блоки указанного размера, при вставке в таблицу. -Эта настройка действует только в тех случаях, когда сервер сам формирует такие блоки. -Например, при INSERT-е через HTTP интерфейс, сервер парсит формат данных, и формирует блоки указанного размера. -А при использовании clickhouse-client, клиент сам парсит данные, и настройка max_insert_block_size на сервере не влияет на размер вставляемых блоков. -При использовании INSERT SELECT, настройка так же не имеет смысла, так как данные будут вставляться теми блоками, которые вышли после SELECT-а. - -По умолчанию - 1 048 576. - -Это намного больше, чем max_block_size. Это сделано, потому что некоторые движки таблиц (*MergeTree) будут на каждый вставляемый блок формировать кусок данных на диске, что является довольно большой сущностью. Также, в таблицах типа *MergeTree, данные сортируются при вставке, и достаточно большой размер блока позволяет отсортировать больше данных в оперативке. - - -==max_threads== - -Максимальное количество потоков обработки запроса -- без учёта потоков для чтения данных с удалённых серверов (смотрите параметр max_distributed_connections). - -Этот параметр относится к потокам, которые выполняют параллельно одни стадии конвейера выполнения запроса. -Например, если чтение из таблицы, вычисление выражений с функциями, фильтрацию с помощью WHERE и предварительную агрегацию для GROUP BY можно делать параллельно с использованием как минимум max_threads потоков, то будет использовано max_threads потоков. - -По умолчанию - 8. - -Если на сервере обычно исполняется менее одного запроса SELECT одновременно, то выставите этот параметр в значение чуть меньше количества реальных процессорных ядер. - -Для запросов, которые быстро завершаются из-за LIMIT-а, имеет смысл выставить max_threads поменьше. Например, если нужное количество записей находится в каждом блоке, то при max_threads = 8 будет считано 8 блоков, хотя достаточно было прочитать один. - -Чем меньше max_threads, тем меньше будет использоваться оперативки. 
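-
-Набросок: настройку можно выставить в пределах сессии запросом SET и посмотреть текущие значения в таблице system.settings:
-
-%%
-SET max_threads = 4;
-
-SELECT name, value
-FROM system.settings
-WHERE name IN ('max_threads', 'max_block_size')
-%%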
- - -==max_compress_block_size== - -Максимальный размер блоков не сжатых данных перед сжатием при записи в таблицу. По умолчанию - 1 048 576 (1 MiB). При уменьшении размера, незначительно уменьшается коэффициент сжатия, незначительно возрастает скорость сжатия и разжатия за счёт кэш-локальности, и уменьшается потребление оперативки. Как правило, не имеет смысла менять эту настройку. - -Не путайте блоки для сжатия (кусок памяти, состоящий из байт) и блоки для обработки запроса (пачка строк из таблицы). - - -==min_compress_block_size== - -Для таблиц типа *MergeTree. В целях уменьшения задержек при обработке запросов, блок сжимается при записи следующей засечки, если его размер не меньше min_compress_block_size. По умолчанию - 65 536. - -Реальный размер блока, если несжатых данных меньше max_compress_block_size, будет не меньше этого значения и не меньше объёма данных на одну засечку. - -Рассмотрим пример. Пусть index_granularity, указанная при создании таблицы - 8192. - -Пусть мы записываем столбец типа UInt32 (4 байта на значение). При записи 8192 строк, будет всего 32 КБ данных. Так как min_compress_block_size = 65 536, сжатый блок будет сформирован на каждые две засечки. - -Пусть мы записываем столбец URL типа String (средний размер - 60 байт на значение). При записи 8192 строк, будет, в среднем, чуть меньше 500 КБ данных. Так как это больше 65 536 строк, то сжатый блок будет сформирован на каждую засечку. В этом случае, при чтении с диска данных из диапазона в одну засечку, не будет разжато лишних данных. - -Как правило, не имеет смысла менять эту настройку. - - -==max_query_size== - -Максимальный кусок запроса, который будет считан в оперативку для разбора парсером языка SQL. -Запрос INSERT также содержит данные для INSERT-а, которые обрабатываются отдельным, потоковым парсером (расходующим O(1) оперативки), и не учитываются в этом ограничении. - -По умолчанию - 256 KiB. - - -==interactive_delay== - -Интервал в микросекундах для проверки, не запрошена ли остановка выполнения запроса, и отправки прогресса. -По умолчанию - 100 000 (проверять остановку запроса и отправлять прогресс десять раз в секунду). - - -==connect_timeout== -==receive_timeout== -==send_timeout== - -Таймауты в секундах на сокет, по которому идёт общение с клиентом. -По умолчанию - 10, 300, 300. - - -==poll_interval== - -Блокироваться в цикле ожидания запроса в сервере на указанное количество секунд. -По умолчанию - 10. - - -==max_distributed_connections== - -Максимальное количество одновременных соединений с удалёнными серверами при распределённой обработке одного запроса к одной таблице типа Distributed. Рекомендуется выставлять не меньше, чем количество серверов в кластере. - -По умолчанию - 100. - - -Следующие параметры имеют значение только на момент создания таблицы типа Distributed (и при запуске сервера), поэтому их не имеет смысла менять в рантайме. - -==distributed_connections_pool_size== - -Максимальное количество одновременных соединений с удалёнными серверами при распределённой обработке всех запросов к одной таблице типа Distributed. Рекомендуется выставлять не меньше, чем количество серверов в кластере. - -По умолчанию - 128. - - -==connect_timeout_with_failover_ms== - -Таймаут в миллисекундах на соединение с удалённым сервером, для движка таблиц Distributed, если используются секции shard и replica в описании кластера. -В случае неуспеха, делается несколько попыток соединений с разными репликами. -По умолчанию - 50. 
- - -==connections_with_failover_max_tries== - -Максимальное количество попыток соединения с каждой репликой, для движка таблиц Distributed. -По умолчанию - 3 - - -==extremes== - -Считать ли экстремальные значения (минимумы и максимумы по столбцам результата запроса). Принимает 0 или 1. По умолчанию - 0 (выключено). -Подробнее смотрите раздел "Экстремальные значения". - - -==use_uncompressed_cache== - -Использовать ли кэш разжатых блоков. Принимает 0 или 1. По умолчанию - 0 (выключено). -Кэш разжатых блоков (только для таблиц семейства MergeTree) позволяет существенно уменьшить задержки и увеличить пропускную способность при обработке большого количества коротких запросов. Включите эту настройку для пользователей, от которых идут частые короткие запросы. Также обратите внимание на конфигурационный параметр uncompressed_cache_size (настраивается только в конфигурационном файле) - размер кэша разжатых блоков. По умолчанию - 8 GiB. Кэш разжатых блоков заполняется по мере надобности; наиболее невостребованные данные автоматически удаляются. - -Для запросов, читающих хоть немного приличный объём данных (миллион строк и больше), кэш разжатых блоков автоматически выключается, чтобы оставить место для действительно мелких запросов. Поэтому, можно держать настройку use_uncompressed_cache всегда выставленной в 1. - - -==replace_running_query== - -При использовании HTTP-интерфейса, может быть передан параметр query_id - произвольная строка, являющаяся идентификатором запроса. -Если в этот момент, уже существует запрос от того же пользователя с тем же query_id, то поведение определяется параметром replace_running_query. - -0 - (по умолчанию) кинуть исключение (не давать выполнить запрос, если запрос с таким же query_id уже выполняется); -1 - отменить старый запрос и начать выполнять новый. - -Эта настройка, выставленная в 1, используется в Яндекс.Метрике для реализации suggest-а значений для условий сегментации. После ввода очередного символа, если старый запрос ещё не выполнился, его следует отменить. - - -==load_balancing== - -На какие реплики (среди живых реплик) предпочитать отправлять запрос (при первой попытке) при распределённой обработке запроса. - -random (по умолчанию) - -Для каждой реплики считается количество ошибок. Запрос отправляется на реплику с минимальным числом ошибок, а если таких несколько, то на случайную из них. -Недостатки: не учитывается близость серверов; если на репликах оказались разные данные, то вы будете получать так же разные данные. - -nearest_hostname - -Для каждой реплики считается количество ошибок. Каждые 5 минут, число ошибок целочисленно делится на 2 - таким образом, обеспечивается расчёт числа ошибок за недавнее время с экспоненциальным сглаживанием. Если есть одна реплика с минимальным числом ошибок (то есть, на других репликах недавно были ошибки) - запрос отправляется на неё. Если есть несколько реплик с одинаковым минимальным числом ошибок, то запрос отправляется на реплику, имя хоста которой в конфигурационном файле минимально отличается от имени хоста сервера (по количеству отличающихся символов на одинаковых позициях, до минимальной длины обеих имён хостов). - -Для примера, example01-01-1 и example01-01-2.yandex.ru отличаются в одной позиции, а example01-01-1 и example01-02-2 - в двух. -Этот способ может показаться несколько дурацким, но он не использует внешние данные о топологии сети, и не сравнивает IP-адреса, что было бы сложным для наших IPv6-адресов. - -Таким образом, если есть равнозначные реплики, предпочитается ближайшая по имени. 
-Также можно сделать предположение, что при отправке запроса на один и тот же сервер, в случае отсутствия сбоев, распределённый запрос будет идти тоже на одни и те же серверы. То есть, даже если на репликах расположены разные данные, запрос будет возвращать в основном одинаковые результаты. - -in_order - -Реплики перебираются в таком порядке, в каком они указаны. Количество ошибок не имеет значения. -Этот способ подходит для тех случаев, когда вы точно знаете, какая реплика предпочтительнее. - - -==totals_mode== - -Каким образом вычислять TOTALS при наличии HAVING, а также при наличии max_rows_to_group_by и group_by_overflow_mode = 'any'. -Смотрите раздел "Модификатор WITH TOTALS". - -==totals_auto_threshold== - -Порог для totals_mode = 'auto'. -Смотрите раздел "Модификатор WITH TOTALS". - - -==default_sample== - -Число с плавающей запятой от 0 до 1. По умолчанию - 1. -Позволяет выставить коэффициент сэмплирования по умолчанию для всех запросов SELECT. -(Для таблиц, не поддерживающих сэмплирование, будет кидаться исключение.) -Если равно 1 - сэмплирование по умолчанию не делается. - - -==max_parallel_replicas== - -Максимальное количество используемых реплик каждого шарда при выполнении запроса. -Для консистентности (чтобы получить разные части одного и того же разбиения), эта опция работает только при заданном ключе сэмплирования. -Отставание реплик не контролируется. - - -==compile== - -Включить компиляцию запросов. По умолчанию - 0 (выключено). - -Компиляция предусмотрена только для части конвейера обработки запроса - для первой стадии агрегации (GROUP BY). -В случае, если эта часть конвейера была скомпилирована, запрос может работать быстрее, за счёт разворачивания коротких циклов и инлайнинга вызовов агрегатных функций. Максимальный прирост производительности (до четырёх раз в редких случаях) достигается на запросах с несколькими простыми агрегатными функциями. Как правило, прирост производительности незначителен. В очень редких случаях возможно замедление выполнения запроса. - -==min_count_to_compile== - -После скольких раз, когда скомпилированный кусок кода мог пригодиться, выполнить его компиляцию. По умолчанию - 3. -В случае, если значение равно нулю, то компиляция выполняется синхронно, и запрос будет ждать окончания процесса компиляции перед продолжением выполнения. Это можно использовать для тестирования, иначе используйте значения, начиная с 1. Как правило, компиляция занимает по времени около 5-10 секунд. -В случае, если значение равно 1 или больше, компиляция выполняется асинхронно, в отдельном потоке. При готовности результата, он сразу же будет использован, в том числе, уже выполняющимися в данный момент запросами. - -Скомпилированный код требуется для каждого разного сочетания используемых в запросе агрегатных функций и вида ключей в GROUP BY. -Результаты компиляции сохраняются в директории build в виде .so файлов. Количество результатов компиляции не ограничено, так как они не занимают много места. При перезапуске сервера, старые результаты будут использованы, за исключением случая обновления сервера - тогда старые результаты удаляются. - -==input_format_skip_unknown_fields== - -Если значение истинно, то при выполнении INSERT из входных данных пропускаются (не рассматриваются) колонки с неизвестными именами, иначе в данной ситуации будет сгенерировано исключение. -Работает для форматов JSONEachRow и TSKV. 
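-
-Набросок (имена таблицы и столбцов условные): при включённой настройке неизвестное поле extra будет молча пропущено, а не приведёт к исключению.
-
-%%
-SET input_format_skip_unknown_fields = 1;
-
-INSERT INTO test.events FORMAT JSONEachRow {"EventDate": "2017-01-01", "UserID": 123, "extra": "ignored"}
-%%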
- -==output_format_json_quote_64bit_integers== - -Если значение истинно, то при использовании JSON* форматов UInt64 и Int64 числа выводятся в кавычках (из соображений совместимости с большинством реализаций JavaScript), иначе - без кавычек. - -==input_format_allow_errors_num== -==input_format_allow_errors_ratio== - -Максимальное количество ошибок при чтении из текстовых форматов (таких как CSV или TSV). -В случае ошибки, если оба параметра не равны нулю и количество ошибок меньше соответствующего значения, ошибочная строка игнорируется и чтение продолжается со следующей строки. - - -==Ограничения на сложность запроса== - -Ограничения на сложность запроса - часть настроек. -Используются, чтобы обеспечить более безопасное исполнение запросов из пользовательского интерфейса. -Почти все ограничения действуют только на SELECT-ы. -При распределённой обработке запроса, ограничения действуют на каждом сервере по-отдельности. - -Ограничения вида "максимальное количество чего-нибудь" могут принимать значение 0, которое обозначает "не ограничено". -Для большинства ограничений также присутствует настройка вида overflow_mode - что делать, когда ограничение превышено. -Оно может принимать одно из двух значений: throw или break; а для ограничения на агрегацию (group_by_overflow_mode) есть ещё значение any. -throw - кинуть исключение (по умолчанию). -break - прервать выполнение запроса и вернуть неполный результат, как будто исходные данные закончились. -any (только для group_by_overflow_mode) - продолжить агрегацию по ключам, которые успели войти в набор, но не добавлять новые ключи в набор. - - -===readonly=== - -При значении 0 можно выполнять любые запросы. -При значении 1 можно выполнять только запросы на чтение (например, SELECT и SHOW). Запросы на запись и изменение настроек (INSERT, SET) запрещены. -При значении 2 можно выполнять запросы на чтение (SELECT, SHOW) и изменение настроек (SET). - -Включив режим readonly, вы уже не сможете выключить его в текущей сессии. - -При использовании метода GET HTTP интерфейса, автоматически выставляется readonly = 1. То есть, для запросов, модифицирующие данные, можно использовать только метод POST. Сам запрос при этом можно отправлять как в теле POST-а, так и в параметре URL. - -===max_memory_usage=== - -Максимальное количество потребляемой памяти при выполнении запроса на одном сервере. По умолчанию - 10 GB. - -Настройка не учитывает объём свободной памяти или общий объём памяти на машине. -Ограничение действует на один запрос, в пределах одного сервера. -Текущее потребление оперативки для каждого запроса можно посмотреть с помощью SHOW PROCESSLIST. -Также отслеживается пиковое потребление оперативки для каждого запроса, и выводится в лог. - -Некоторые случаи потребления оперативки не отслеживаются: -- большие константы (например, очень длинная константная строка); -- состояния некоторых агрегатных функций; - -Потребление оперативки не полностью учитывается для состояний агрегатных функций min, max, any, anyLast, argMin, argMax от аргументов String и Array. - - -===max_rows_to_read=== - -Следующие ограничения могут проверяться на каждый блок (а не на каждую строку). То есть, ограничения могут быть немного нарушены. -При выполнении запроса в несколько потоков, следующие ограничения действуют в каждом потоке по-отдельности. - -Максимальное количество строчек, которое можно прочитать из таблицы при выполнении запроса. - -===max_bytes_to_read=== - -Максимальное количество байт (несжатых данных), которое можно прочитать из таблицы при выполнении запроса. 
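-
-Набросок: ограничения на чтение можно выставить для сессии перед тяжёлым запросом (имя таблицы hits условное):
-
-%%
-SET max_rows_to_read = 1000000000;
-SET max_bytes_to_read = 100000000000;
-
-SELECT count() FROM hits
-%%
-
-Что произойдёт при превышении ограничения, определяется настройкой read_overflow_mode, описанной ниже.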
- -===read_overflow_mode=== - -Что делать, когда количество прочитанных данных превысило одно из ограничений: throw или break. По умолчанию: throw. - -===max_rows_to_group_by=== - -Максимальное количество уникальных ключей, получаемых в процессе агрегации. Позволяет ограничить потребление оперативки при агрегации. - -===group_by_overflow_mode=== - -Что делать, когда количество уникальных ключей при агрегации превысило ограничение: throw, break или any. По умолчанию: throw. -Использование значения any позволяет выполнить GROUP BY приближённо. Качество такого приближённого вычисления сильно зависит от статистических свойств данных. - -===max_rows_to_sort=== - -Максимальное количество строк до сортировки. Позволяет ограничить потребление оперативки при сортировке. - -===max_bytes_to_sort=== - -Максимальное количество байт до сортировки. - -===sort_overflow_mode=== - -Что делать, если количество строк, полученное перед сортировкой, превысило одно из ограничений: throw или break. По умолчанию: throw. - -===max_result_rows=== - -Ограничение на количество строк результата. Проверяются также для подзапросов и на удалённых серверах при выполнении части распределённого запроса. - -===max_result_bytes=== - -Ограничение на количество байт результата. Аналогично. - -===result_overflow_mode=== - -Что делать, если объём результата превысил одно из ограничений: throw или break. По умолчанию: throw. -Использование break по смыслу похоже на LIMIT. - -===max_execution_time=== - -Максимальное время выполнения запроса в секундах. -На данный момент не проверяется при одной из стадий сортировки а также при слиянии и финализации агрегатных функций. - -===timeout_overflow_mode=== - -Что делать, если запрос выполняется дольше max_execution_time: throw или break. По умолчанию: throw. - -===min_execution_speed=== - -Минимальная скорость выполнения запроса в строчках в секунду. Проверяется на каждый блок данных по истечении timeout_before_checking_execution_speed. Если скорость выполнения запроса оказывается меньше, то кидается исключение. - -===timeout_before_checking_execution_speed=== - -Проверять, что скорость выполнения запроса не слишком низкая (не меньше min_execution_speed), после прошествия указанного времени в секундах. - -===max_columns_to_read=== - -Максимальное количество столбцов, которых можно читать из таблицы в одном запросе. Если запрос требует чтения большего количества столбцов - кинуть исключение. - -===max_temporary_columns=== - -Максимальное количество временных столбцов, которых необходимо одновременно держать в оперативке, в процессе выполнения запроса, включая константные столбцы. Если временных столбцов оказалось больше - кидается исключение. - -===max_temporary_non_const_columns=== - -То же самое, что и max_temporary_columns, но без учёта столбцов-констант. -Стоит заметить, что столбцы-константы довольно часто образуются в процессе выполнения запроса, но расходуют примерно нулевое количество вычислительных ресурсов. - -===max_subquery_depth=== - -Максимальная вложенность подзапросов. Если подзапросы более глубокие - кидается исключение. По умолчанию: 100. - -===max_pipeline_depth=== - -Максимальная глубина конвейера выполнения запроса. Соответствует количеству преобразований, которое проходит каждый блок данных в процессе выполнения запроса. Считается в пределах одного сервера. Если глубина конвейера больше - кидается исключение. По умолчанию: 1000. - -===max_ast_depth=== - -Максимальная вложенность синтаксического дерева запроса. Если превышена - кидается исключение. 
-На данный момент, проверяются не во время парсинга а уже после парсинга запроса. То есть, во время парсинга может быть создано слишком глубокое синтаксическое дерево, но запрос не будет выполнен. По умолчанию: 1000. - -===max_ast_elements=== - -Максимальное количество элементов синтаксического дерева запроса. Если превышено - кидается исключение. -Аналогично, проверяется уже после парсинга запроса. По умолчанию: 50 000. - -===max_rows_in_set=== - -Максимальное количество строчек для множества в секции IN, создаваемого из подзапроса. - -===max_bytes_in_set=== - -Максимальное количество байт (несжатых данных), занимаемое множеством в секции IN, создаваемым из подзапроса. - -===set_overflow_mode=== - -Что делать, когда количество данных превысило одно из ограничений: throw или break. По умолчанию: throw. - -===max_rows_in_distinct=== - -Максимальное количество различных строчек при использовании DISTINCT. - -===max_bytes_in_distinct=== - -Максимальное количество байт, занимаемых хэш-таблицей, при использовании DISTINCT. - -===distinct_overflow_mode=== - -Что делать, когда количество данных превысило одно из ограничений: throw или break. По умолчанию: throw. - -===max_rows_to_transfer=== - -Максимальное количество строчек, которых можно передать на удалённый сервер или сохранить во временную таблицу, при использовании GLOBAL IN. - -===max_bytes_to_transfer=== - -Максимальное количество байт (несжатых данных), которых можно передать на удалённый сервер или сохранить во временную таблицу, при использовании GLOBAL IN. - -===transfer_overflow_mode=== - -Что делать, когда количество данных превысило одно из ограничений: throw или break. По умолчанию: throw. - - -==Профили настроек== - -Профили настроек - это множество настроек, сгруппированных под одним именем. Для каждого пользователя ClickHouse указывается некоторый профиль. -Все настройки профиля можно применить, установив настройку с именем profile. Пример: - -%% -SET profile = 'web' -%% - -- установить профиль web - то есть, установить все настройки, относящиеся к профилю web. - -Профили настроек объявляются в конфигурационном файле пользователей. Обычно это - users.xml. Пример: - -%% -<!-- Профили настроек. --> -<profiles> - <!-- Настройки по умолчанию --> - <default> - <!-- Максимальное количество потоков при выполнении одного запроса. --> - <max_threads>8</max_threads> - </default> - - <!-- Настройки для запросов из пользовательского интерфейса --> - <web> - <max_rows_to_read>1000000000</max_rows_to_read> - <max_bytes_to_read>100000000000</max_bytes_to_read> - - <max_rows_to_group_by>1000000</max_rows_to_group_by> - <group_by_overflow_mode>any</group_by_overflow_mode> - - <max_rows_to_sort>1000000</max_rows_to_sort> - <max_bytes_to_sort>1000000000</max_bytes_to_sort> - - <max_result_rows>100000</max_result_rows> - <max_result_bytes>100000000</max_result_bytes> - <result_overflow_mode>break</result_overflow_mode> - - <max_execution_time>600</max_execution_time> - <min_execution_speed>1000000</min_execution_speed> - <timeout_before_checking_execution_speed>15</timeout_before_checking_execution_speed> - - <max_columns_to_read>25</max_columns_to_read> - <max_temporary_columns>100</max_temporary_columns> - <max_temporary_non_const_columns>50</max_temporary_non_const_columns> - - <max_subquery_depth>2</max_subquery_depth> - <max_pipeline_depth>25</max_pipeline_depth> - <max_ast_depth>50</max_ast_depth> - <max_ast_elements>100</max_ast_elements> - - <readonly>1</readonly> - </web> -</profiles> -%% - -В примере задано два профиля: default и web. 
Профиль default имеет специальное значение - он всегда обязан присутствовать и применяется при запуске сервера. То есть, профиль default содержит настройки по умолчанию. Профиль web - обычный профиль, который может быть установлен с помощью запроса SET или с помощью параметра URL при запросе по HTTP. - -Профили настроек могут наследоваться друг от друга - это реализуется указанием настройки profile перед остальными настройками, перечисленными в профиле. - -
    -
    -

    Конфигурационные файлы

    -
    -
    - -Основной конфигурационный файл сервера - config.xml. Он расположен в директории /etc/clickhouse-server/. - -Отдельные настройки могут быть переопределены в файлах *.xml и *.conf из директорий conf.d и config.d рядом с конфигом. -У элементов этих конфигурационных файлов могут быть указаны атрибуты replace или remove. -Если ни один не указан - объединить содержимое элементов рекурсивно с заменой значений совпадающих детей. -Если указано replace - заменить весь элемент на указанный. -Если указано remove - удалить элемент. - -Также в конфиге могут быть указаны "подстановки". Если у элемента присутствует атрибут incl, то в качестве значения будет использована соответствующая подстановка из файла. По умолчанию, путь к файлу с подстановками - /etc/metrika.xml. Он может быть изменён в конфиге в элементе include_from. Значения подстановок указываются в элементах /yandex/имя_подстановки этого файла. - -Подстановки могут также выполняться из ZooKeeper. Для этого укажите у элемента атрибут from_zk="/path/to/node". Значение элемента заменится на содержимое узла /path/to/node в ZooKeeper. В ZooKeeper-узел также можно положить целое XML-поддерево, оно будет вставлено в исходный элемент. - -В config.xml может быть указан отдельный конфиг с настройками пользователей, профилей и квот. Относительный путь к нему указывается в элементе users_config. По умолчанию - users.xml. Если users_config не указан, то настройки пользователей, профилей и квот, указываются непосредственно в config.xml. Для users_config могут также существовать переопределения в файлах из директории users_config.d (например, users.d) и подстановки. - -Для каждого конфигурационного файла, сервер при запуске генерирует файлы file-preprocessed.xml. Эти файлы содержат все выполненные подстановки и переопределения, и предназначены для информационных целей. Если в конфигурационных файлах были использованы ZooKeeper-подстановки, но при старте сервера ZooKeeper недоступен, то сервер загрузит конфигурацию из preprocessed-файла. - -Сервер следит за изменениями конфигурационных файлов, а также файлов и ZooKeeper-узлов, которые были использованы при выполнении подстановок и переопределений, и перезагружает настройки пользователей и кластеров на лету. То есть, можно изменять кластера, пользователей и их настройки без перезапуска сервера. - -
    -
    -

    Права доступа

    -
    -
    - -Пользователи и права доступа настраиваются в конфиге пользователей. Обычно это users.xml. - -Пользователи прописаны в секции users. Рассмотрим фрагмент файла users.xml: - -%% -<!-- Пользователи и ACL. --> -<users> - <!-- Если имя пользователя не указано, используется пользователь default. --> - <default> - <!-- Password could be specified in plaintext or in SHA256 (in hex format). - - If you want to specify password in plaintext (not recommended), place it in 'password' element. - Example: <password>qwerty</password>. - Password could be empty. - - If you want to specify SHA256, place it in 'password_sha256_hex' element. - Example: <password_sha256_hex>65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5</password_sha256_hex> - - How to generate decent password: - Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-' - In first line will be password and in second - corresponding SHA256. - --> - <password></password> - - <!-- Список сетей, из которых разрешён доступ. - Каждый элемент списка имеет одну из следующих форм: - <ip> IP-адрес или маска подсети. Например, 222.111.222.3 или 10.0.0.1/8 или 2a02:6b8::3 или 2a02:6b8::3/64. - <host> Имя хоста. Например: example01. Для проверки делается DNS-запрос, и все полученные адреса сравниваются с адресом клиента. - <host_regexp> Регулярное выражение для имён хостов. Например, ^example\d\d-\d\d-\d\.yandex\.ru$ - Для проверки, для адреса клиента делается DNS PTR-запрос и к результату применяется регулярное выражение. - Потом для результата PTR-запроса делается снова DNS-запрос, и все полученные адреса сравниваются с адресом клиента. - Настоятельно рекомендуется, чтобы регулярное выражение заканчивалось на \.yandex\.ru$. - - Если вы устанавливаете ClickHouse самостоятельно, укажите здесь: - <networks> - <ip>::/0</ip> - </networks> - --> - <networks incl="networks" /> - - <!-- Профиль настроек, использующийся для пользователя. --> - <profile>default</profile> - - <!-- Квота, использующаяся для пользователя. --> - <quota>default</quota> - </default> - - <!-- Для запросов из пользовательского интерфейса Метрики через API для данных по отдельным счётчикам. --> - <web> - <password></password> - <networks incl="networks" /> - <profile>web</profile> - <quota>default</quota> - </web> -%% - -Здесь видно объявление двух пользователей - default и web. Пользователя web мы добавили самостоятельно. -Пользователь default выбирается в случаях, когда имя пользователя не передаётся, поэтому такой пользователь должен присутствовать в конфигурационном файле обязательно. Также пользователь default используется при распределённой обработки запроса - система ходит на удалённые серверы под ним. Поэтому, у пользователя default должен быть пустой пароль и не должно быть выставлено существенных ограничений или квот - иначе распределённые запросы сломаются. - -Пароль указывается либо в открытом виде (не рекомендуется), либо в виде SHA-256. Хэш не содержит соль. В связи с этим, не следует рассматривать такие пароли, как защиту от потенциального злоумышленника. Скорее, они нужны для защиты от сотрудников. - -Указывается список сетей, из которых разрешён доступ. В этом примере, список сетей для обеих пользователей, загружается из отдельного файла (/etc/metrika.xml), содержащего подстановку networks. Вот его фрагмент: - -%% -<yandex> - ... - <networks> - <ip>::/64</ip> - <ip>93.111.222.128/26</ip> - <ip>2a02:6b8:0:111::/64</ip> - ... 
- </networks> -</yandex> -%% - -Можно было бы указать этот список сетей непосредственно в users.xml, или в файле в директории users.d (подробнее смотрите раздел "Конфигурационные файлы"). - -В конфиге приведён комментарий, указывающий, как можно открыть доступ отовсюду. - -Для продакшен использования, указывайте только элементы вида ip (IP-адреса и их маски), так как использование host и host_regexp может вызывать лишние задержки. - -Далее указывается используемый профиль настроек пользователя (смотрите раздел "Профили настроек"). Вы можете указать профиль по умолчанию - default. Профиль может называться как угодно; один и тот же профиль может быть указан для разных пользователей. Наиболее важная вещь, которую вы можете прописать в профиле настроек - настройку readonly, равную 1, что обеспечивает доступ только на чтение. - -Затем указывается используемая квота (смотрите раздел "Квоты"). Вы можете указать квоту по умолчанию - default. Она настроена в конфиге по умолчанию так, что только считает использование ресурсов, но никак их не ограничивает. Квота может называться как угодно; одна и та же квота может быть указана для разных пользователей - в этом случае, подсчёт использования ресурсов делается для каждого пользователя по отдельности. - -
    -
    -

    Квоты

    -
    -
    - -Квоты позволяют ограничить использование ресурсов за некоторый интервал времени, или просто подсчитывать использование ресурсов. -Квоты настраиваются в конфиге пользователей. Обычно это users.xml. - -В системе есть возможность ограничить сложность одного запроса. Для этого смотрите раздел "Ограничения на сложность запроса". -В отличие от них, квоты: -- ограничивают не один запрос, а множество запросов, которые могут быть выполнены за интервал времени; -- при распределённой обработке запроса, учитывают ресурсы, потраченные на всех удалённых серверах. - -Рассмотрим фрагмент файла users.xml, описывающего квоты. - -%% -<!-- Квоты. --> -<quotas> - <!-- Имя квоты. --> - <default> - <!-- Ограничения за интервал времени. Можно задать много интервалов с разными ограничениями. --> - <interval> - <!-- Длина интервала. --> - <duration>3600</duration> - - <!-- Без ограничений. Просто считать соответствующие данные за указанный интервал. --> - <queries>0</queries> - <errors>0</errors> - <result_rows>0</result_rows> - <read_rows>0</read_rows> - <execution_time>0</execution_time> - </interval> - </default> -%% - -Видно, что квота по умолчанию просто считает использование ресурсов за каждый час, но не ограничивает их. -Подсчитанное использование ресурсов за каждый интервал, выводится в лог сервера после каждого запроса. - -%% - <statbox> - <!-- Ограничения за интервал времени. Можно задать много интервалов с разными ограничениями. --> - <interval> - <!-- Длина интервала. --> - <duration>3600</duration> - - <queries>1000</queries> - <errors>100</errors> - <result_rows>1000000000</result_rows> - <read_rows>100000000000</read_rows> - <execution_time>900</execution_time> - </interval> - - <interval> - <duration>86400</duration> - - <queries>10000</queries> - <errors>1000</errors> - <result_rows>5000000000</result_rows> - <read_rows>500000000000</read_rows> - <execution_time>7200</execution_time> - </interval> - </statbox> -%% - -Для квоты с именем statbox заданы ограничения за каждый час и за каждые 24 часа (86 400 секунд). Интервал времени считается начиная от некоторого implementation defined фиксированного момента времени. То есть, интервал длины 24 часа начинается не обязательно в полночь. - -Когда интервал заканчивается, все накопленные значения сбрасываются. То есть, в следующий час, расчёт квоты за час, начинается заново. - -Рассмотрим величины, которые можно ограничить: - -queries - общее количество запросов; -errors - количество запросов, при выполнении которых было выкинуто исключение; -result_rows - суммарное количество строк, отданных в виде результата; -read_rows - суммарное количество исходных строк, прочитанных из таблиц, для выполнения запроса, на всех удалённых серверах; -execution_time - суммарное время выполнения запросов, в секундах (wall time); - -Если за хотя бы один интервал, ограничение превышено, то кидается исключение с текстом о том, какая величина превышена, за какой интервал, и когда начнётся новый интервал (когда снова можно будет задавать запросы). - -Для квоты может быть включена возможность указывать "ключ квоты", чтобы производить учёт ресурсов для многих ключей независимо. Рассмотрим это на примере: - -%% - <!-- Для глобального конструктора отчётов. --> - <web_global> - <!-- keyed - значит в параметре запроса передаётся "ключ" quota_key, - и квота считается по отдельности для каждого значения ключа. - Например, в качестве ключа может передаваться логин пользователя в Метрике, - и тогда квота будет считаться для каждого логина по отдельности. 
- Имеет смысл использовать только если quota_key передаётся не пользователем, а программой. - - Также можно написать <keyed_by_ip /> - тогда в качестве ключа квоты используется IP-адрес. - (но стоит учесть, что пользователь может достаточно легко менять IPv6-адрес) - --> - <keyed /> -%% - -Квота прописывается для пользователей в секции users конфига. Смотрите раздел "Права доступа". - -При распределённой обработке запроса, накопленные величины хранятся на сервере-инициаторе запроса. То есть, если пользователь пойдёт на другой сервер - там квота будет действовать "с нуля". - -При перезапуске сервера, квоты сбрасываются. -
    - - -
    - -Яндекс.Метрика - -
    - - - - - From 2b5d8d53d9c5659c9eb3bc47fca0f48585327fe4 Mon Sep 17 00:00:00 2001 From: Yuriy Date: Mon, 23 Sep 2019 21:02:57 +0300 Subject: [PATCH 206/309] updated mariadb-connector-c --- contrib/mariadb-connector-c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/mariadb-connector-c b/contrib/mariadb-connector-c index 4d473f89bb8..18016300b00 160000 --- a/contrib/mariadb-connector-c +++ b/contrib/mariadb-connector-c @@ -1 +1 @@ -Subproject commit 4d473f89bb86ae485a116f6271201b214d0ac4cc +Subproject commit 18016300b00825a3fcbc6fb2aa37ac3e51416f71 From 8407ee17c6e5bd178e5082934e07812a719c9a46 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 23 Sep 2019 21:06:32 +0300 Subject: [PATCH 207/309] Reverted unrelated modification --- dbms/src/Common/DiskSpaceMonitor.cpp | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/dbms/src/Common/DiskSpaceMonitor.cpp b/dbms/src/Common/DiskSpaceMonitor.cpp index c8347b5d106..5b07e11f31b 100644 --- a/dbms/src/Common/DiskSpaceMonitor.cpp +++ b/dbms/src/Common/DiskSpaceMonitor.cpp @@ -1,12 +1,10 @@ #include -#include -#include #include +#include #include - namespace DB { @@ -47,7 +45,7 @@ std::filesystem::path getMountPoint(std::filesystem::path absolute_path) return absolute_path; } -/// Returns name of filesystem mounted to mount_point + /// Returns name of filesystem mounted to mount_point #if !defined(__linux__) [[noreturn]] #endif @@ -67,7 +65,7 @@ std::string getFilesystemName([[maybe_unused]] const std::string & mount_point) throw DB::Exception("Cannot find name of filesystem by mount point " + mount_point, ErrorCodes::SYSTEM_ERROR); return fs_info.mnt_fsname; #else - throw DB::Exception("The function getFilesystemName is supported on Linux only", ErrorCodes::NOT_IMPLEMENTED); + throw DB::Exception("Supported on linux only", ErrorCodes::NOT_IMPLEMENTED); #endif } @@ -84,7 +82,7 @@ bool Disk::tryReserve(UInt64 bytes) const std::lock_guard lock(mutex); if (bytes == 0) { - LOG_DEBUG(&Logger::get("DiskSpaceMonitor"), "Reserving 0 bytes on disk " << backQuote(name)); + LOG_DEBUG(&Logger::get("DiskSpaceMonitor"), "Reserving 0 bytes on disk " << name); ++reservation_count; return true; } @@ -95,8 +93,7 @@ bool Disk::tryReserve(UInt64 bytes) const { LOG_DEBUG( &Logger::get("DiskSpaceMonitor"), - "Reserving " << formatReadableSizeWithBinarySuffix(bytes) << " on disk " << backQuote(name) - << ", having unreserved " << formatReadableSizeWithBinarySuffix(unreserved_space) << "."); + "Reserving " << bytes << " bytes on disk " << name << " having unreserved " << unreserved_space << " bytes."); ++reservation_count; reserved_bytes += bytes; return true; @@ -286,14 +283,14 @@ Volume::Volume( max_data_part_size = static_cast(sum_size * ratio / disks.size()); for (size_t i = 0; i < disks.size(); ++i) if (sizes[i] < max_data_part_size) - LOG_WARNING(logger, "Disk " << backQuote(disks[i]->getName()) << " on volume " << backQuote(config_prefix) << - " have not enough space (" << formatReadableSizeWithBinarySuffix(sizes[i]) << + LOG_WARNING(logger, "Disk " << disks[i]->getName() << " on volume " << config_prefix << + " have not enough space (" << sizes[i] << ") for containing part the size of max_data_part_size (" << - formatReadableSizeWithBinarySuffix(max_data_part_size) << ")"); + max_data_part_size << ")"); } constexpr UInt64 MIN_PART_SIZE = 8u * 1024u * 1024u; if (max_data_part_size < MIN_PART_SIZE) - LOG_WARNING(logger, "Volume " << backQuote(name) << " max_data_part_size is too low (" + 
LOG_WARNING(logger, "Volume '" << name << "' max_data_part_size is too low (" << formatReadableSizeWithBinarySuffix(max_data_part_size) << " < " << formatReadableSizeWithBinarySuffix(MIN_PART_SIZE) << ")"); } @@ -508,7 +505,7 @@ StoragePolicySelector::StoragePolicySelector( ErrorCodes::EXCESSIVE_ELEMENT_IN_CONFIG); policies.emplace(name, std::make_shared(name, config, config_prefix + "." + name, disks)); - LOG_INFO(&Logger::get("StoragePolicySelector"), "Storage policy " << backQuote(name) << " loaded"); + LOG_INFO(&Logger::get("StoragePolicySelector"), "Storage policy " << name << " loaded"); } constexpr auto default_storage_policy_name = "default"; From 8ca6d6ed5db4294bb9e0d0dab15d57fce64f4ff4 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Mon, 23 Sep 2019 21:00:13 +0300 Subject: [PATCH 208/309] Prepare for MemorySanitizer build in CI. This is a preparatory commit just to get the msan build running in some shape. It disables many external libraries that are not yet tested. --- cmake/sanitize.cmake | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index 9e8ef3e857a..04f2b80d346 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -14,8 +14,11 @@ if (SANITIZE) endif () elseif (SANITIZE STREQUAL "memory") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} -fsanitize=memory -fsanitize-memory-track-origins") - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} -fsanitize=memory -fsanitize-memory-track-origins") + set (MSAN_FLAGS "-fsanitize=memory -fsanitize-memory-track-origins -fno-optimize-sibling-calls") + + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${MSAN_FLAGS}") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${MSAN_FLAGS}") + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=memory") endif() @@ -23,6 +26,28 @@ if (SANITIZE) set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libmsan") endif () + # Temporarily disable many external libraries that don't work under + # MemorySanitizer yet. 
+ set (ENABLE_HDFS 0 CACHE BOOL "") + set (ENABLE_CAPNP 0 CACHE BOOL "") + set (ENABLE_RDKAFKA 0 CACHE BOOL "") + set (ENABLE_ICU 0 CACHE BOOL "") + set (ENABLE_POCO_MONGODB 0 CACHE BOOL "") + set (ENABLE_POCO_NETSSL 0 CACHE BOOL "") + set (ENABLE_POCO_ODBC 0 CACHE BOOL "") + set (ENABLE_ODBC 0 CACHE BOOL "") + set (ENABLE_MYSQL 0 CACHE BOOL "") + set (ENABLE_EMBEDDED_COMPILER 0 CACHE BOOL "") + set (USE_INTERNAL_CAPNP_LIBRARY 0 CACHE BOOL "") + set (USE_SIMDJSON 0 CACHE BOOL "") + set (ENABLE_READLINE 0 CACHE BOOL "") + set (ENABLE_ORC 0 CACHE BOOL "") + set (ENABLE_PARQUET 0 CACHE BOOL "") + set (USE_CAPNP 0 CACHE BOOL "") + set (USE_INTERNAL_ORC_LIBRARY 0 CACHE BOOL "") + set (USE_ORC 0 CACHE BOOL "") + set (ENABLE_SSL 0 CACHE BOOL "") + elseif (SANITIZE STREQUAL "thread") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} -fsanitize=thread") set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} -fsanitize=thread") From e7b675a301fa4740f4ffbe18062887703e536c74 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Mon, 23 Sep 2019 21:15:21 +0300 Subject: [PATCH 209/309] Update ReadWriteBufferFromHTTP.cpp --- dbms/src/IO/ReadWriteBufferFromHTTP.cpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.cpp b/dbms/src/IO/ReadWriteBufferFromHTTP.cpp index 89e87020012..4d046bfe2c6 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.cpp +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.cpp @@ -1,13 +1 @@ #include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int TOO_MANY_REDIRECTS; -} - -} - From faf6e06f56f5ecfb2c25bfff5455b33150343fdd Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Mon, 23 Sep 2019 21:16:34 +0300 Subject: [PATCH 210/309] Update ReadWriteBufferFromHTTP.h --- dbms/src/IO/ReadWriteBufferFromHTTP.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.h b/dbms/src/IO/ReadWriteBufferFromHTTP.h index 62db3c22a2c..d2140e14792 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.h +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.h @@ -61,7 +61,8 @@ public: void updateSession(const Poco::URI & uri) { - if (redirects++ From 6cc2f8da97a0815b45def58f6a18149aa956ba37 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Mon, 23 Sep 2019 21:22:10 +0300 Subject: [PATCH 211/309] Add 'clang-8-darwin' choice and update Docker build image.
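Cross-building for Darwin from Linux boils down to pointing CMake at the cctools binaries and the macOS SDK installed in the image. The fragment below is only an illustrative sketch of those toolchain variables; the paths and the `LINKER_NAME`/`SDK_PATH` names are assumptions taken from the packager flags added in this patch, not an official toolchain file.

```cmake
# Illustrative only: toolchain variables for cross-linking to x86_64 Darwin
# with cctools; the paths assume the layout of the Docker image built below.
set (CMAKE_SYSTEM_NAME Darwin)
set (CMAKE_AR /cctools/bin/x86_64-apple-darwin-ar CACHE FILEPATH "")
set (CMAKE_RANLIB /cctools/bin/x86_64-apple-darwin-ranlib CACHE FILEPATH "")
# LINKER_NAME and SDK_PATH are ClickHouse build options named in the packager changes below.
set (LINKER_NAME /cctools/bin/x86_64-apple-darwin-ld)
set (SDK_PATH /cctools/MacOSX10.14.sdk)
```

In the packager script these same values are passed as `-D` flags on the CMake command line rather than collected in a toolchain file.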
--- docker/packager/binary/Dockerfile | 18 +++++++++++++++++- docker/packager/packager | 11 +++++++++-- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index 82fa93ec570..1ffeab01440 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -57,7 +57,23 @@ RUN apt-get update -y \ gperf \ cmake \ gdb \ - rename + rename \ + wget + +# Build and install tools for cross-linking to Darwin + +ENV CC=clang-8 +ENV CXX=clang++-8 + +# libtapi is required to support .tbh format from recent MacOS SDKs +RUN git clone https://github.com/tpoechtrager/apple-libtapi.git +RUN cd apple-libtapi && INSTALLPREFIX=/cctools ./build.sh && ./install.sh + +RUN git clone https://github.com/tpoechtrager/cctools-port.git +RUN cd cctools-port/cctools && ./configure --prefix=/cctools --with-libtapi=/cctools --target=x86_64-apple-darwin && make install + +RUN wget https://github.com/phracker/MacOSX-SDKs/releases/download/10.14-beta4/MacOSX10.14.sdk.tar.xz +RUN tar xJf MacOSX10.14.sdk.tar.xz -C /cctools COPY build.sh / CMD ["/bin/bash", "/build.sh"] diff --git a/docker/packager/packager b/docker/packager/packager index 733cbff71d1..f9cd6974c5d 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -118,6 +118,13 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, cache, di cmake_flags.append('-DCMAKE_C_COMPILER=`which {}`'.format(cc)) cmake_flags.append('-DCMAKE_CXX_COMPILER=`which {}`'.format(cxx)) + if "darwin" in compiler: + cmake_flags.append("-DCMAKE_AR:FILEPATH=/cctools/bin/x86_64-apple-darwin-ar") \ + .append("-DCMAKE_RANLIB:FILEPATH=/cctools/bin/x86_64-apple-darwin-ranlib") \ + .append("-DCMAKE_SYSTEM_NAME=Darwin") \ + .append("-DSDK_PATH=/cctools/MacOSX10.14.sdk") \ + .append("-DLINKER_NAME=/cctools/bin/x86_64-apple-darwin-ld") + if sanitizer: result.append("SANITIZER={}".format(sanitizer)) if build_type: @@ -166,12 +173,12 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, cache, di if __name__ == "__main__": logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') - parser = argparse.ArgumentParser(description="ClickHouse building script via virtualization mechanisms") + parser = argparse.ArgumentParser(description="ClickHouse building script using prebuilt Docker image") parser.add_argument("--package-type", choices=IMAGE_MAP.keys(), required=True) parser.add_argument("--clickhouse-repo-path", default="../../") parser.add_argument("--output-dir", required=True) parser.add_argument("--build-type", choices=("debug", ""), default="") - parser.add_argument("--compiler", choices=("clang-6.0", "clang-7", "gcc-7", "clang-8", "gcc-8", "gcc-9"), default="gcc-7") + parser.add_argument("--compiler", choices=("clang-6.0", "clang-7", "gcc-7", "clang-8", "clang-8-darwin", "gcc-8", "gcc-9"), default="gcc-7") parser.add_argument("--sanitizer", choices=("address", "thread", "memory", "undefined", ""), default="") parser.add_argument("--unbundled", action="store_true") parser.add_argument("--split-binary", action="store_true") From 11e7ea51e9d2023f1e8bf2f51f6e2f5486291206 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 23 Sep 2019 22:22:28 +0300 Subject: [PATCH 212/309] Fixed compatibility for distributed queries between 19.14 and earlier versions #7068 --- dbms/src/Interpreters/ClusterProxy/executeQuery.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git 
a/dbms/src/Interpreters/ClusterProxy/executeQuery.cpp b/dbms/src/Interpreters/ClusterProxy/executeQuery.cpp index 989595b3647..9a0494cca45 100644 --- a/dbms/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/dbms/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -18,8 +18,6 @@ Context removeUserRestrictionsFromSettings(const Context & context, const Settin { Settings new_settings = settings; new_settings.queue_max_wait_ms = Cluster::saturate(new_settings.queue_max_wait_ms, settings.max_execution_time); - new_settings.connection_pool_max_wait_ms = Cluster::saturate(new_settings.connection_pool_max_wait_ms, settings.max_execution_time); - new_settings.replace_running_query_max_wait_ms = Cluster::saturate(new_settings.replace_running_query_max_wait_ms, settings.max_execution_time); /// Does not matter on remote servers, because queries are sent under different user. new_settings.max_concurrent_queries_for_user = 0; @@ -39,8 +37,8 @@ Context removeUserRestrictionsFromSettings(const Context & context, const Settin } BlockInputStreams executeQuery( - IStreamFactory & stream_factory, const ClusterPtr & cluster, - const ASTPtr & query_ast, const Context & context, const Settings & settings) + IStreamFactory & stream_factory, const ClusterPtr & cluster, + const ASTPtr & query_ast, const Context & context, const Settings & settings) { BlockInputStreams res; From 630872837ea38dff82576551d1fbd9bea95775ee Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 23 Sep 2019 22:26:04 +0300 Subject: [PATCH 213/309] Skip null columns while checking num rows. --- dbms/src/Core/Block.cpp | 5 ++++- dbms/src/Core/Block.h | 2 +- dbms/src/Functions/IFunction.cpp | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dbms/src/Core/Block.cpp b/dbms/src/Core/Block.cpp index b045b9ec1ff..c64cf387a3b 100644 --- a/dbms/src/Core/Block.cpp +++ b/dbms/src/Core/Block.cpp @@ -219,11 +219,14 @@ size_t Block::getPositionByName(const std::string & name) const } -void Block::checkNumberOfRows() const +void Block::checkNumberOfRows(bool allow_null_columns) const { ssize_t rows = -1; for (const auto & elem : data) { + if (!elem.column && allow_null_columns) + continue; + if (!elem.column) throw Exception("Column " + elem.name + " in block is nullptr, in method checkNumberOfRows." , ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); diff --git a/dbms/src/Core/Block.h b/dbms/src/Core/Block.h index 4a93e5ed803..ae8b07718dd 100644 --- a/dbms/src/Core/Block.h +++ b/dbms/src/Core/Block.h @@ -90,7 +90,7 @@ public: size_t columns() const { return data.size(); } /// Checks that every column in block is not nullptr and has same number of elements. - void checkNumberOfRows() const; + void checkNumberOfRows(bool allow_null_columns = false) const; /// Approximate number of bytes in memory - for profiling and limits.
size_t bytes() const; diff --git a/dbms/src/Functions/IFunction.cpp b/dbms/src/Functions/IFunction.cpp index af2a9db02b3..9a3633a9790 100644 --- a/dbms/src/Functions/IFunction.cpp +++ b/dbms/src/Functions/IFunction.cpp @@ -385,7 +385,7 @@ static ColumnPtr replaceLowCardinalityColumnsByNestedAndGetDictionaryIndexes( } #ifndef NDEBUG - block.checkNumberOfRows(); + block.checkNumberOfRows(true); #endif return indexes; From e9e10670da4bda8d075d126d2a6922d185a3f213 Mon Sep 17 00:00:00 2001 From: Ivan <5627721+abyss7@users.noreply.github.com> Date: Mon, 23 Sep 2019 22:29:47 +0300 Subject: [PATCH 214/309] Update Docker image for CI packaging Add support for cross-compilation to Darwin --- docker/packager/binary/Dockerfile | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index 82fa93ec570..2ad696cd279 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -57,7 +57,23 @@ RUN apt-get update -y \ gperf \ cmake \ gdb \ - rename + rename \ + wget + +# Build and install tools for cross-linking to Darwin + +ENV CC=clang-8 +ENV CXX=clang++-8 + +# libtapi is required to support .tbh format from recent MacOS SDKs +RUN git clone https://github.com/tpoechtrager/apple-libtapi.git +RUN cd apple-libtapi && INSTALLPREFIX=/cctools ./build.sh && ./install.sh + +RUN git clone https://github.com/tpoechtrager/cctools-port.git +RUN cd cctools-port/cctools && ./configure --prefix=/cctools --with-libtapi=/cctools --target=x86_64-apple-darwin && make install + +RUN wget https://github.com/phracker/MacOSX-SDKs/releases/download/10.14-beta4/MacOSX10.14.sdk.tar.xz +RUN tar xJf MacOSX10.14.sdk.tar.xz -C /cctools COPY build.sh / CMD ["/bin/bash", "/build.sh"] From 1f9b8418e0e955341213adf2837b6468939a7f15 Mon Sep 17 00:00:00 2001 From: chertus Date: Mon, 23 Sep 2019 22:36:47 +0300 Subject: [PATCH 215/309] pmj skip not intersected optimisation --- dbms/src/Core/Settings.h | 2 + dbms/src/Interpreters/AnalyzedJoin.cpp | 2 + dbms/src/Interpreters/AnalyzedJoin.h | 6 ++ dbms/src/Interpreters/MergeJoin.cpp | 82 ++++++++++++++++++++++++-- dbms/src/Interpreters/MergeJoin.h | 1 + 5 files changed, 89 insertions(+), 4 deletions(-) diff --git a/dbms/src/Core/Settings.h b/dbms/src/Core/Settings.h index cacaf883fb7..94ec494f138 100644 --- a/dbms/src/Core/Settings.h +++ b/dbms/src/Core/Settings.h @@ -289,6 +289,8 @@ struct Settings : public SettingsCollection M(SettingOverflowMode, join_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.") \ M(SettingBool, join_any_take_last_row, false, "When disabled (default) ANY JOIN will take the first found row for a key. 
When enabled, it will take the last row seen if there are multiple rows for the same key.") \ M(SettingBool, partial_merge_join, false, "Use partial merge join instead of hash join for LEFT and INNER JOINs.") \ + M(SettingBool, partial_merge_join_optimisations, false, "Enable optimisations in partial merge join") \ + M(SettingUInt64, partial_merge_join_rows_in_right_blocks, 10000, "Split right-hand joining data in blocks of specified size.") \ \ M(SettingUInt64, max_rows_to_transfer, 0, "Maximum size (in rows) of the transmitted external table obtained when the GLOBAL IN/JOIN section is executed.") \ M(SettingUInt64, max_bytes_to_transfer, 0, "Maximum size (in uncompressed bytes) of the transmitted external table obtained when the GLOBAL IN/JOIN section is executed.") \ diff --git a/dbms/src/Interpreters/AnalyzedJoin.cpp b/dbms/src/Interpreters/AnalyzedJoin.cpp index 53f763d54dd..cdf047fc5e5 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.cpp +++ b/dbms/src/Interpreters/AnalyzedJoin.cpp @@ -27,6 +27,8 @@ AnalyzedJoin::AnalyzedJoin(const Settings & settings) : size_limits(SizeLimits{settings.max_rows_in_join, settings.max_bytes_in_join, settings.join_overflow_mode}) , join_use_nulls(settings.join_use_nulls) , partial_merge_join(settings.partial_merge_join) + , partial_merge_join_optimisations(settings.partial_merge_join_optimisations) + , partial_merge_join_rows_in_right_blocks(settings.partial_merge_join_rows_in_right_blocks) {} void AnalyzedJoin::addUsingKey(const ASTPtr & ast) diff --git a/dbms/src/Interpreters/AnalyzedJoin.h b/dbms/src/Interpreters/AnalyzedJoin.h index 10075ec2792..f9d0d9d0f79 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.h +++ b/dbms/src/Interpreters/AnalyzedJoin.h @@ -40,6 +40,8 @@ class AnalyzedJoin const SizeLimits size_limits; const bool join_use_nulls; const bool partial_merge_join; + const bool partial_merge_join_optimisations; + const size_t partial_merge_join_rows_in_right_blocks; Names key_names_left; Names key_names_right; /// Duplicating names are qualified. 
@@ -66,6 +68,8 @@ public: : size_limits(limits) , join_use_nulls(use_nulls) , partial_merge_join(false) + , partial_merge_join_optimisations(false) + , partial_merge_join_rows_in_right_blocks(0) , key_names_right(key_names_right_) { table_join.kind = kind; @@ -78,6 +82,8 @@ public: bool forceNullableRight() const { return join_use_nulls && isLeftOrFull(table_join.kind); } bool forceNullableLeft() const { return join_use_nulls && isRightOrFull(table_join.kind); } + size_t maxRowsInRightBlock() const { return partial_merge_join_rows_in_right_blocks; } + bool enablePartialMergeJoinOptimisations() const { return partial_merge_join_optimisations; } void addUsingKey(const ASTPtr & ast); void addOnKeys(ASTPtr & left_table_ast, ASTPtr & right_table_ast); diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index b6a02a605ef..5168b9e13c4 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -15,6 +15,8 @@ namespace ErrorCodes { extern const int SET_SIZE_LIMIT_EXCEEDED; extern const int NOT_IMPLEMENTED; + extern const int PARAMETER_OUT_OF_BOUND; + extern const int LOGICAL_ERROR; } namespace @@ -60,6 +62,26 @@ int nullableCompareAt(const IColumn & left_column, const IColumn & right_column, return left_column.compareAt(lhs_pos, rhs_pos, right_column, null_direction_hint); } +Block extractMinMax(const Block & block, const Block & keys) +{ + if (block.rows() == 0) + throw Exception("Unexpected empty block", ErrorCodes::LOGICAL_ERROR); + + Block min_max = keys.cloneEmpty(); + MutableColumns columns = min_max.mutateColumns(); + + for (size_t i = 0; i < columns.size(); ++i) + { + auto & src_column = block.getByName(keys.getByPosition(i).name); + + columns[i]->insertFrom(*src_column.column, 0); + columns[i]->insertFrom(*src_column.column, block.rows() - 1); + } + + min_max.setColumns(std::move(columns)); + return min_max; +} + } struct MergeJoinEqualRange @@ -111,6 +133,35 @@ public: return getNextEqualRangeImpl(rhs); } + int intersect(const Block & right_block, const Block & right_table_keys) + { + const Block min_max = extractMinMax(right_block, right_table_keys); + if (end() == 0 || min_max.rows() != 2) + throw Exception("Unexpected block size", ErrorCodes::LOGICAL_ERROR); + + size_t last_position = end() - 1; + int first_vs_max = 0; + int last_vs_min = 0; + + for (size_t i = 0; i < impl.sort_columns.size(); ++i) + { + auto & left_column = *impl.sort_columns[i]; + auto & right_column = *min_max.getByPosition(i).column; + + if (!first_vs_max) + first_vs_max = nullableCompareAt(left_column, right_column, position(), 1); + + if (!last_vs_min) + last_vs_min = nullableCompareAt(left_column, right_column, last_position, 0); + } + + if (first_vs_max > 0) + return 1; + if (last_vs_min < 0) + return -1; + return 0; + } + private: SortCursorImpl impl; bool has_nullable_columns = false; @@ -279,6 +330,7 @@ MergeJoin::MergeJoin(std::shared_ptr table_join_, const Block & ri , is_all(table_join->strictness() == ASTTableJoin::Strictness::All) , is_inner(isInner(table_join->kind())) , is_left(isLeft(table_join->kind())) + , skip_not_intersected(table_join->enablePartialMergeJoinOptimisations()) { if (!isLeft(table_join->kind()) && !isInner(table_join->kind())) throw Exception("Partial merge supported for LEFT and INNER JOINs only", ErrorCodes::NOT_IMPLEMENTED); @@ -313,8 +365,6 @@ void MergeJoin::joinTotals(Block & block) const void MergeJoin::mergeRightBlocks() { - const size_t max_merged_block_size = 128 * 1024 * 1024; - if 
(right_blocks.empty()) return; @@ -323,12 +373,16 @@ void MergeJoin::mergeRightBlocks() for (const auto & block : right_blocks) unsorted_blocks.push_back(block); + size_t max_rows_in_block = table_join->maxRowsInRightBlock(); + if (!max_rows_in_block) + throw Exception("partial_merge_join_rows_in_right_blocks cannot be zero", ErrorCodes::PARAMETER_OUT_OF_BOUND); + /// TODO: there should be no splitted keys by blocks for RIGHT|FULL JOIN - MergeSortingBlocksBlockInputStream stream(unsorted_blocks, right_sort_description, max_merged_block_size); + MergeSortingBlocksBlockInputStream stream(unsorted_blocks, right_sort_description, max_rows_in_block); right_blocks.clear(); while (Block block = stream.read()) - right_blocks.push_back(block); + right_blocks.emplace_back(std::move(block)); } bool MergeJoin::addJoinedBlock(const Block & src_block) @@ -369,6 +423,16 @@ void MergeJoin::joinBlock(Block & block) { if (left_cursor.atEnd()) break; + + if (skip_not_intersected) + { + int intersection = left_cursor.intersect(*it, right_table_keys); + if (intersection < 0) + break; /// (left) ... (right) + if (intersection > 0) + continue; /// (right) ... (left) + } + leftJoin(left_cursor, block, *it, left_columns, right_columns, left_key_tail); } @@ -385,6 +449,16 @@ void MergeJoin::joinBlock(Block & block) { if (left_cursor.atEnd()) break; + + if (skip_not_intersected) + { + int intersection = left_cursor.intersect(*it, right_table_keys); + if (intersection < 0) + break; /// (left) ... (right) + if (intersection > 0) + continue; /// (right) ... (left) + } + innerJoin(left_cursor, block, *it, left_columns, right_columns, left_key_tail); } diff --git a/dbms/src/Interpreters/MergeJoin.h b/dbms/src/Interpreters/MergeJoin.h index 89165c70890..6d7d467fc8f 100644 --- a/dbms/src/Interpreters/MergeJoin.h +++ b/dbms/src/Interpreters/MergeJoin.h @@ -44,6 +44,7 @@ private: const bool is_all; const bool is_inner; const bool is_left; + const bool skip_not_intersected; void changeLeftColumns(Block & block, MutableColumns && columns); void addRightColumns(Block & block, MutableColumns && columns); From d187b5ed42e8034fbfa4b3543382d8f23cfac339 Mon Sep 17 00:00:00 2001 From: Silviu Caragea Date: Mon, 23 Sep 2019 22:47:12 +0300 Subject: [PATCH 216/309] Get proper stack address on osx --- dbms/src/Common/checkStackSize.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dbms/src/Common/checkStackSize.cpp b/dbms/src/Common/checkStackSize.cpp index 7459277b563..415d885103a 100644 --- a/dbms/src/Common/checkStackSize.cpp +++ b/dbms/src/Common/checkStackSize.cpp @@ -36,7 +36,9 @@ void checkStackSize() // Stack size for the main thread is 8MB on OSX excluding the guard page size. pthread_t thread = pthread_self(); max_stack_size = pthread_main_np() ? 
(8 * 1024 * 1024) : pthread_get_stacksize_np(thread); - stack_address = pthread_get_stackaddr_np(thread); + + // stack_address points to the start of the stack, not the end how it's returned by pthread_get_stackaddr_np + stack_address = reinterpret_cast(reinterpret_cast(pthread_get_stackaddr_np(thread)) - max_stack_size); #else pthread_attr_t attr; #if defined(__FreeBSD__) From c9252964c329e432cad30a14acc624d83ba64b03 Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Mon, 23 Sep 2019 23:36:51 +0200 Subject: [PATCH 217/309] add obsoletes section to clickhouse-server spec --- utils/release/release_lib.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/release/release_lib.sh b/utils/release/release_lib.sh index 4eaa4d4ebbd..823ba7f0cc9 100644 --- a/utils/release/release_lib.sh +++ b/utils/release/release_lib.sh @@ -231,6 +231,8 @@ function make_rpm { echo "Requires: clickhouse-common-static = $VERSION_FULL-2" >> ${PACKAGE}-$VERSION_FULL-2.spec echo "Requires: tzdata" >> ${PACKAGE}-$VERSION_FULL-2.spec echo "Requires: initscripts" >> ${PACKAGE}-$VERSION_FULL-2.spec + echo "Obsoletes: clickhouse-server-common < $VERSION_FULL" >> ${PACKAGE}-$VERSION_FULL-2.spec + cat ${PACKAGE}-$VERSION_FULL-2.spec_tmp >> ${PACKAGE}-$VERSION_FULL-2.spec rpm_pack From 834fe73badb6d6a3a8c53a17b1c81ac914b6473c Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Tue, 24 Sep 2019 02:27:24 +0300 Subject: [PATCH 218/309] Update hash_functions.md --- docs/en/query_language/functions/hash_functions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/query_language/functions/hash_functions.md b/docs/en/query_language/functions/hash_functions.md index b3410dd3b17..153d0c5f71f 100644 --- a/docs/en/query_language/functions/hash_functions.md +++ b/docs/en/query_language/functions/hash_functions.md @@ -177,7 +177,7 @@ SELECT farmHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:0 ## javaHash {#hash_functions-javahash} -Calculates [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) from a string. `JavaHash` does not ensure fast response and quality, so this function should be considered deprecated. Use this function if you need to get the hash value using the same algorithm. +Calculates [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452) from a string. This hash function is neither fast nor having a good quality. The only reason to use it is when this algorithm is already used in another system and you have to calculate exactly the same result. ```sql SELECT javaHash(''); @@ -213,7 +213,7 @@ Calculates `HiveHash` from a string. SELECT hiveHash(''); ``` -This is just [JavaHash](#hash_functions-javahash) with zeroed out sign bit. This function is used in [Apache Hive](https://en.wikipedia.org/wiki/Apache_Hive) for versions before 3.0. +This is just [JavaHash](#hash_functions-javahash) with zeroed out sign bit. This function is used in [Apache Hive](https://en.wikipedia.org/wiki/Apache_Hive) for versions before 3.0. This hash function is neither fast nor having a good quality. The only reason to use it is when this algorithm is already used in another system and you have to calculate exactly the same result. 
**Returned value** From 37481d80192c93a90579912dc6f28c8106373f1d Mon Sep 17 00:00:00 2001 From: BayoNet Date: Tue, 24 Sep 2019 02:28:53 +0300 Subject: [PATCH 219/309] DOCAPI-7437: EN review, RU translation for os_thread_priority docs (#7055) * Typo fix. * Links fix. * Fixed links in docs. * More fixes. * Update settings.md (#55) * DOCAPI-7437: RU translation * DOCAPI-7437: Fix. --- docs/en/operations/settings/settings.md | 8 ++++---- docs/ru/operations/settings/settings.md | 17 +++++++++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index a2dbf5122fa..7c738cbb940 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -899,16 +899,16 @@ Error count of each replica is capped at this value, preventing a single replica ## os_thread_priority {#setting-os_thread_priority} -Sets the priority ([nice](https://en.wikipedia.org/wiki/Nice_(Unix))) for threads that execute queries. OS scheduler considers this priority when choosing the next thread to run on each available CPU core. +Sets the priority ([nice](https://en.wikipedia.org/wiki/Nice_(Unix))) for threads that execute queries. The OS scheduler considers this priority when choosing the next thread to run on each available CPU core. !!! warning "Warning" - To use this setting, you need to set the `CAP_SYS_NICE` capability. The `clickhouse-server` package sets it up during installation. Some virtual environments don't allow to set the `CAP_SYS_NICE` capability. In this case `clickhouse-server` shows a message about it at the start. + To use this setting, you need to set the `CAP_SYS_NICE` capability. The `clickhouse-server` package sets it up during installation. Some virtual environments don't allow you to set the `CAP_SYS_NICE` capability. In this case, `clickhouse-server` shows a message about it at the start. Possible values: -You can set values in the `[-20, 19]` range. +- You can set values in the range `[-20, 19]`. -The lower value means a higher priority. Threads with low values of `nice` priority are executed more frequently than threads with high values. High values are preferable for long running non-interactive queries because it allows them to quickly give up resources in favour of short interactive queries when they arrive. +Lower values mean higher priority. Threads with low `nice` priority values are executed more frequently than threads with high values. High values are preferable for long running non-interactive queries because it allows them to quickly give up resources in favor of short interactive queries when they arrive. Default value: 0. diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 3453d8e3d7e..ee4f350462e 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -844,4 +844,21 @@ load_balancing = first_or_random - [Множественный JOIN](../../query_language/select.md#select-join) + +## os_thread_priority {#setting-os_thread_priority} + +Устанавливает приоритет ([nice](https://en.wikipedia.org/wiki/Nice_(Unix))) для потоков, исполняющих запросы. Планировщик ОС учитывает эти приоритеты при выборе следующего потока для исполнения на доступном ядре CPU. + +!!! warning "Предупреждение" + Для использования этой настройки необходимо установить свойство `CAP_SYS_NICE`. Пакет `clickhouse-server` устанавливает его во время инсталляции. 
Некоторые виртуальные окружения не позволяют установить `CAP_SYS_NICE`. В этом случае, `clickhouse-server` выводит сообщение при запуске. + +Допустимые значения: + +- Любое значение из диапазона `[-20, 19]`. + +Более низкие значения означают более высокий приоритет. Потоки с низкими значениями приоритета `nice` выполняются чаще, чем потоки с более высокими значениями. Высокие значения предпочтительно использовать для долгих неинтерактивных запросов, поскольку это позволяет бысто выделить ресурс в пользу коротких интерактивных запросов. + +Значение по умолчанию — 0. + + [Оригинальная статья](https://clickhouse.yandex/docs/ru/operations/settings/settings/) From b46e31d198834f22cd4f95907c80585b01ea7fb1 Mon Sep 17 00:00:00 2001 From: BayoNet Date: Tue, 24 Sep 2019 02:32:02 +0300 Subject: [PATCH 220/309] DOCAPI-7743: EN review, RU translation for CREATE TABLE AS table_function() docs (#7056) * Typo fix. * Links fix. * Fixed links in docs. * More fixes. * Update index.md (#57) * DOCAPI-7743: RU translation. * Update select.md * Update index.md --- .../query_language/table_functions/index.md | 6 ++-- docs/ru/query_language/select.md | 19 +++++++----- .../query_language/table_functions/index.md | 29 +++++++++++++++++-- 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/docs/en/query_language/table_functions/index.md b/docs/en/query_language/table_functions/index.md index dc2665ce7a5..0e27ba7b497 100644 --- a/docs/en/query_language/table_functions/index.md +++ b/docs/en/query_language/table_functions/index.md @@ -1,6 +1,6 @@ # Table Functions -Table function is a method of constructing a table. +Table functions are methods for constructing tables. You can use table functions in: @@ -10,7 +10,7 @@ You can use table functions in: * [FROM](../select.md#select-from) clause of the `SELECT` query. - The method of creating a temporary table, that is available only in current query. The table is deleted after the query finishes. + The method for creating a temporary table that is available only in the current query. The table is deleted when the query finishes. !!! warning "Warning" You can't use table functions if the [allow_ddl](../../operations/settings/permissions_for_queries.md#settings_allow_ddl) setting is disabled. @@ -19,7 +19,7 @@ Function | Description ---------|------------ [file](file.md) | Creates a [File](../../operations/table_engines/file.md)-engine table. [merge](merge.md) | Creates a [Merge](../../operations/table_engines/merge.md)-engine table. -[numbers](numbers.md) | Creates a table with the single column filled with integer numbers. +[numbers](numbers.md) | Creates a table with a single column filled with integer numbers. [remote](remote.md) | Allows you to access remote servers without creating a [Distributed](../../operations/table_engines/distributed.md)-engine table. [url](url.md) | Creates a [Url](../../operations/table_engines/url.md)-engine table. [mysql](mysql.md) | Creates a [MySQL](../../operations/table_engines/mysql.md)-engine table. diff --git a/docs/ru/query_language/select.md b/docs/ru/query_language/select.md index ca7df787350..002f1443979 100644 --- a/docs/ru/query_language/select.md +++ b/docs/ru/query_language/select.md @@ -95,20 +95,23 @@ FROM ### Секция FROM Если секция FROM отсутствует, то данные будут читаться из таблицы `system.one`. -Таблица system.one содержит ровно одну строку (то есть, эта таблица выполняет такую же роль, как таблица DUAL, которую можно найти в других СУБД). 
+Таблица `system.one` содержит ровно одну строку (то есть, эта таблица выполняет такую же роль, как таблица DUAL, которую можно найти в других СУБД). -В секции FROM указывается таблица, из которой будут читаться данные, либо подзапрос, либо табличная функция; дополнительно могут присутствовать ARRAY JOIN и обычный JOIN (смотрите ниже). +Cекция `FROM` определяет источник данных: -Вместо таблицы, может быть указан подзапрос SELECT в скобках. -В этом случае, конвейер обработки подзапроса будет встроен в конвейер обработки внешнего запроса. -В отличие от стандартного SQL, после подзапроса не нужно указывать его синоним. Для совместимости, присутствует возможность написать AS name после подзапроса, но указанное имя нигде не используется. +- Таблица +- Подзапрос +- [Табличная функция](table_functions/index.md) -Вместо таблицы, может быть указана табличная функция. Подробнее смотрите раздел "Табличные функции". +Также могут присутствовать `ARRAY JOIN` и обычный `JOIN` (смотрите ниже). + +Вместо таблицы, может быть указан подзапрос `SELECT` в скобках. +В отличие от стандартного SQL, после подзапроса не обязательно указывать его синоним. Для выполнения запроса, из соответствующей таблицы, вынимаются все столбцы, перечисленные в запросе. Из подзапросов выкидываются столбцы, не нужные для внешнего запроса. -Если в запросе не перечислено ни одного столбца (например, SELECT count() FROM t), то из таблицы всё равно вынимается один какой-нибудь столбец (предпочитается самый маленький), для того, чтобы можно было хотя бы посчитать количество строк. +Если в запросе не перечислено ни одного столбца (например, `SELECT count() FROM t`), то из таблицы всё равно вынимается один какой-нибудь столбец (предпочитается самый маленький), для того, чтобы можно было посчитать количество строк. -Модификатор FINAL может быть использован при SELECT-е из таблиц типа ReplacingMergeTree, SummingMergeTree, AggregatingMergeTree, CollapsingMergeTree, VersionedCollapsingMergeTree. При указании FINAL, данные будут выбираться полностью "домерженными". Стоит учитывать, что использование FINAL приводит к выбору кроме указанных в SELECT-е столбцов также столбцов, относящихся к первичному ключу. Также, запрос будет выполняться в один поток, и при выполнении запроса будет выполняться слияние данных. Это приводит к тому, что при использовании FINAL, запрос выполняется медленнее. В большинстве случаев, следует избегать использования FINAL. +Модификатор `FINAL` может быть использован в запросе `SELECT` из таблиц семейства [MergeTree](../operations/table_engines/mergetree.md). При указании `FINAL`, данные будут выбираться полностью "домерженными". Стоит учитывать, что использование `FINAL` приводит к выбору кроме указанных в `SELECT` столбцов также столбцов, относящихся к первичному ключу. Также, запрос будет выполняться в один поток, и при выполнении запроса будет выполняться слияние данных. Это приводит к тому, что при использовании `FINAL`, запрос выполняется медленнее. В большинстве случаев, следует избегать использования `FINAL`. ### Секция SAMPLE {#select-sample-clause} diff --git a/docs/ru/query_language/table_functions/index.md b/docs/ru/query_language/table_functions/index.md index 704c9fa7123..2883ae7c032 100644 --- a/docs/ru/query_language/table_functions/index.md +++ b/docs/ru/query_language/table_functions/index.md @@ -1,7 +1,30 @@ # Табличные функции -Табличные функции могут указываться в секции FROM вместо имени БД и таблицы. -Табличные функции можно использовать только если не выставлена настройка readonly. 
-Табличные функции не имеют отношения к другим функциям. +Табличные функции — это метод создания таблиц. + +Табличные функции можно использовать в: + +* Секции [FROM](../select.md#select-from) запроса `SELECT`. + + Это способ создания временной таблицы, которая доступна только в текущем запросе. + +* Запросе [CREATE TABLE AS ](../create.md#create-table-query). + + Это один из методов создания таблицы. + +!!! warning "Предупреждение" + Если настройка [allow_ddl](../../operations/settings/permissions_for_queries.md#settings_allow_ddl) выключена, то использовать табличные функции невозможно. + +Функция | Описание +---------|------------ +[file](file.md) | Создаёт таблицу с движком [File](../../operations/table_engines/file.md). +[merge](merge.md) | Создаёт таблицу с движком [Merge](../../operations/table_engines/merge.md). +[numbers](numbers.md) | Создаёт таблицу с единственным столбцом, заполненным целыми числами. +[remote](remote.md) | Предоставляет доступ к удалённым серверам, не создавая таблицу с движком [Distributed](../../operations/table_engines/distributed.md). +[url](url.md) | Создаёт таблицу с движком [Url](../../operations/table_engines/url.md). +[mysql](mysql.md) | Создаёт таблицу с движком [MySQL](../../operations/table_engines/mysql.md). +[jdbc](jdbc.md) | Создаёт таблицу с дижком [JDBC](../../operations/table_engines/jdbc.md). +[odbc](odbc.md) | Создаёт таблицу с движком [ODBC](../../operations/table_engines/odbc.md). +[hdfs](hdfs.md) | Создаёт таблицу с движком [HDFS](../../operations/table_engines/hdfs.md). [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/table_functions/) From ff9682cc3207a9aeb36b6b9dde337c294aa2794d Mon Sep 17 00:00:00 2001 From: dasmfm <2@borisklimenko.ru> Date: Tue, 24 Sep 2019 02:42:52 +0300 Subject: [PATCH 221/309] Doc fix for SELECT ... FINAL (#6157) * Update select.md * Update select.md --- docs/en/query_language/select.md | 3 ++- docs/ru/query_language/select.md | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/en/query_language/select.md b/docs/en/query_language/select.md index 5310b6dfa12..43df5c66280 100644 --- a/docs/en/query_language/select.md +++ b/docs/en/query_language/select.md @@ -112,7 +112,8 @@ In contrast to standard SQL, a synonym does not need to be specified after a sub To execute a query, all the columns listed in the query are extracted from the appropriate table. Any columns not needed for the external query are thrown out of the subqueries. If a query does not list any columns (for example, `SELECT count() FROM t`), some column is extracted from the table anyway (the smallest one is preferred), in order to calculate the number of rows. -The `FINAL` modifier can be used in the `SELECT` select query for aggregating engines from the [MergeTree](../operations/table_engines/mergetree.md) family. When you specify `FINAL`, data is selected fully "merged". Keep in mind that using `FINAL` leads to a selection that includes columns related to the primary key, in addition to the columns specified in the `SELECT`. Additionally, the query will be executed in a single stream, and data will be merged during query execution. This means that when using `FINAL`, the query is processed slowly. In the most cases, avoid using `FINAL`. +The `FINAL` modifier can be used in the `SELECT` select query for engines from the [MergeTree](../operations/table_engines/mergetree.md) family. When you specify `FINAL`, data is selected fully "merged". 
Keep in mind that using `FINAL` leads to reading columns related to the primary key, in addition to the columns specified in the query. Additionally, the query will be executed in a single thread, and data will be merged during query execution. This means that when using `FINAL`, the query is processed slowly. In the most cases, avoid using `FINAL`. +The `FINAL` modifier can be applied for all engines of MergeTree family that do data transformations in background merges (except GraphiteMergeTree). ### SAMPLE Clause {#select-sample-clause} diff --git a/docs/ru/query_language/select.md b/docs/ru/query_language/select.md index 002f1443979..b70e8a0ceb3 100644 --- a/docs/ru/query_language/select.md +++ b/docs/ru/query_language/select.md @@ -111,7 +111,8 @@ Cекция `FROM` определяет источник данных: Для выполнения запроса, из соответствующей таблицы, вынимаются все столбцы, перечисленные в запросе. Из подзапросов выкидываются столбцы, не нужные для внешнего запроса. Если в запросе не перечислено ни одного столбца (например, `SELECT count() FROM t`), то из таблицы всё равно вынимается один какой-нибудь столбец (предпочитается самый маленький), для того, чтобы можно было посчитать количество строк. -Модификатор `FINAL` может быть использован в запросе `SELECT` из таблиц семейства [MergeTree](../operations/table_engines/mergetree.md). При указании `FINAL`, данные будут выбираться полностью "домерженными". Стоит учитывать, что использование `FINAL` приводит к выбору кроме указанных в `SELECT` столбцов также столбцов, относящихся к первичному ключу. Также, запрос будет выполняться в один поток, и при выполнении запроса будет выполняться слияние данных. Это приводит к тому, что при использовании `FINAL`, запрос выполняется медленнее. В большинстве случаев, следует избегать использования `FINAL`. +Модификатор `FINAL` может быть использован в запросе `SELECT` из таблиц семейства [MergeTree](../operations/table_engines/mergetree.md). При указании `FINAL`, данные будут выбираться полностью "домерженными". Стоит учитывать, что использование `FINAL` приводит к чтению также столбцов, относящихся к первичному ключу. Также, запрос будет выполняться в один поток, и при выполнении запроса будет выполняться слияние данных. Это приводит к тому, что при использовании `FINAL`, запрос выполняется медленнее. В большинстве случаев, следует избегать использования `FINAL`. +Модификатор `FINAL` может быть использован для всех таблиц семейства `MergeTree`, которые производят преобразования данных в процессе фоновых слияний (кроме GraphiteMergeTree). ### Секция SAMPLE {#select-sample-clause} From 2f580c2bbbed2b2460c5dcf89b1f7cd684a32d6d Mon Sep 17 00:00:00 2001 From: BayoNet Date: Tue, 24 Sep 2019 02:45:47 +0300 Subject: [PATCH 222/309] DOCAPI-7745: EN review, RU translation for optimize_throw_if_noop docs (#7051) * Typo fix. * Update settings.md (#50) * Update misc.md (#51) * DOCAPI-7745: RU translation. * DOCAPI-7745: Fixes. --- docs/en/operations/settings/settings.md | 6 ++++-- docs/en/query_language/misc.md | 10 +++++----- docs/ru/operations/settings/settings.md | 13 +++++++++++++ docs/ru/query_language/misc.md | 14 +++++++++----- 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 7c738cbb940..5591f82d037 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -862,9 +862,9 @@ Default value: 0. 
## optimize_throw_if_noop {#setting-optimize_throw_if_noop} -Enables or disables throwing an exception if the [OPTIMIZE](../../query_language/misc.md#misc_operations-optimize) query have not performed a merge. +Enables or disables throwing an exception if an [OPTIMIZE](../../query_language/misc.md#misc_operations-optimize) query didn't perform a merge. -By default `OPTIMIZE` returns successfully even if it haven't done anything. This setting allows to distinguish this situation and get the reason in exception message. +By default, `OPTIMIZE` returns successfully even if it didn't do anything. This setting lets you differentiate these situations and get the reason in an exception message. Possible values: @@ -872,6 +872,8 @@ Possible values: - 0 — Throwing an exception is disabled. Default value: 0. + + ## distributed_replica_error_half_life {#settings-distributed_replica_error_half_life} - Type: seconds diff --git a/docs/en/query_language/misc.md b/docs/en/query_language/misc.md index 3a2fa03100e..4272bb5c155 100644 --- a/docs/en/query_language/misc.md +++ b/docs/en/query_language/misc.md @@ -177,16 +177,16 @@ Changes already made by the mutation are not rolled back. OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition] [FINAL] ``` -This query tries to initialize an unscheduled merge of data parts for tables with a table engine of [MergeTree](../operations/table_engines/mergetree.md) family. Other kinds of table engines are not supported. +This query tries to initialize an unscheduled merge of data parts for tables with a table engine from the [MergeTree](../operations/table_engines/mergetree.md) family. Other kinds of table engines aren't supported. -When `OPTIMIZE` is used with [ReplicatedMergeTree](../operations/table_engines/replication.md) family of table engines, ClickHouse creates a task for merging and waits for execution on all nodes (if the `replication_alter_partitions_sync` setting is enabled). +When `OPTIMIZE` is used with the [ReplicatedMergeTree](../operations/table_engines/replication.md) family of table engines, ClickHouse creates a task for merging and waits for execution on all nodes (if the `replication_alter_partitions_sync` setting is enabled). -- If `OPTIMIZE` doesn't perform merging for any reason, it doesn't notify the client about it. To enable notification use the [optimize_throw_if_noop](../operations/settings/settings.md#setting-optimize_throw_if_noop) setting. +- If `OPTIMIZE` doesn't perform a merge for any reason, it doesn't notify the client. To enable notifications, use the [optimize_throw_if_noop](../operations/settings/settings.md#setting-optimize_throw_if_noop) setting. - If you specify a `PARTITION`, only the specified partition is optimized. - If you specify `FINAL`, optimization is performed even when all the data is already in one part. -!!! warning - OPTIMIZE can't fix the "Too many parts" error. +!!! warning "Warning" + `OPTIMIZE` can't fix the "Too many parts" error. 
## RENAME {#misc_operations-rename} diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index ee4f350462e..e3273527ef4 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -845,6 +845,19 @@ load_balancing = first_or_random - [Множественный JOIN](../../query_language/select.md#select-join) +## optimize_throw_if_noop {#setting-optimize_throw_if_noop} + +Включает или отключает генерирование исключения в в случаях, когда запрос [OPTIMIZE](../../query_language/misc.md#misc_operations-optimize) не выполняет мёрж. + +По умолчанию, `OPTIMIZE` завершается успешно и в тех случаях, когда он ничего не сделал. Настройка позволяет отделить подобные случаи и включает генерирование исключения с поясняющим сообщением. + +Возможные значения: + +- 1 — генерирование исключения включено. +- 0 — генерирование исключения выключено. + +Значение по умолчанию — 0. + ## os_thread_priority {#setting-os_thread_priority} Устанавливает приоритет ([nice](https://en.wikipedia.org/wiki/Nice_(Unix))) для потоков, исполняющих запросы. Планировщик ОС учитывает эти приоритеты при выборе следующего потока для исполнения на доступном ядре CPU. diff --git a/docs/ru/query_language/misc.md b/docs/ru/query_language/misc.md index d169e5715e9..00cb0e7fd93 100644 --- a/docs/ru/query_language/misc.md +++ b/docs/ru/query_language/misc.md @@ -176,12 +176,16 @@ KILL MUTATION WHERE database = 'default' AND table = 'table' AND mutation_id = ' OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition] [FINAL] ``` -Просит движок таблицы сделать что-нибудь, что может привести к более оптимальной работе. -Поддерживается только движками `*MergeTree`, в котором выполнение этого запроса инициирует внеочередное слияние кусков данных. -Если указан `PARTITION`, то оптимизация будет производиться только для указаной партиции. -Если указан `FINAL`, то оптимизация будет производиться даже когда все данные уже лежат в одном куске. +Запрос пытается запустить внеплановый мёрж кусков данных для таблиц семейства [MergeTree](../operations/table_engines/mergetree.md). Другие движки таблиц не поддерживаются. -!!! warning "Внимание"Запрос OPTIMIZE не может устранить причину появления ошибки "Too many parts". +Если `OPTIMIZE` применяется к таблицам семейства [ReplicatedMergeTree](../operations/table_engines/replication.md), ClickHouse создаёт задачу на мёрж и ожидает её исполнения на всех узлах (если активирована настройка `replication_alter_partitions_sync`). + +- Если `OPTIMIZE` не выполняет мёрж по любой причине, ClickHouse не оповещает об этом клиента. Чтобы включить оповещения, используйте настройку [optimize_throw_if_noop](../operations/settings/settings.md#setting-optimize_throw_if_noop). +- Если указать `PARTITION`, то оптимизация выполняется только для указанной партиции. +- Если указать `FINAL`, то оптимизация выполняется даже в том случае, если все данные уже лежат в одном куске. + +!!! warning "Внимание" + Запрос `OPTIMIZE` не может устранить причину появления ошибки "Too many parts". ## RENAME {#misc_operations-rename} From 779648f7ccd774e65a08df24ae88df01144d853d Mon Sep 17 00:00:00 2001 From: BayoNet Date: Tue, 24 Sep 2019 02:50:26 +0300 Subject: [PATCH 223/309] DOCAPI-7695: EN review, RU translation. Functions support for indexes (#7045) * Typo fix. 
* DOCAPI-7695: Typo fixed * Update mergetree.md (#49) * DOCAPI-7695: RU translation * Update mergetree.md --- docs/en/operations/table_engines/mergetree.md | 10 ++--- docs/ru/operations/table_engines/mergetree.md | 42 +++++++++++++++++++ .../functions/array_functions.md | 4 +- 3 files changed, 49 insertions(+), 7 deletions(-) diff --git a/docs/en/operations/table_engines/mergetree.md b/docs/en/operations/table_engines/mergetree.md index 31006df639d..22e0c5e6068 100644 --- a/docs/en/operations/table_engines/mergetree.md +++ b/docs/en/operations/table_engines/mergetree.md @@ -315,9 +315,9 @@ INDEX sample_index3 (lower(str), str) TYPE ngrambf_v1(3, 256, 2, 0) GRANULARITY #### Functions Support -Conditions in the `WHERE` clause contain calls of functions over the columns. If the column is a part of some index, ClickHouse tries to use this index when performing the functions. ClickHouse supports different subset of functions for using indexes. +Conditions in the `WHERE` clause contains calls of the functions that operate with columns. If the column is a part of an index, ClickHouse tries to use this index when performing the functions. ClickHouse supports different subsets of functions for using indexes. -The `set` index can be used with all functions. Functions subsets for other indexes are in the table below. +The `set` index can be used with all functions. Function subsets for other indexes are shown in the table below. Function (operator) / Index | primary key | minmax | ngrambf_v1 | tokenbf_v1 | bloom_filter ----------------------------|-------------|--------|------------|------------|--------------- @@ -326,7 +326,7 @@ Function (operator) / Index | primary key | minmax | ngrambf_v1 | tokenbf_v1 | b [like](../../query_language/functions/string_search_functions.md#function-like) | ✔ | ✔ | ✔ | ✗ | ✗ [notLike](../../query_language/functions/string_search_functions.md#function-notlike) | ✔ | ✔ | ✔ | ✔ | ✗ [startsWith](../../query_language/functions/string_functions.md#function-startswith) | ✔ | ✔ | ✔ | ✔ | ✗ -[endsWith](../../query_language/functions/string_functions.md#function-endswith) | ✗ | ✗ | ✔ | ✔ | +[endsWith](../../query_language/functions/string_functions.md#function-endswith) | ✗ | ✗ | ✔ | ✔ | ✗ [multiSearchAny](../../query_language/functions/string_search_functions.md#function-multisearchany) | ✗ | ✗ | ✔ | ✔ | ✗ [in](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ [notIn](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ @@ -338,9 +338,9 @@ Function (operator) / Index | primary key | minmax | ngrambf_v1 | tokenbf_v1 | b [notEmpty](../../query_language/functions/array_functions.md#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ hasToken | ✗ | ✗ | ✗ | ✔ | ✗ -Functions with a constant argument less than ngram size couldn't be used by `ngrambf_v1` for the query optimization. +Functions with a constant argument that is less than ngram size can't be used by `ngrambf_v1` for query optimization. 
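As a hedged illustration of the ngram-size caveat above: the table and index names below are invented, the index parameters simply follow the documented `ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` form, and the `SET` statement is the experimental-feature switch mentioned in the Russian mergetree.md section of this patch.

```sql
SET allow_experimental_data_skipping_indices = 1;

-- Hypothetical table with a 4-gram Bloom filter index over the string column.
CREATE TABLE ngram_demo
(
    id UInt64,
    s String,
    INDEX s_ngram (s) TYPE ngrambf_v1(4, 256, 2, 0) GRANULARITY 4
)
ENGINE = MergeTree()
ORDER BY id;

-- 'test' is at least 4 characters long, so the index may be used to skip granules.
SELECT count() FROM ngram_demo WHERE s LIKE '%test%';

-- 'ab' is shorter than the ngram size (4), so, per the note above, the index
-- cannot help and the condition is checked on all selected granules.
SELECT count() FROM ngram_demo WHERE s LIKE '%ab%';
```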
-Bloom filters can have false positive matches, so the `ngrambf_v1`, `tokenbf_v1`, `bloom_filter` indexes couldn't be used for optimizing queries where the result of a function is expected to be false, for example: +Bloom filters can have false positive matches, so the `ngrambf_v1`, `tokenbf_v1`, and `bloom_filter` indexes can't be used for optimizing queries where the result of a function is expected to be false, for example: - Can be optimized: - `s LIKE '%test%'` diff --git a/docs/ru/operations/table_engines/mergetree.md b/docs/ru/operations/table_engines/mergetree.md index 54debe40089..0e03a6a0d75 100644 --- a/docs/ru/operations/table_engines/mergetree.md +++ b/docs/ru/operations/table_engines/mergetree.md @@ -295,6 +295,48 @@ INDEX b (u64 * length(str), i32 + f64 * 100, date, str) TYPE minmax GRANULARITY INDEX b (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100) GRANULARITY 4 ``` +#### Поддержка для функций + +Условия в секции `WHERE` содержат вызовы функций, оперирующих со столбцами. Если столбец - часть индекса, ClickHouse пытается использовать индекс при выполнении функции. Для разных видов индексов, ClickHouse поддерживает различные наборы функций, которые могут использоваться индексами. + +Индекс `set` используется со всеми функциями. Наборы функций для остальных индексов представлены в таблице ниже. + +Function (operator) / Index | primary key | minmax | ngrambf_v1 | tokenbf_v1 | bloom_filter +----------------------------|-------------|--------|------------|------------|--------------- +[equals (=, ==)](../../query_language/functions/comparison_functions.md#function-equals) | ✔ | ✔ | ✔ | ✔ | ✔ +[notEquals(!=, <>)](../../query_language/functions/comparison_functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ +[like](../../query_language/functions/string_search_functions.md#function-like) | ✔ | ✔ | ✔ | ✗ | ✗ +[notLike](../../query_language/functions/string_search_functions.md#function-notlike) | ✔ | ✔ | ✔ | ✔ | ✗ +[startsWith](../../query_language/functions/string_functions.md#function-startswith) | ✔ | ✔ | ✔ | ✔ | ✗ +[endsWith](../../query_language/functions/string_functions.md#function-endswith) | ✗ | ✗ | ✔ | ✔ | ✗ +[multiSearchAny](../../query_language/functions/string_search_functions.md#function-multisearchany) | ✗ | ✗ | ✔ | ✔ | ✗ +[in](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ +[notIn](../../query_language/functions/in_functions.md#in-functions) | ✔ | ✔ | ✔ | ✔ | ✔ +[less (<)](../../query_language/functions/comparison_functions.md#function-less) | ✔ | ✔ | ✗ | ✗ | ✗ +[greater (>)](../../query_language/functions/comparison_functions.md#function-greater) | ✔ | ✔ | ✗ | ✗ | ✗ +[lessOrEquals (<=)](../../query_language/functions/comparison_functions.md#function-lessorequals) | ✔ | ✔ | ✗ | ✗ | ✗ +[greaterOrEquals (>=)](../../query_language/functions/comparison_functions.md#function-greaterorequals) | ✔ | ✔ | ✗ | ✗ | ✗ +[empty](../../query_language/functions/array_functions.md#function-empty) | ✔ | ✔ | ✗ | ✗ | ✗ +[notEmpty](../../query_language/functions/array_functions.md#function-notempty) | ✔ | ✔ | ✗ | ✗ | ✗ +hasToken | ✗ | ✗ | ✗ | ✔ | ✗ + +Функции с постоянным агрументом, который меньше, чем размер ngram не могут использовать индекс `ngrambf_v1` для оптимизации запроса. 
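The support tables above also list `hasToken`, which only the `tokenbf_v1` index can serve. A small hypothetical sketch follows; the table and index names are invented and the filter parameters are illustrative only.

```sql
SET allow_experimental_data_skipping_indices = 1;

-- Hypothetical log table with a token Bloom filter index over the message text.
CREATE TABLE token_demo
(
    id UInt64,
    message String,
    INDEX message_tokens (message) TYPE tokenbf_v1(256, 2, 0) GRANULARITY 4
)
ENGINE = MergeTree()
ORDER BY id;

-- Per the support tables, hasToken is served only by tokenbf_v1, so this
-- condition may let ClickHouse skip granules whose filters lack the token.
SELECT count() FROM token_demo WHERE hasToken(message, 'error');
```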
+ +Фильтры Блума могут иметь ложнопозитивные срабатывания, следовательно индексы `ngrambf_v1`, `tokenbf_v1` и `bloom_filter` невозможно использовать для оптимизации запросов, в которых результат функции предполается false, например: + +- Можно оптимизировать: + - `s LIKE '%test%'` + - `NOT s NOT LIKE '%test%'` + - `s = 1` + - `NOT s != 1` + - `startsWith(s, 'test')` +- Нельзя оптимизировать: + - `NOT s LIKE '%test%'` + - `s NOT LIKE '%test%'` + - `NOT s = 1` + - `s != 1` + - `NOT startsWith(s, 'test')` + ## Конкурентный доступ к данным Для конкурентного доступа к таблице используется мультиверсионность. То есть, при одновременном чтении и обновлении таблицы, данные будут читаться из набора кусочков, актуального на момент запроса. Длинных блокировок нет. Вставки никак не мешают чтениям. diff --git a/docs/ru/query_language/functions/array_functions.md b/docs/ru/query_language/functions/array_functions.md index a3d3fff9ff2..19e3bb965c5 100644 --- a/docs/ru/query_language/functions/array_functions.md +++ b/docs/ru/query_language/functions/array_functions.md @@ -1,12 +1,12 @@ # Функции по работе с массивами -## empty +## empty {#function-empty} Возвращает 1 для пустого массива, и 0 для непустого массива. Тип результата - UInt8. Функция также работает для строк. -## notEmpty +## notEmpty {#function-notempty} Возвращает 0 для пустого массива, и 1 для непустого массива. Тип результата - UInt8. From 1143eefda6498b2915b16a1d4cf67ca0929f6a79 Mon Sep 17 00:00:00 2001 From: BayoNet Date: Tue, 24 Sep 2019 02:55:50 +0300 Subject: [PATCH 224/309] DOCAPI-7991: EN review, RU translation for the update of Log engines docs (#7040) * Typo fix. * Update log_family.md (#43) * Update tinylog.md (#44) * DOCAPI-7991: RU translation. --- .../en/operations/table_engines/log_family.md | 6 ++--- docs/en/operations/table_engines/tinylog.md | 9 +++----- .../ru/operations/table_engines/log_family.md | 23 ++++++------------- docs/ru/operations/table_engines/tinylog.md | 17 ++------------ 4 files changed, 15 insertions(+), 40 deletions(-) diff --git a/docs/en/operations/table_engines/log_family.md b/docs/en/operations/table_engines/log_family.md index aef0e21f08c..9353ea796b6 100644 --- a/docs/en/operations/table_engines/log_family.md +++ b/docs/en/operations/table_engines/log_family.md @@ -16,7 +16,7 @@ Engines: - Append data to the end of file when writing. - Support locks for concurrent data access. - During `INSERT` query the table is locked, and other queries for reading and writing data both wait for unlocking. If there are no writing data queries, any number of reading data queries can be performed concurrently. + During `INSERT` queries, the table is locked, and other queries for reading and writing data both wait for the table to unlock. If there are no data writing queries, any number of data reading queries can be performed concurrently. - Do not support [mutation](../../query_language/alter.md#alter-mutations) operations. - Do not support indexes. @@ -30,9 +30,9 @@ Engines: ## Differences -The `TinyLog` engine is the simplest in the family and provides the poorest functionality and lowest efficiency. The `TinyLog` engine does not support a parallel reading of data. It reads the data slower than other engines of the family that have parallel reading, and it uses almost as many descriptors as the `Log` engine because it stores each column in a separate file. Use it in simple low-load scenarios. +The `TinyLog` engine is the simplest in the family and provides the poorest functionality and lowest efficiency. 
The `TinyLog` engine doesn't support parallel data reading by several threads. It reads data slower than other engines in the family that support parallel reading and it uses almost as many descriptors as the `Log` engine because it stores each column in a separate file. Use it in simple low-load scenarios. -The `Log` and `StripeLog` engines support parallel reading of data. When reading data ClickHouse uses multiple threads. Each thread processes separated data block. The `Log` engine uses the separate file for each column of the table. The `StripeLog` stores all the data in one file. Thus the `StripeLog` engine uses fewer descriptors in the operating system, but the `Log` engine provides a more efficient reading of the data. +The `Log` and `StripeLog` engines support parallel data reading. When reading data, ClickHouse uses multiple threads. Each thread processes a separate data block. The `Log` engine uses a separate file for each column of the table. `StripeLog` stores all the data in one file. As a result, the `StripeLog` engine uses fewer descriptors in the operating system, but the `Log` engine provides higher efficiency when reading data. [Original article](https://clickhouse.yandex/docs/en/operations/table_engines/log_family/) diff --git a/docs/en/operations/table_engines/tinylog.md b/docs/en/operations/table_engines/tinylog.md index fe4913118e9..7be5e36c0b6 100644 --- a/docs/en/operations/table_engines/tinylog.md +++ b/docs/en/operations/table_engines/tinylog.md @@ -1,12 +1,9 @@ # TinyLog -Engine belongs to the family of log engines. See [Log Engine Family](log_family.md) for common properties of log engines and for their differences. +The engine belongs to the log engine family. See [Log Engine Family](log_family.md) for common properties of log engines and their differences. -The typical way using this table engine is write-once method: firstly write the data one time, then read it as many times as needed. For example, you can use `TinyLog`-type tables for intermediary data that is processed in small batches. - -Queries are executed in a single stream. In other words, this engine is intended for relatively small tables (recommended up to about 1,000,000 rows). It makes sense to use this table engine if you have many small tables, since it is simpler than the [Log](log.md) engine (fewer files need to be opened). - -The situation when you have a large number of small tables guarantees poor productivity, but may already be used when working with another DBMS, and you may find it easier to switch to using `TinyLog`-type tables. +This table engine is typically used with the write-once method: write data one time, then read it as many times as necessary. For example, you can use `TinyLog`-type tables for intermediary data that is processed in small batches. Note that storing data in a large number of small tables is inefficient. +Queries are executed in a single stream. In other words, this engine is intended for relatively small tables (up to about 1,000,000 rows). It makes sense to use this table engine if you have many small tables, since it's simpler than the [Log](log.md) engine (fewer files need to be opened). 
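A minimal sketch of the write-once pattern the updated `TinyLog` page describes; the table name and data are invented for illustration.

```sql
-- Hypothetical intermediate table: written once, then only read.
CREATE TABLE batch_scratch
(
    id UInt64,
    payload String
)
ENGINE = TinyLog;

-- Write the whole small batch in one INSERT...
INSERT INTO batch_scratch SELECT number, toString(number) FROM numbers(1000);

-- ...then read it as often as needed; reads are single-threaded, which is
-- acceptable for a table of this size.
SELECT count() FROM batch_scratch;
```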
[Original article](https://clickhouse.yandex/docs/en/operations/table_engines/tinylog/) diff --git a/docs/ru/operations/table_engines/log_family.md b/docs/ru/operations/table_engines/log_family.md index 14247331fc9..ef59b79abeb 100644 --- a/docs/ru/operations/table_engines/log_family.md +++ b/docs/ru/operations/table_engines/log_family.md @@ -1,6 +1,6 @@ #Семейство Log -Движки разработаны для сценариев, когда необходимо записывать много таблиц с небольшим объемом данных (менее 1 миллиона строк). +Движки разработаны для сценариев, когда необходимо быстро записывать много таблиц с небольшим объемом данных (менее 1 миллиона строк), а затем читать их целиком. Движки семейства: @@ -13,33 +13,24 @@ Движки: - Хранят данные на диске. - - Добавляют данные в конец файла при записи. +- Поддерживают блокировки для конкурентного доступа к данным. + + Во время запросов `INSERT` таблица блокируется, а другие запросы на чтение и запись ожидают разблокировки таблицы. Если запросов на запись данных нет, то можно выполнять любое количество конкуретных запросов на чтение. - Не поддерживают операции [мутации](../../query_language/alter.md#alter-mutations). - - Не поддерживают индексы. Это означает, что запросы `SELECT` не эффективны для выборки диапазонов данных. - Записывают данные не атомарно. - Вы можете получить таблицу с повреждёнными данными, если что-то нарушит операцию записи (например, аварийное завершение работы сервера). + Вы можете получить таблицу с повреждёнными данными, если что-то прервёт операцию записи (например, аварийное завершение работы сервера). ## Отличия -Движки `Log` и `StripeLog` поддерживают: +Движок `TinyLog` самый простой в семье и обеспечивает самые низкие функциональность и эффективность. Движок `TinyLog` не поддерживает параллельного чтения данных в несколько потоков. Движок читает данные медленнее, чем оба других движка с параллельным чтением, и использует почти столько же дескрипторов, сколько и движок `Log`, поскольку хранит каждый столбец в отдельном файле. Его можно использовать в простых сценариях с низкой нагрузкой. -- Блокировки для конкурентного доступа к данным. - - Во время выполнения запроса `INSERT` таблица заблокирована и другие запросы на чтение и запись данных ожидают снятия блокировки. При отсутствии запросов на запись данных можно одновременно выполнять любое количество запросов на чтение данных. - -- Параллельное чтение данных. - - ClickHouse читает данные в несколько потоков. Каждый поток обрабатывает отдельный блок данных. - -Движок `Log` сохраняет каждый столбец таблицы в отдельном файле. Движок `StripeLog` хранит все данные в одном файле. Таким образом, движок `StripeLog` использует меньше дескрипторов в операционной системе, а движок `Log` обеспечивает более эффективное считывание данных. - -Движок `TinyLog` самый простой в семье и обеспечивает самые низкие функциональность и эффективность. Движок `TinyLog` не поддерживает ни параллельного чтения данных, ни конкурентного доступа к данным. Он хранит каждый столбец в отдельном файле. Движок читает данные медленнее, чем оба других движка с параллельным чтением, и использует почти столько же дескрипторов, сколько и движок `Log`. Его можно использовать в простых сценариях с низкой нагрузкой. +Движки `Log` и `StripeLog` поддерживают параллельное чтение. При чтении данных, ClickHouse использует множество потоков. Каждый поток обрабатывает отдельный блок данных. Движок `Log` сохраняет каждый столбец таблицы в отдельном файле. Движок `StripeLog` хранит все данные в одном файле. 
Таким образом, движок `StripeLog` использует меньше дескрипторов в операционной системе, а движок `Log` обеспечивает более эффективное считывание данных. [Оригинальная статья](https://clickhouse.yandex/docs/ru/operations/table_engines/log_family/) diff --git a/docs/ru/operations/table_engines/tinylog.md b/docs/ru/operations/table_engines/tinylog.md index 22b0ffb5e50..673647f628a 100644 --- a/docs/ru/operations/table_engines/tinylog.md +++ b/docs/ru/operations/table_engines/tinylog.md @@ -2,21 +2,8 @@ Движок относится к семейству движков Log. Смотрите общие свойства и различия движков в статье [Семейство Log](log_family.md). -Самый простой движок таблиц, который хранит данные на диске. -Каждый столбец хранится в отдельном сжатом файле. -При записи, данные дописываются в конец файлов. +Типичный способ использования этой движка — это write-once: сначала данные один раз записываются, а затем читаются столько раз, сколько это необходимо. Например, можно использовать таблицы с движком `TinyLog` для хранения промежуточных данных, которые обрабатываются небольшими блоками. Учтите, что хранить данные в большом количестве мелких таблиц неэффективно. -Конкурентный доступ к данным никак не ограничивается: - -- если вы одновременно читаете из таблицы и в другом запросе пишете в неё, то чтение будет завершено с ошибкой; -- если вы одновременно пишете в таблицу в нескольких запросах, то данные будут битыми. - -Типичный способ использования этой таблицы - это write-once: сначала один раз только пишем данные, а потом сколько угодно читаем. -Запросы выполняются в один поток. То есть, этот движок предназначен для сравнительно маленьких таблиц (рекомендуется до 1 000 000 строк). -Этот движок таблиц имеет смысл использовать лишь в случае, если у вас есть много маленьких таблиц, так как он проще, чем движок Log (требуется открывать меньше файлов). -Случай, когда у вас много маленьких таблиц, является гарантированно плохим по производительности, но может уже использоваться при работе с другой СУБД, и вам может оказаться удобнее перейти на использование таблиц типа TinyLog. -**Индексы не поддерживаются.** - -В Яндекс.Метрике таблицы типа TinyLog используются для промежуточных данных, обрабатываемых маленькими пачками. +Запросы выполняются в один поток. То есть, этот движок предназначен для сравнительно маленьких таблиц (до 1 000 000 строк). Этот движок таблиц имеет смысл использовать в том случае, когда у вас есть много маленьких таблиц, так как он проще, чем движок [Log](log.md) (требуется открывать меньше файлов). [Оригинальная статья](https://clickhouse.yandex/docs/ru/operations/table_engines/tinylog/) From 317e57bbc0614feb728079fa8defe402e81224da Mon Sep 17 00:00:00 2001 From: BayoNet Date: Tue, 24 Sep 2019 02:57:32 +0300 Subject: [PATCH 225/309] DOCAPI-7984: EN review, RU translation. ASOF JOIN ON docs (#7035) * Typo fix. * Update select.md (#41) * DOCAPI-7984: RU translation --- docs/en/query_language/select.md | 4 ++-- docs/ru/query_language/select.md | 36 ++++++++++++++++++++++---------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/docs/en/query_language/select.md b/docs/en/query_language/select.md index 43df5c66280..fb1c529a75b 100644 --- a/docs/en/query_language/select.md +++ b/docs/en/query_language/select.md @@ -563,7 +563,7 @@ You can use the following types of syntax: ON equi_cond AND closest_match_cond ``` - You can use any number of equality conditions and exactly one closest match condition. For example, `SELECT count() FROM A ASOF LEFT JOIN B ON A.a == B.b AND B.t <= A.t`. 
There is just `table_2.some_col <= table_1.some_col` and `table_1.some_col >= table2.some_col` types of conditions are available. You cannot apply other conditions like `>` or `!=`. + You can use any number of equality conditions and exactly one closest match condition. For example, `SELECT count() FROM A ASOF LEFT JOIN B ON A.a == B.b AND B.t <= A.t`. Only `table_2.some_col <= table_1.some_col` and `table_1.some_col >= table2.some_col` condition types are available. You can't apply other conditions like `>` or `!=`. - `ASOF JOIN ... USING` @@ -590,7 +590,7 @@ event_1_2 | 13:00 | 42 event_2_3 | 13:00 | 42 ... ... ``` -`ASOF JOIN` can take the timestamp of a user event from `table_1` and find an event in `table_2` where the timestamp is closest (equal or less) to the timestamp of the event from `table_1`. Herewith the `user_id` column can be used for joining on equality and the `ev_time` column can be used for joining on the closest match. In our example, `event_1_1` can be joined with `event_2_1`, `event_1_2` can be joined with `event_2_3`, but `event_2_2` cannot be joined. +`ASOF JOIN` can take the timestamp of a user event from `table_1` and find an event in `table_2` where the timestamp is closest (equal to or less) to the timestamp of the event from `table_1`. Here, the `user_id` column can be used for joining on equality and the `ev_time` column can be used for joining on the closest match. In our example, `event_1_1` can be joined with `event_2_1` and `event_1_2` can be joined with `event_2_3`, but `event_2_2` can't be joined. !!! note "Note" diff --git a/docs/ru/query_language/select.md b/docs/ru/query_language/select.md index b70e8a0ceb3..d206ba42c0b 100644 --- a/docs/ru/query_language/select.md +++ b/docs/ru/query_language/select.md @@ -555,18 +555,34 @@ ClickHouse не поддерживает синтаксис с запятыми Таблицы для `ASOF JOIN` должны иметь столбец с отсортированной последовательностью. Этот столбец не может быть единственным в таблице и должен быть одного из типов: `UInt32`, `UInt64`, `Float32`, `Float64`, `Date` и `DateTime`. -Синтаксис `ASOF JOIN`: +Можно использовать следующие типы синтаксиса: -```sql -SELECT expression_list FROM table_1 ASOF JOIN table_2 USING(equi_column1, ... equi_columnN, asof_column) -``` +- `ASOF JOIN ... ON` -`ASOF JOIN` использует `equi_columnX` для объединения по равенству и `asof_column` для объединения по ближайшему совпадению. + ```sql + SELECT expressions_list + FROM table_1 + ASOF LEFT JOIN table_2 + ON equi_cond AND closest_match_cond + ``` + + Можно использовать произвольное количество условий равенства и одно условие на ближайшее совпадение. Например, `SELECT count() FROM A ASOF LEFT JOIN B ON A.a == B.b AND B.t <= A.t`. Можно использовать только условия `table_2.some_col <= table_1.some_col` и `table_1.some_col >= table2.some_col`. Условия типа `>` или `!=` не поддерживаются. + +- `ASOF JOIN ... USING` + + ```sql + SELECT expressions_list + FROM table_1 + ASOF JOIN table_2 + USING (equi_column1, ... equi_columnN, asof_column) + ``` + + Для слияния по равенству `ASOF JOIN` использует `equi_columnX`, а для слияния по ближайшему совпадению использует `asof_column` с условием `table_1.asof_column >= table2.asof_column`. Столбец `asof_column` должен быть последним в секции `USING`. Например, рассмотрим следующие таблицы: -```text - table_1 table_2 +``` + table_1 table_2 event | ev_time | user_id event | ev_time | user_id ----------|---------|---------- ----------|---------|---------- ... ... 
@@ -578,10 +594,8 @@ event_1_2 | 13:00 | 42 event_2_3 | 13:00 | 42 `ASOF JOIN` принимает метку времени пользовательского события из `table_1` и находит такое событие в `table_2` метка времени которого наиболее близка (равна или меньше) к метке времени события из `table_1`. При этом столбец `user_id` используется для объединения по равенству, а столбец `ev_time` для объединения по ближайшему совпадению. В нашем примере `event_1_1` может быть объединено с `event_2_1`, `event_1_2` может быть объединено с `event_2_3`, а `event_2_2` не объединяется. -Детали реализации: - -- `asof_column` должен быть последним в секции `USING`. -- `ASOF JOIN` не поддержан для движка таблиц [Join](../operations/table_engines/join.md). +!!! note "Примечание" + `ASOF JOIN` не поддержан для движка таблиц [Join](../operations/table_engines/join.md). Чтобы задать значение строгости по умолчанию, используйте сессионный параметр [join_default_strictness](../operations/settings/settings.md#settings-join_default_strictness). From 3d206755297cc67d2254cec63b654a1a3a114854 Mon Sep 17 00:00:00 2001 From: BayoNet Date: Tue, 24 Sep 2019 02:59:49 +0300 Subject: [PATCH 226/309] DOCAPI-7460: EN review. RU translation. Histogram function description (#7034) * Typo fix. * Update parametric_functions.md (#42) * DOCAPI-7460: RU translation. * Update parametric_functions.md --- .../agg_functions/parametric_functions.md | 6 +- .../agg_functions/parametric_functions.md | 69 +++++++++++++++++++ 2 files changed, 72 insertions(+), 3 deletions(-) diff --git a/docs/en/query_language/agg_functions/parametric_functions.md b/docs/en/query_language/agg_functions/parametric_functions.md index edf96df987f..47196e2a4eb 100644 --- a/docs/en/query_language/agg_functions/parametric_functions.md +++ b/docs/en/query_language/agg_functions/parametric_functions.md @@ -10,11 +10,11 @@ Calculates an adaptive histogram. It doesn't guarantee precise results. histogram(number_of_bins)(values) ``` -The functions uses [A Streaming Parallel Decision Tree Algorithm](http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf). The borders of histogram bins are adjusted as a new data enters a function, and in common case the widths of bins are not equal. +The functions uses [A Streaming Parallel Decision Tree Algorithm](http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf). The borders of histogram bins are adjusted as new data enters a function. In common case, the widths of bins are not equal. **Parameters** -`number_of_bins` — Upper limit for a number of bins for the histogram. Function automatically calculates the number of bins. It tries to reach the specified number of bins, but if it fails, it uses less number of bins. +`number_of_bins` — Upper limit for the number of bins in the histogram. The function automatically calculates the number of bins. It tries to reach the specified number of bins, but if it fails, it uses fewer bins. `values` — [Expression](../syntax.md#syntax-expressions) resulting in input values. **Returned values** @@ -69,7 +69,7 @@ FROM └────────┴───────┘ ``` -In this case you should remember, that you don't know the borders of histogram bins. +In this case, you should remember that you don't know the histogram bin borders. ## sequenceMatch(pattern)(time, cond1, cond2, ...) 
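Returning to the `ASOF JOIN` documentation changes a few hunks above: the closest-match semantics are illustrated there only with a text table, so the following hedged sketch may help. The table names and the `Memory` engine choice are invented; the timestamps mirror the doc's example, so `event_1_1` should pair with `event_2_1` and `event_1_2` with `event_2_3`, while `event_2_2` stays unmatched.

```sql
-- Hypothetical tables mirroring the doc's example: user events on the left,
-- reference events on the right.
CREATE TABLE events_left  (user_id UInt64, ev_time DateTime, event String) ENGINE = Memory;
CREATE TABLE events_right (user_id UInt64, ev_time DateTime, event String) ENGINE = Memory;

INSERT INTO events_left VALUES
    (42, '2019-09-24 12:00:00', 'event_1_1'),
    (42, '2019-09-24 13:00:00', 'event_1_2');

INSERT INTO events_right VALUES
    (42, '2019-09-24 11:59:00', 'event_2_1'),
    (42, '2019-09-24 12:30:00', 'event_2_2'),
    (42, '2019-09-24 13:00:00', 'event_2_3');

-- For each left-hand event, take the right-hand event of the same user whose
-- ev_time is the closest one that is still less than or equal to it.
SELECT l.event AS left_event, r.event AS right_event
FROM events_left AS l
ASOF LEFT JOIN events_right AS r
ON l.user_id == r.user_id AND r.ev_time <= l.ev_time;
```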
diff --git a/docs/ru/query_language/agg_functions/parametric_functions.md b/docs/ru/query_language/agg_functions/parametric_functions.md index 5bdf838d115..5adf20dfce5 100644 --- a/docs/ru/query_language/agg_functions/parametric_functions.md +++ b/docs/ru/query_language/agg_functions/parametric_functions.md @@ -2,6 +2,75 @@ Некоторые агрегатные функции могут принимать не только столбцы-аргументы (по которым производится свёртка), но и набор параметров - констант для инициализации. Синтаксис - две пары круглых скобок вместо одной. Первая - для параметров, вторая - для аргументов. +## histogram + +Рассчитывает адаптивную гистограмму. Не гарантирует точного результата. + +``` +histogram(number_of_bins)(values) +``` + +Функция использует [A Streaming Parallel Decision Tree Algorithm](http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf). Границы столбцов устанавливаются по мере поступления новых данных в функцию. В общем случае столбцы имею разную ширину. + +**Параметры** + +`number_of_bins` — максимальное количество корзин в гистограмме. Функция автоматически вычисляет количество корзин. Она пытается получить указанное количество корзин, но если не получилось, то в результате корзин будет меньше. +`values` — [выражение](../syntax.md#syntax-expressions), предоставляющее входные значения. + +**Возвращаемые значения** + +- [Массив](../../data_types/array.md) [кортежей](../../data_types/tuple.md) следующего вида: + + ``` + [(lower_1, upper_1, height_1), ... (lower_N, upper_N, height_N)] + ``` + + - `lower` — нижняя граница корзины. + - `upper` — верхняя граница корзины. + - `height` — количество значений в корзине. + +**Пример** + +```sql +SELECT histogram(5)(number + 1) +FROM ( + SELECT * + FROM system.numbers + LIMIT 20 +) +``` +```text +┌─histogram(5)(plus(number, 1))───────────────────────────────────────────┐ +│ [(1,4.5,4),(4.5,8.5,4),(8.5,12.75,4.125),(12.75,17,4.625),(17,20,3.25)] │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +С помощью функции [bar](../other_functions.md#function-bar) можно визуализировать гистограмму, например: + +```sql +WITH histogram(5)(rand() % 100) AS hist +SELECT + arrayJoin(hist).3 AS height, + bar(height, 0, 6, 5) AS bar +FROM +( + SELECT * + FROM system.numbers + LIMIT 20 +) +``` +```text +┌─height─┬─bar───┐ +│ 2.125 │ █▋ │ +│ 3.25 │ ██▌ │ +│ 5.625 │ ████▏ │ +│ 5.625 │ ████▏ │ +│ 3.375 │ ██▌ │ +└────────┴───────┘ +``` + +В этом случае необходимо помнить, что границы корзин гистограммы не известны. + ## sequenceMatch(pattern)(time, cond1, cond2, ...) Сопоставление с образцом для цепочки событий. From 5b38a7f44146cb9a6deff1477dc306706e242a8d Mon Sep 17 00:00:00 2001 From: BayoNet Date: Tue, 24 Sep 2019 03:01:19 +0300 Subject: [PATCH 227/309] DOCAPI-7438: EN review, RU translation. Update of quantileTiming docs (#7033) * Typo fix. * Update reference.md (#45) * DOCAPI-7438: RU translation --- .../query_language/agg_functions/reference.md | 16 +++--- docs/ru/data_types/float.md | 2 +- .../query_language/agg_functions/reference.md | 52 ++++++++++++++----- 3 files changed, 49 insertions(+), 21 deletions(-) diff --git a/docs/en/query_language/agg_functions/reference.md b/docs/en/query_language/agg_functions/reference.md index 9dd5a1586db..a7e9fa36741 100644 --- a/docs/en/query_language/agg_functions/reference.md +++ b/docs/en/query_language/agg_functions/reference.md @@ -850,7 +850,7 @@ Don't use this function for calculating timings. 
There is a more suitable functi ## quantileTiming {#agg_function-quantiletiming} -Computes the quantile of the specified level with determined precision. The function intended for calculating quantiles of page loading time in milliseconds. +Computes the quantile of the specified level with determined precision. The function is intended for calculating page loading time quantiles in milliseconds. ```sql quantileTiming(level)(expr) @@ -859,7 +859,7 @@ quantileTiming(level)(expr) **Parameters** - `level` — Quantile level. Range: [0, 1]. -- `expr` — [Expression](../syntax.md#syntax-expressions) returning number in the [Float*](../../data_types/float.md) type. The function expects input values in unix timestamp format in milliseconds, but it doesn't validate format. +- `expr` — [Expression](../syntax.md#syntax-expressions) returning a [Float*](../../data_types/float.md)-type number. The function expects input values in unix timestamp format in milliseconds, but it doesn't validate format. - If negative values are passed to the function, the behavior is undefined. - If the value is greater than 30,000 (a page loading time of more than 30 seconds), it is assumed to be 30,000. @@ -868,13 +868,13 @@ quantileTiming(level)(expr) The calculation is accurate if: -- Total number of values is not more than about 5670. -- Total number of values is more than about 5670, but the times of page loading is less than 1024ms. +- Total number of values doesn't exceed 5670. +- Total number of values exceeds 5670, but the page loading time is less than 1024ms. -Otherwise, the result of a calculation is rounded to the value, multiple of 16 ms. +Otherwise, the result of the calculation is rounded to the nearest multiple of 16 ms. !! note "Note" - For calculating quantiles of page loading times, this function is more effective and accurate compared to [quantile](#agg_function-quantile). + For calculating page loading time quantiles, this function is more effective and accurate than [quantile](#agg_function-quantile). **Returned value** @@ -883,9 +883,9 @@ Otherwise, the result of a calculation is rounded to the value, multiple of 16 m Type: `Float32`. !!! note "Note" - If no values were passed to the function (when using `quantileTimingIf`), [NaN](../../data_types/float.md#data_type-float-nan-inf) is returned. The purpose of this is to differentiate these cases from the cases which result in zero. See [ORDER BY clause](../select.md#select-order-by) for the note on sorting `NaN` values. + If no values are passed to the function (when using `quantileTimingIf`), [NaN](../../data_types/float.md#data_type-float-nan-inf) is returned. The purpose of this is to differentiate these cases from cases that result in zero. See [ORDER BY clause](../select.md#select-order-by) for notes on sorting `NaN` values. -The result is deterministic (it doesn't depend on the order of query processing). +The result is deterministic (it doesn't depend on the query processing order). **Example** diff --git a/docs/ru/data_types/float.md b/docs/ru/data_types/float.md index ce5132dcb9c..3eb9f4b8078 100644 --- a/docs/ru/data_types/float.md +++ b/docs/ru/data_types/float.md @@ -26,7 +26,7 @@ SELECT 1 - 0.9 - При вычислениях с плавающей запятой возможно появление таких категорий числа как бесконечность (`Inf`) и "не число" (`NaN`). Это необходимо учитывать при обработке результатов вычислений. - При чтении чисел с плавающей запятой из строк, в качестве результата может быть получено не обязательно ближайшее машинно-представимое число. 
-## NaN и Inf +## NaN и Inf {#data_type-float-nan-inf} В отличие от стандартного SQL, ClickHouse поддерживает следующие категории чисел с плавающей запятой: diff --git a/docs/ru/query_language/agg_functions/reference.md b/docs/ru/query_language/agg_functions/reference.md index 89922a30c6b..19a1a5ff56f 100644 --- a/docs/ru/query_language/agg_functions/reference.md +++ b/docs/ru/query_language/agg_functions/reference.md @@ -837,7 +837,7 @@ FROM t Функция `groupUniqArray(max_size)(x)` ограничивает размер результирующего массива до `max_size` элементов. Например, `groupUniqArray(1)(x)` равнозначно `[any(x)]`. -## quantile(level)(x) +## quantile(level)(x) {#agg_function-quantile} Приближённо вычисляет квантиль уровня level. level - константа, число с плавающей запятой от 0 до 1. Рекомендуется использовать значения level в диапазоне `[0.01, 0.99]`. @@ -864,27 +864,55 @@ FROM t Не используйте эту функцию для расчёта таймингов. Для этого есть более подходящая функция - `quantileTiming`. -## quantileTiming(level)(x) +## quantileTiming {#agg_function-quantiletiming} -Вычисляет квантиль уровня level с фиксированной точностью. -Работает для чисел. Предназначена для расчёта квантилей от времени загрузки страницы в миллисекундах. +Вычисляет квантиль заданного уровня с детерминированной точностью. Функция предназначена для расчётов квантилей времени загрузки страниц в миллисекундах. -Если значение больше 30000 (соответствует времени загрузки страницы большем 30 секундам) - результат приравнивается к 30000. +``` +quantileTiming(level)(expr) +``` -Если всего значений не больше примерно 5670, то вычисление точное. +**Параметры** -Иначе: +- `level` — уровень квантили. Диапазон: [0, 1]. +- `expr` — [выражение](../syntax.md#syntax-expressions), возвращающее число типа [Float*](../../data_types/float.md). Функция ожидает на вход значения в фомате UNIX-время в миллисекундах, но не проверяет формат входных значений. + + - Поведение функции не определено для отрицательных входных значений. + - Если входное значение больше 30,000 (т.е. время загрузки страницы превышает 30 секунд), оно приравнивается к 30,000. -- если время меньше 1024 мс., то вычисление точное. -- иначе вычисление идёт с округлением до числа, кратного 16 мс. +**Точность** -При передаче в функцию отрицательных значений, поведение не определено. +Вычисления точны если: -Возвращаемое значение имеет тип Float32. Когда в функцию не было передано ни одного значения (при использовании `quantileTimingIf`), возвращается nan. Это сделано, чтобы отличать такие случаи от нулей. Смотрите замечание о сортировке NaN-ов в разделе «Секция ORDER BY». +- Общее количество значений не превышает 5670. +- Общее количество значений больше 5670, но времена загрузки страниц меньше 1024мс. + +В противном случае, результат рассчетов округляется до ближайшего числа, кратного 16мс. + +!! note "Примечание" + Для расчёта квантилей времени загрузки страниц, функция работает эффективней и с более высокой точностью, чем функция [quantile](#agg_function-quantile). + +**Возвращаемое значение** + +- Квантиль заданного уровня. + +Тип: `Float32`. + +!!! note "Примечание" + Если в функцию не передано значений (для `quantileTimingIf`), возвращается [NaN](../../data_types/float.md#data_type-float-nan-inf). Это необходимо для того, что бы отделить такие случаи от случаев, в которых результат 0. Смотрите замечания о сортировке значений `NaN` в разделе [Секция ORDER BY](../select.md#select-order-by). Результат детерминирован (не зависит от порядка выполнения запроса). 
-Для своей задачи (расчёт квантилей времени загрузки страниц), использование этой функции эффективнее и результат точнее, чем для функции `quantile`. +**Пример** + +```sql +SELECT quantileTiming(0.5)(number / 2) FROM numbers(10) +``` +```text +┌─quantileTiming(0.5)(divide(number, 2))─┐ +│ 2 │ +└────────────────────────────────────────┘ +``` ## quantileTimingWeighted(level)(x, weight) From 6db4cb8117ee01dcea0a6e2a8c8349a23b8560ed Mon Sep 17 00:00:00 2001 From: BayoNet Date: Tue, 24 Sep 2019 03:02:36 +0300 Subject: [PATCH 228/309] DOCAPI-7430: EN review, RU translation. MergeTree INDEX bloom filter docs. (#7025) * Update mergetree.md (#38) * DOCAPI-7430: RU translation. --- docs/en/operations/table_engines/mergetree.md | 14 ++++++------ docs/ru/operations/table_engines/mergetree.md | 22 ++++++++++++++----- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/docs/en/operations/table_engines/mergetree.md b/docs/en/operations/table_engines/mergetree.md index 22e0c5e6068..82ea3b23cc7 100644 --- a/docs/en/operations/table_engines/mergetree.md +++ b/docs/en/operations/table_engines/mergetree.md @@ -47,7 +47,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] For a description of parameters, see the [CREATE query description](../../query_language/create.md). -!!! note "Note" +!!!note "Note" `INDEX` is an experimental feature, see [Data Skipping Indexes](#table_engine-mergetree-data_skipping-indexes). ### Query Clauses @@ -288,24 +288,24 @@ SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 - `ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` - Stores a [bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) that contains all ngrams from a block of data. Works only with strings. Can be used for optimization of `equals`, `like` and `in` expressions. + Stores a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) that contains all ngrams from a block of data. Works only with strings. Can be used for optimization of `equals`, `like` and `in` expressions. - `n` — ngram size, - `size_of_bloom_filter_in_bytes` — Bloom filter size in bytes (you can use large values here, for example, 256 or 512, because it can be compressed well). - - `number_of_hash_functions` — The number of hash functions used in the bloom filter. - - `random_seed` — The seed for bloom filter hash functions. + - `number_of_hash_functions` — The number of hash functions used in the Bloom filter. + - `random_seed` — The seed for Bloom filter hash functions. - `tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)` The same as `ngrambf_v1`, but stores tokens instead of ngrams. Tokens are sequences separated by non-alphanumeric characters. -- `bloom_filter([false_positive])` — Stores [bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) for the specified columns. +- `bloom_filter([false_positive])` — Stores a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) for the specified columns. - The `false_positive` optional parameter is the probability of false positive response from the filter. Possible values: (0, 1). Default value: 0.025. + The optional `false_positive` parameter is the probability of receiving a false positive response from the filter. Possible values: (0, 1). Default value: 0.025. Supported data types: `Int*`, `UInt*`, `Float*`, `Enum`, `Date`, `DateTime`, `String`, `FixedString`. 
- Supported for the following functions: [equals](../../query_language/functions/comparison_functions.md), [notEquals](../../query_language/functions/comparison_functions.md), [in](../../query_language/functions/in_functions.md), [notIn](../../query_language/functions/in_functions.md). + The following functions can use it: [equals](../../query_language/functions/comparison_functions.md), [notEquals](../../query_language/functions/comparison_functions.md), [in](../../query_language/functions/in_functions.md), [notIn](../../query_language/functions/in_functions.md). ```sql INDEX sample_index (u64 * length(s)) TYPE minmax GRANULARITY 4 diff --git a/docs/ru/operations/table_engines/mergetree.md b/docs/ru/operations/table_engines/mergetree.md index 0e03a6a0d75..61bdbc76457 100644 --- a/docs/ru/operations/table_engines/mergetree.md +++ b/docs/ru/operations/table_engines/mergetree.md @@ -44,7 +44,10 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] [SETTINGS name=value, ...] ``` -Описание параметров запроса смотрите в [описании запроса](../../query_language/create.md). +Описание параметров смотрите в [описании запроса CREATE](../../query_language/create.md). + +!!!note "Note" + `INDEX` — экспериментальная возможность, смотрите [Индексы пропуска данных](#table_engine-mergetree-data_skipping-indexes). ### Секции запроса @@ -244,7 +247,7 @@ ClickHouse не может использовать индекс, если зн ClickHouse использует эту логику не только для последовательностей дней месяца, но и для любого частично-монотонного первичного ключа. -### Дополнительные индексы (Экспериментальная функциональность) +### Индексы пропуска данных (экспериментальная функциональность) {#table_engine-mergetree-data_skipping-indexes} Для использования требуется установить настройку `allow_experimental_data_skipping_indices` в 1. (запустить `SET allow_experimental_data_skipping_indices = 1`). @@ -282,11 +285,18 @@ SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 #### Доступные индексы -* `minmax` -Хранит минимум и максимум выражения (если выражение - `tuple`, то для каждого элемента `tuple`), используя их для пропуска блоков аналогично первичному ключу. +- `minmax` — Хранит минимум и максимум выражения (если выражение - `tuple`, то для каждого элемента `tuple`), используя их для пропуска блоков аналогично первичному ключу. + +- `set(max_rows)` — Хранит уникальные значения выражения на блоке в количестве не более `max_rows` (если `max_rows = 0`, то ограничений нет), используя их для пропуска блоков, оценивая выполнимость `WHERE` выражения на хранимых данных. + +- `bloom_filter([false_positive])` — [фильтр Блума](https://en.wikipedia.org/wiki/Bloom_filter) для указанных стоблцов. + + Необязательный параметр `false_positive` — это вероятность получения ложноположительного срабатывания. Возможные значения: (0, 1). Значение по умолчанию: 0.025. + + Поддержанные типы данных: `Int*`, `UInt*`, `Float*`, `Enum`, `Date`, `DateTime`, `String`, `FixedString`. + + Фильтром могут пользоваться функции: [equals](../../query_language/functions/comparison_functions.md), [notEquals](../../query_language/functions/comparison_functions.md), [in](../../query_language/functions/in_functions.md), [notIn](../../query_language/functions/in_functions.md). -* `set(max_rows)` -Хранит уникальные значения выражения на блоке в количестве не более `max_rows` (если `max_rows = 0`, то ограничений нет), используя их для пропуска блоков, оценивая выполнимость `WHERE` выражения на хранимых данных. 
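Before the page's own examples below, a hedged sketch of the newly documented `bloom_filter` index type; the table and index names are invented, and `0.01` is just an illustrative false-positive probability (the documented default is 0.025).

```sql
SET allow_experimental_data_skipping_indices = 1;

-- Hypothetical table using the bloom_filter index type on a numeric column.
CREATE TABLE bf_demo
(
    id UInt64,
    code UInt32,
    INDEX code_bf (code) TYPE bloom_filter(0.01) GRANULARITY 4
)
ENGINE = MergeTree()
ORDER BY id;

-- equals and IN are among the functions this index can serve.
SELECT count() FROM bf_demo WHERE code = 404;
SELECT count() FROM bf_demo WHERE code IN (301, 302);
```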
**Примеры** From f26fdc63a01c6f154f5eba50c23284d0d0d08cdb Mon Sep 17 00:00:00 2001 From: BayoNet Date: Tue, 24 Sep 2019 03:04:52 +0300 Subject: [PATCH 229/309] DOCAPI-7415: EN review, RU translation. Docs for the -Resample aggregate function combinator. (#7017) * Update combinators.md (#39) * DOCAPI-7415: RU translation * DOCAPI-7415: fix. --- .../agg_functions/combinators.md | 26 +++---- .../agg_functions/combinators.md | 71 +++++++++++++++++++ .../query_language/agg_functions/reference.md | 2 +- 3 files changed, 85 insertions(+), 14 deletions(-) diff --git a/docs/en/query_language/agg_functions/combinators.md b/docs/en/query_language/agg_functions/combinators.md index 2f4662ba21e..ccad56083c4 100644 --- a/docs/en/query_language/agg_functions/combinators.md +++ b/docs/en/query_language/agg_functions/combinators.md @@ -16,7 +16,7 @@ The -Array suffix can be appended to any aggregate function. In this case, the a Example 1: `sumArray(arr)` - Totals all the elements of all 'arr' arrays. In this example, it could have been written more simply: `sum(arraySum(arr))`. -Example 2: `uniqArray(arr)` – Count the number of unique elements in all 'arr' arrays. This could be done an easier way: `uniq(arrayJoin(arr))`, but it's not always possible to add 'arrayJoin' to a query. +Example 2: `uniqArray(arr)` – Counts the number of unique elements in all 'arr' arrays. This could be done an easier way: `uniq(arrayJoin(arr))`, but it's not always possible to add 'arrayJoin' to a query. -If and -Array can be combined. However, 'Array' must come first, then 'If'. Examples: `uniqArrayIf(arr, cond)`, `quantilesTimingArrayIf(level1, level2)(arr, cond)`. Due to this order, the 'cond' argument can't be an array. @@ -44,9 +44,9 @@ Merges the intermediate aggregation states in the same way as the -Merge combina Converts an aggregate function for tables into an aggregate function for arrays that aggregates the corresponding array items and returns an array of results. For example, `sumForEach` for the arrays `[1, 2]`, `[3, 4, 5]`and`[6, 7]`returns the result `[10, 13, 5]` after adding together the corresponding array items. -## -Resample +## -Resample {#agg_functions-combinator-resample} -Allows to divide data by groups, and then separately aggregates the data in those groups. Groups are created by splitting the values of one of the columns into intervals. +Lets you divide data into groups, and then separately aggregates the data in those groups. Groups are created by splitting the values from one column into intervals. ```sql Resample(start, end, step)(, resampling_key) @@ -54,16 +54,16 @@ Allows to divide data by groups, and then separately aggregates the data in thos **Parameters** -- `start` — Starting value of the whole required interval for the values of `resampling_key`. -- `stop` — Ending value of the whole required interval for the values of `resampling_key`. The whole interval doesn't include the `stop` value `[start, stop)`. -- `step` — Step for separating the whole interval by subintervals. The `aggFunction` is executed over each of those subintervals independently. -- `resampling_key` — Column, which values are used for separating data by intervals. -- `aggFunction_params` — Parameters of `aggFunction`. +- `start` — Starting value of the whole required interval for `resampling_key` values. +- `stop` — Ending value of the whole required interval for `resampling_key` values. The whole interval doesn't include the `stop` value `[start, stop)`. 
+- `step` — Step for separating the whole interval into subintervals. The `aggFunction` is executed over each of those subintervals independently. +- `resampling_key` — Column whose values are used for separating data into intervals. +- `aggFunction_params` — `aggFunction` parameters. **Returned values** -- Array of `aggFunction` results for each of subintervals. +- Array of `aggFunction` results for each subinterval. **Example** @@ -80,9 +80,9 @@ Consider the `people` table with the following data: └────────┴─────┴──────┘ ``` -Let's get the names of the persons which age lies in the intervals of `[30,60)` and `[60,75)`. As we use integer representation of age, then there are ages of `[30, 59]` and `[60,74]`. +Let's get the names of the people whose age lies in the intervals of `[30,60)` and `[60,75)`. Since we use integer representation for age, we get ages in the `[30, 59]` and `[60,74]` intervals. -For aggregating names into the array, we use the aggregate function [groupArray](reference.md#agg_function-grouparray). It takes a single argument. For our case, it is the `name` column. The `groupArrayResample` function should use the `age` column to aggregate names by age. To define required intervals, we pass the `(30, 75, 30)` arguments into the `groupArrayResample` function. +To aggregate names in an array, we use the [groupArray](reference.md#agg_function-grouparray) aggregate function. It takes one argument. In our case, it's the `name` column. The `groupArrayResample` function should use the `age` column to aggregate names by age. To define the required intervals, we pass the `30, 75, 30` arguments into the `groupArrayResample` function. ```sql SELECT groupArrayResample(30, 75, 30)(name, age) from people @@ -95,9 +95,9 @@ SELECT groupArrayResample(30, 75, 30)(name, age) from people Consider the results. -`Jonh` is out of the sample because he is too young. Other people are distributed according to the specified age intervals. +`Jonh` is out of the sample because he's too young. Other people are distributed according to the specified age intervals. -Now, let's count the total number of people and their average wage in the specified age intervals. +Now let's count the total number of people and their average wage in the specified age intervals. ```sql SELECT diff --git a/docs/ru/query_language/agg_functions/combinators.md b/docs/ru/query_language/agg_functions/combinators.md index 1fcdb111e17..68e32fb7032 100644 --- a/docs/ru/query_language/agg_functions/combinators.md +++ b/docs/ru/query_language/agg_functions/combinators.md @@ -46,4 +46,75 @@ Преобразует агрегатную функцию для таблиц в агрегатную функцию для массивов, которая применяет агрегирование для соответствующих элементов массивов и возвращает массив результатов. Например, `sumForEach` для массивов `[1, 2]`, `[3, 4, 5]` и `[6, 7]` даст результат `[10, 13, 5]`, сложив соответственные элементы массивов. + +## -Resample {#agg_functions-combinator-resample} + + +Позволяет поделить данные на группы, а затем по-отдельности агрегирует данные для этих групп. Группы образуются разбиением значений одного из столбцов на интервалы. + +```sql +Resample(start, end, step)(, resampling_key) +``` + +**Параметры** + +- `start` — начальное значение для интервала значений `resampling_key`. +- `stop` — конечное значение для интервала значений `resampling_key`. Интервал не включает значение `stop` (`[start, stop)`). +- `step` — шаг деления полного интервала на подинтервалы. Функция `aggFunction` выполняется для каждого из подинтервалов независимо. 
+- `resampling_key` — столбец, значения которого используются для разделения данных на интервалы. +- `aggFunction_params` — параметры `aggFunction`. + + +**Возвращаемые значения** + +- Массив результатов `aggFunction` для каждого подинтервала. + +**Пример** + + +Рассмотрим таблицу `people` со следующими данными: + +```text +┌─name───┬─age─┬─wage─┐ +│ John │ 16 │ 10 │ +│ Alice │ 30 │ 15 │ +│ Mary │ 35 │ 8 │ +│ Evelyn │ 48 │ 11.5 │ +│ David │ 62 │ 9.9 │ +│ Brian │ 60 │ 16 │ +└────────┴─────┴──────┘ +``` + +Получим имена людей, чей возраст находится в интервалах `[30,60)` и `[60,75)`. Поскольку мы используем целочисленное представление возраста, то интервалы будут выглядеть как `[30, 59]` и `[60,74]`. + +Чтобы собрать имена в массив, возьмём агрегатную функцию [groupArray](reference.md#agg_function-grouparray). Она принимает один аргумент. В нашем случае, это столбец `name`. Функция `groupArrayResample` должна использовать столбец `age` для агрегирования имён по возрасту. Чтобы определить необходимые интервалы, передадим в функцию `groupArrayResample` аргументы `30, 75, 30`. + +```sql +SELECT groupArrayResample(30, 75, 30)(name, age) from people +``` +```text +┌─groupArrayResample(30, 75, 30)(name, age)─────┐ +│ [['Alice','Mary','Evelyn'],['David','Brian']] │ +└───────────────────────────────────────────────┘ +``` + +Посмотрим на результаты. + +`Jonh` не попал в выдачу, поскольку слишком молод. Остальные распределены согласно заданным возрастным интервалам. + +Теперь посчитаем общее количество людей и их среднюю заработную плату в заданных возрастных интервалах. + + +```sql +SELECT + countResample(30, 75, 30)(name, age) AS amount, + avgResample(30, 75, 30)(wage, age) AS avg_wage +FROM people +``` +```text +┌─amount─┬─avg_wage──────────────────┐ +│ [3,2] │ [11.5,12.949999809265137] │ +└────────┴───────────────────────────┘ +``` + [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/agg_functions/combinators/) diff --git a/docs/ru/query_language/agg_functions/reference.md b/docs/ru/query_language/agg_functions/reference.md index 19a1a5ff56f..bcf362d4c50 100644 --- a/docs/ru/query_language/agg_functions/reference.md +++ b/docs/ru/query_language/agg_functions/reference.md @@ -661,7 +661,7 @@ uniqExact(x[, ...]) - [uniqCombined](#agg_function-uniqcombined) - [uniqHLL12](#agg_function-uniqhll12) -## groupArray(x), groupArray(max_size)(x) +## groupArray(x), groupArray(max_size)(x) {#agg_function-grouparray} Составляет массив из значений аргумента. Значения в массив могут быть добавлены в любом (недетерминированном) порядке. From 981e3e6ef73e70dc9ccb8892127788e293ec6ffa Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 24 Sep 2019 03:22:05 +0300 Subject: [PATCH 230/309] Fixed build with CMake 3.10. 
--- cmake/target.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/target.cmake b/cmake/target.cmake index eb11fc57cf1..1be6abe8152 100644 --- a/cmake/target.cmake +++ b/cmake/target.cmake @@ -1,12 +1,12 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux") set (OS_LINUX 1) - add_compile_definitions(OS_LINUX) + add_definitions(-D OS_LINUX) elseif (CMAKE_SYSTEM_NAME MATCHES "FreeBSD") set (OS_FREEBSD 1) - add_compile_definitions(OS_FREEBSD) + add_definitions(-D OS_FREEBSD) elseif (CMAKE_SYSTEM_NAME MATCHES "Darwin") set (OS_DARWIN 1) - add_compile_definitions(OS_DARWIN) + add_definitions(-D OS_DARWIN) endif () if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") From dcd40d1d07425ec44d4db24e264a1d30d5887e93 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 24 Sep 2019 03:45:40 +0300 Subject: [PATCH 231/309] Better log messages about disks --- dbms/src/Common/DiskSpaceMonitor.cpp | 23 +++++++++++++---------- dbms/src/IO/WriteHelpers.cpp | 21 +++++++++++++++++++++ dbms/src/IO/WriteHelpers.h | 6 ++++++ dbms/src/Parsers/IAST.cpp | 20 -------------------- dbms/src/Parsers/IAST.h | 6 +----- 5 files changed, 41 insertions(+), 35 deletions(-) diff --git a/dbms/src/Common/DiskSpaceMonitor.cpp b/dbms/src/Common/DiskSpaceMonitor.cpp index 5b07e11f31b..967aa34ee40 100644 --- a/dbms/src/Common/DiskSpaceMonitor.cpp +++ b/dbms/src/Common/DiskSpaceMonitor.cpp @@ -1,10 +1,12 @@ #include +#include +#include #include -#include #include + namespace DB { @@ -45,7 +47,7 @@ std::filesystem::path getMountPoint(std::filesystem::path absolute_path) return absolute_path; } - /// Returns name of filesystem mounted to mount_point +/// Returns name of filesystem mounted to mount_point #if !defined(__linux__) [[noreturn]] #endif @@ -65,7 +67,7 @@ std::string getFilesystemName([[maybe_unused]] const std::string & mount_point) throw DB::Exception("Cannot find name of filesystem by mount point " + mount_point, ErrorCodes::SYSTEM_ERROR); return fs_info.mnt_fsname; #else - throw DB::Exception("Supported on linux only", ErrorCodes::NOT_IMPLEMENTED); + throw DB::Exception("The function getFilesystemName is supported on Linux only", ErrorCodes::NOT_IMPLEMENTED); #endif } @@ -82,7 +84,7 @@ bool Disk::tryReserve(UInt64 bytes) const std::lock_guard lock(mutex); if (bytes == 0) { - LOG_DEBUG(&Logger::get("DiskSpaceMonitor"), "Reserving 0 bytes on disk " << name); + LOG_DEBUG(&Logger::get("DiskSpaceMonitor"), "Reserving 0 bytes on disk " << backQuote(name)); ++reservation_count; return true; } @@ -93,7 +95,8 @@ bool Disk::tryReserve(UInt64 bytes) const { LOG_DEBUG( &Logger::get("DiskSpaceMonitor"), - "Reserving " << bytes << " bytes on disk " << name << " having unreserved " << unreserved_space << " bytes."); + "Reserving " << formatReadableSizeWithBinarySuffix(bytes) << " on disk " << backQuote(name) + << ", having unreserved " << formatReadableSizeWithBinarySuffix(unreserved_space) << "."); ++reservation_count; reserved_bytes += bytes; return true; @@ -283,14 +286,14 @@ Volume::Volume( max_data_part_size = static_cast(sum_size * ratio / disks.size()); for (size_t i = 0; i < disks.size(); ++i) if (sizes[i] < max_data_part_size) - LOG_WARNING(logger, "Disk " << disks[i]->getName() << " on volume " << config_prefix << - " have not enough space (" << sizes[i] << + LOG_WARNING(logger, "Disk " << backQuote(disks[i]->getName()) << " on volume " << backQuote(config_prefix) << + " have not enough space (" << formatReadableSizeWithBinarySuffix(sizes[i]) << ") for containing part the size of max_data_part_size (" << - 
max_data_part_size << ")"); + formatReadableSizeWithBinarySuffix(max_data_part_size) << ")"); } constexpr UInt64 MIN_PART_SIZE = 8u * 1024u * 1024u; if (max_data_part_size < MIN_PART_SIZE) - LOG_WARNING(logger, "Volume '" << name << "' max_data_part_size is too low (" + LOG_WARNING(logger, "Volume " << backQuote(name) << " max_data_part_size is too low (" << formatReadableSizeWithBinarySuffix(max_data_part_size) << " < " << formatReadableSizeWithBinarySuffix(MIN_PART_SIZE) << ")"); } @@ -505,7 +508,7 @@ StoragePolicySelector::StoragePolicySelector( ErrorCodes::EXCESSIVE_ELEMENT_IN_CONFIG); policies.emplace(name, std::make_shared(name, config, config_prefix + "." + name, disks)); - LOG_INFO(&Logger::get("StoragePolicySelector"), "Storage policy " << name << " loaded"); + LOG_INFO(&Logger::get("StoragePolicySelector"), "Storage policy " << backQuote(name) << " loaded"); } constexpr auto default_storage_policy_name = "default"; diff --git a/dbms/src/IO/WriteHelpers.cpp b/dbms/src/IO/WriteHelpers.cpp index 5dc2358c4c0..0b5bce27b46 100644 --- a/dbms/src/IO/WriteHelpers.cpp +++ b/dbms/src/IO/WriteHelpers.cpp @@ -67,4 +67,25 @@ void writeException(const Exception & e, WriteBuffer & buf, bool with_stack_trac writeException(Exception(Exception::CreateFromPoco, *e.nested()), buf, with_stack_trace); } + +String backQuoteIfNeed(const String & x) +{ + String res(x.size(), '\0'); + { + WriteBufferFromString wb(res); + writeProbablyBackQuotedString(x, wb); + } + return res; +} + +String backQuote(const String & x) +{ + String res(x.size(), '\0'); + { + WriteBufferFromString wb(res); + writeBackQuotedString(x, wb); + } + return res; +} + } diff --git a/dbms/src/IO/WriteHelpers.h b/dbms/src/IO/WriteHelpers.h index 44d7b7ab540..ab3fad08860 100644 --- a/dbms/src/IO/WriteHelpers.h +++ b/dbms/src/IO/WriteHelpers.h @@ -906,4 +906,10 @@ inline String toString(const T & x) return buf.str(); } + +/// Quote the identifier with backquotes, if required. +String backQuoteIfNeed(const String & x); +/// Quote the identifier with backquotes. +String backQuote(const String & x); + } diff --git a/dbms/src/Parsers/IAST.cpp b/dbms/src/Parsers/IAST.cpp index 0cc15c51c23..eb9f1462666 100644 --- a/dbms/src/Parsers/IAST.cpp +++ b/dbms/src/Parsers/IAST.cpp @@ -26,26 +26,6 @@ const char * IAST::hilite_substitution = "\033[1;36m"; const char * IAST::hilite_none = "\033[0m"; -String backQuoteIfNeed(const String & x) -{ - String res(x.size(), '\0'); - { - WriteBufferFromString wb(res); - writeProbablyBackQuotedString(x, wb); - } - return res; -} - -String backQuote(const String & x) -{ - String res(x.size(), '\0'); - { - WriteBufferFromString wb(res); - writeBackQuotedString(x, wb); - } - return res; -} - size_t IAST::size() const { size_t res = 1; diff --git a/dbms/src/Parsers/IAST.h b/dbms/src/Parsers/IAST.h index f30bdef0e2b..c896ed2ce3f 100644 --- a/dbms/src/Parsers/IAST.h +++ b/dbms/src/Parsers/IAST.h @@ -5,6 +5,7 @@ #include #include #include +#include /// backQuote, backQuoteIfNeed #include #include @@ -223,9 +224,4 @@ private: }; -/// Quote the identifier with backquotes, if required. -String backQuoteIfNeed(const String & x); -/// Quote the identifier with backquotes. 
-String backQuote(const String & x); - } From 4761436a524a295d4dff987a481342ad1cd6ffae Mon Sep 17 00:00:00 2001 From: Sergei Bocharov Date: Tue, 24 Sep 2019 03:50:58 +0300 Subject: [PATCH 232/309] Docs(hash_functions): fix `xxHash32`, `xxHash64` (#6992) * docs(hash_functions): fix xxHash32, xxHash64 * Fixes after review * Fixes after review * Update hash_functions.md * Update hash_functions.md --- .../functions/hash_functions.md | 40 +++++++++++++++++-- .../functions/hash_functions.md | 40 +++++++++++++++++-- 2 files changed, 72 insertions(+), 8 deletions(-) diff --git a/docs/en/query_language/functions/hash_functions.md b/docs/en/query_language/functions/hash_functions.md index ff59a6639ab..fde41d97da4 100644 --- a/docs/en/query_language/functions/hash_functions.md +++ b/docs/en/query_language/functions/hash_functions.md @@ -355,10 +355,42 @@ SELECT murmurHash3_128('example_string') AS MurmurHash3, toTypeName(MurmurHash3) └──────────────────┴─────────────────┘ ``` -## xxHash32, xxHash64 +## xxHash32, xxHash64 {#hash_functions-xxhash32} -Calculates xxHash from a string. -Accepts a String-type argument. Returns UInt64 Or UInt32. -For more information, see the link: [xxHash](http://cyan4973.github.io/xxHash/) +Calculates `xxHash` from a string. It is proposed in two flavors, 32 and 64 bits. + +```sql +SELECT xxHash32(''); + +OR + +SELECT xxHash64(''); +``` + +**Returned value** + +A `Uint32` or `Uint64` data type hash value. + +Type: `xxHash`. + +**Example** + +Query: + +```sql +SELECT xxHash32('Hello, world!'); +``` + +Result: + +```text +┌─xxHash32('Hello, world!')─┐ +│ 834093149 │ +└───────────────────────────┘ +``` + +**See Also** + +- [xxHash](http://cyan4973.github.io/xxHash/). [Original article](https://clickhouse.yandex/docs/en/query_language/functions/hash_functions/) diff --git a/docs/ru/query_language/functions/hash_functions.md b/docs/ru/query_language/functions/hash_functions.md index 8a38f61980d..44793b0e290 100644 --- a/docs/ru/query_language/functions/hash_functions.md +++ b/docs/ru/query_language/functions/hash_functions.md @@ -361,10 +361,42 @@ SELECT murmurHash3_128('example_string') AS MurmurHash3, toTypeName(MurmurHash3) └──────────────────┴─────────────────┘ ``` -## xxHash32, xxHash64 +## xxHash32, xxHash64 {#hash_functions-xxhash32-xxhash64} -Вычисляет xxHash от строки. -Принимает аргумент типа String. Возвращает значение типа Uint64 или Uint32. -Дополнительные сведения см. по ссылке: [xxHash](http://cyan4973.github.io/xxHash/) +Вычисляет `xxHash` от строки. Предлагается в двух вариантах: 32 и 64 бита. + +```sql +SELECT xxHash32(''); + +OR + +SELECT xxHash64(''); +``` + +**Возвращаемое значение** + +Хэш-значение типа `Uint32` или `Uint64`. + +Тип: `xxHash`. + +**Пример** + +Запрос: + +```sql +SELECT xxHash32('Hello, world!'); +``` + +Ответ: + +```text +┌─xxHash32('Hello, world!')─┐ +│ 834093149 │ +└───────────────────────────┘ +``` + +**Смотрите также** + +- [xxHash](http://cyan4973.github.io/xxHash/). [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/hash_functions/) From 7384e04430eff36dfac4655c33240aa3cbb79d21 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 24 Sep 2019 04:29:26 +0300 Subject: [PATCH 233/309] Revert "CHYT-142: extend KeyCondition interface so that it returns BoolMask." This reverts commit 2cacc3cfd2ad4f5aa266eadec87cd59c9bfe9fb3. 
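
For context when reading the hunks below: the change being reverted made `checkInRange()` and `checkInParallelogram()` return the full `BoolMask` (a pair of flags saying whether the condition can evaluate to true and whether it can evaluate to false over the given key range), while the restored code keeps the mask internal and returns only `.can_be_true`, as `mayBeTrueInRange()` did before. Below is a simplified sketch of the mask, inferred from the usage visible in this diff (two-argument construction, `operator|` to combine sub-ranges, `operator!` for negation); it is an illustration, not the exact ClickHouse definition.

```cpp
#include <cassert>

// Sketch of the BoolMask used on the RPN stack in KeyCondition (simplified).
struct BoolMask
{
    bool can_be_true = false;
    bool can_be_false = false;

    BoolMask() = default;
    BoolMask(bool can_be_true_, bool can_be_false_)
        : can_be_true(can_be_true_), can_be_false(can_be_false_) {}

    /// Combining two sub-ranges: the condition can be true (or false) if it can be in either part.
    BoolMask operator|(const BoolMask & other) const
    {
        return {can_be_true || other.can_be_true, can_be_false || other.can_be_false};
    }

    /// Negation swaps the two flags.
    BoolMask operator!() const
    {
        return {can_be_false, can_be_true};
    }
};

int main()
{
    BoolMask first_half(true, false);    /// condition is always true on this sub-range
    BoolMask second_half(false, true);   /// condition is always false on this sub-range

    BoolMask whole = first_half | second_half;
    assert(whole.can_be_true && whole.can_be_false);

    BoolMask negated = !first_half;
    assert(!negated.can_be_true && negated.can_be_false);
}
```

With this shape an RPN evaluator can push one mask per element and handle `FUNCTION_NOT_IN_SET` by applying `!`, which is the pattern visible in the `checkInParallelogram` / `mayBeTrueInParallelogram` hunk below; the revert only narrows what the public methods return.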
--- dbms/src/Interpreters/Set.cpp | 2 +- dbms/src/Interpreters/Set.h | 4 +- dbms/src/Storages/MergeTree/KeyCondition.cpp | 39 ++++++++++--------- dbms/src/Storages/MergeTree/KeyCondition.h | 14 +++---- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 12 +++--- .../MergeTree/MergeTreeIndexFullText.cpp | 4 +- .../MergeTree/MergeTreeIndexMinMax.cpp | 2 +- 7 files changed, 39 insertions(+), 38 deletions(-) diff --git a/dbms/src/Interpreters/Set.cpp b/dbms/src/Interpreters/Set.cpp index 4313decd36d..68c219c3a91 100644 --- a/dbms/src/Interpreters/Set.cpp +++ b/dbms/src/Interpreters/Set.cpp @@ -470,7 +470,7 @@ MergeTreeSetIndex::MergeTreeSetIndex(const Columns & set_elements, std::vector & key_ranges, const DataTypes & data_types) +BoolMask MergeTreeSetIndex::mayBeTrueInRange(const std::vector & key_ranges, const DataTypes & data_types) { size_t tuple_size = indexes_mapping.size(); diff --git a/dbms/src/Interpreters/Set.h b/dbms/src/Interpreters/Set.h index a252f1ebc1e..987252e37ba 100644 --- a/dbms/src/Interpreters/Set.h +++ b/dbms/src/Interpreters/Set.h @@ -170,7 +170,7 @@ using Sets = std::vector; class IFunction; using FunctionPtr = std::shared_ptr; -/// Class for checkInRange function. +/// Class for mayBeTrueInRange function. class MergeTreeSetIndex { public: @@ -188,7 +188,7 @@ public: size_t size() const { return ordered_set.at(0)->size(); } - BoolMask checkInRange(const std::vector & key_ranges, const DataTypes & data_types); + BoolMask mayBeTrueInRange(const std::vector & key_ranges, const DataTypes & data_types); private: Columns ordered_set; diff --git a/dbms/src/Storages/MergeTree/KeyCondition.cpp b/dbms/src/Storages/MergeTree/KeyCondition.cpp index 0ebe8f79aba..b3e4c776605 100644 --- a/dbms/src/Storages/MergeTree/KeyCondition.cpp +++ b/dbms/src/Storages/MergeTree/KeyCondition.cpp @@ -886,7 +886,7 @@ String KeyCondition::toString() const */ template -static BoolMask forAnyParallelogram( +static bool forAnyParallelogram( size_t key_size, const Field * key_left, const Field * key_right, @@ -942,15 +942,16 @@ static BoolMask forAnyParallelogram( for (size_t i = prefix_size + 1; i < key_size; ++i) parallelogram[i] = Range(); - BoolMask result(false, false); - result = result | callback(parallelogram); + if (callback(parallelogram)) + return true; /// [x1] x [y1 .. +inf) if (left_bounded) { parallelogram[prefix_size] = Range(key_left[prefix_size]); - result = result | forAnyParallelogram(key_size, key_left, key_right, true, false, parallelogram, prefix_size + 1, callback); + if (forAnyParallelogram(key_size, key_left, key_right, true, false, parallelogram, prefix_size + 1, callback)) + return true; } /// [x2] x (-inf .. 
y2] @@ -958,14 +959,15 @@ static BoolMask forAnyParallelogram( if (right_bounded) { parallelogram[prefix_size] = Range(key_right[prefix_size]); - result = result | forAnyParallelogram(key_size, key_left, key_right, false, true, parallelogram, prefix_size + 1, callback); + if (forAnyParallelogram(key_size, key_left, key_right, false, true, parallelogram, prefix_size + 1, callback)) + return true; } - return result; + return false; } -BoolMask KeyCondition::checkInRange( +bool KeyCondition::mayBeTrueInRange( size_t used_key_size, const Field * left_key, const Field * right_key, @@ -991,7 +993,7 @@ BoolMask KeyCondition::checkInRange( return forAnyParallelogram(used_key_size, left_key, right_key, true, right_bounded, key_ranges, 0, [&] (const std::vector & key_ranges_parallelogram) { - auto res = checkInParallelogram(key_ranges_parallelogram, data_types); + auto res = mayBeTrueInParallelogram(key_ranges_parallelogram, data_types); /* std::cerr << "Parallelogram: "; for (size_t i = 0, size = key_ranges.size(); i != size; ++i) @@ -1002,11 +1004,11 @@ BoolMask KeyCondition::checkInRange( }); } - std::optional KeyCondition::applyMonotonicFunctionsChainToRange( Range key_range, MonotonicFunctionsChain & functions, - DataTypePtr current_type) + DataTypePtr current_type +) { for (auto & func : functions) { @@ -1039,7 +1041,7 @@ std::optional KeyCondition::applyMonotonicFunctionsChainToRange( return key_range; } -BoolMask KeyCondition::checkInParallelogram(const std::vector & parallelogram, const DataTypes & data_types) const +bool KeyCondition::mayBeTrueInParallelogram(const std::vector & parallelogram, const DataTypes & data_types) const { std::vector rpn_stack; for (size_t i = 0; i < rpn.size(); ++i) @@ -1087,7 +1089,7 @@ BoolMask KeyCondition::checkInParallelogram(const std::vector & parallelo if (!element.set_index) throw Exception("Set for IN is not created yet", ErrorCodes::LOGICAL_ERROR); - rpn_stack.emplace_back(element.set_index->checkInRange(parallelogram, data_types)); + rpn_stack.emplace_back(element.set_index->mayBeTrueInRange(parallelogram, data_types)); if (element.function == RPNElement::FUNCTION_NOT_IN_SET) rpn_stack.back() = !rpn_stack.back(); } @@ -1122,23 +1124,22 @@ BoolMask KeyCondition::checkInParallelogram(const std::vector & parallelo } if (rpn_stack.size() != 1) - throw Exception("Unexpected stack size in KeyCondition::checkInRange", ErrorCodes::LOGICAL_ERROR); + throw Exception("Unexpected stack size in KeyCondition::mayBeTrueInRange", ErrorCodes::LOGICAL_ERROR); - return rpn_stack[0]; + return rpn_stack[0].can_be_true; } -BoolMask KeyCondition::checkInRange( +bool KeyCondition::mayBeTrueInRange( size_t used_key_size, const Field * left_key, const Field * right_key, const DataTypes & data_types) const { - return checkInRange(used_key_size, left_key, right_key, data_types, true); + return mayBeTrueInRange(used_key_size, left_key, right_key, data_types, true); } - -BoolMask KeyCondition::getMaskAfter( +bool KeyCondition::mayBeTrueAfter( size_t used_key_size, const Field * left_key, const DataTypes & data_types) const { - return checkInRange(used_key_size, left_key, nullptr, data_types, false); + return mayBeTrueInRange(used_key_size, left_key, nullptr, data_types, false); } diff --git a/dbms/src/Storages/MergeTree/KeyCondition.h b/dbms/src/Storages/MergeTree/KeyCondition.h index 2a5c520b243..61989d1b2d9 100644 --- a/dbms/src/Storages/MergeTree/KeyCondition.h +++ b/dbms/src/Storages/MergeTree/KeyCondition.h @@ -235,17 +235,17 @@ public: const Names & key_column_names, const 
ExpressionActionsPtr & key_expr); - /// Whether the condition and its negation are (independently) feasible in the key range. + /// Whether the condition is feasible in the key range. /// left_key and right_key must contain all fields in the sort_descr in the appropriate order. /// data_types - the types of the key columns. - BoolMask checkInRange(size_t used_key_size, const Field * left_key, const Field * right_key, const DataTypes & data_types) const; + bool mayBeTrueInRange(size_t used_key_size, const Field * left_key, const Field * right_key, const DataTypes & data_types) const; - /// Whether the condition and its negation are feasible in the direct product of single column ranges specified by `parallelogram`. - BoolMask checkInParallelogram(const std::vector & parallelogram, const DataTypes & data_types) const; + /// Whether the condition is feasible in the direct product of single column ranges specified by `parallelogram`. + bool mayBeTrueInParallelogram(const std::vector & parallelogram, const DataTypes & data_types) const; - /// Are the condition and its negation valid in a semi-infinite (not limited to the right) key range. + /// Is the condition valid in a semi-infinite (not limited to the right) key range. /// left_key must contain all the fields in the sort_descr in the appropriate order. - BoolMask getMaskAfter(size_t used_key_size, const Field * left_key, const DataTypes & data_types) const; + bool mayBeTrueAfter(size_t used_key_size, const Field * left_key, const DataTypes & data_types) const; /// Checks that the index can not be used. bool alwaysUnknownOrTrue() const; @@ -330,7 +330,7 @@ public: static const AtomMap atom_map; private: - BoolMask checkInRange( + bool mayBeTrueInRange( size_t used_key_size, const Field * left_key, const Field * right_key, diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 40dc0bf6b52..99b4a49d111 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -269,8 +269,8 @@ BlockInputStreams MergeTreeDataSelectExecutor::readFromParts( if (part->isEmpty()) continue; - if (minmax_idx_condition && !minmax_idx_condition->checkInParallelogram( - part->minmax_idx.parallelogram, data.minmax_idx_column_types).can_be_true) + if (minmax_idx_condition && !minmax_idx_condition->mayBeTrueInParallelogram( + part->minmax_idx.parallelogram, data.minmax_idx_column_types)) continue; if (max_block_numbers_to_read) @@ -1200,8 +1200,8 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange( for (size_t i = 0; i < used_key_size; ++i) index[i]->get(range.begin, index_left[i]); - may_be_true = key_condition.getMaskAfter( - used_key_size, index_left.data(), data.primary_key_data_types).can_be_true; + may_be_true = key_condition.mayBeTrueAfter( + used_key_size, index_left.data(), data.primary_key_data_types); } else { @@ -1214,8 +1214,8 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange( index[i]->get(range.end, index_right[i]); } - may_be_true = key_condition.checkInRange( - used_key_size, index_left.data(), index_right.data(), data.primary_key_data_types).can_be_true; + may_be_true = key_condition.mayBeTrueInRange( + used_key_size, index_left.data(), index_right.data(), data.primary_key_data_types); } if (!may_be_true) diff --git a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp index 246ad6784b2..264c91cd890 
100644 --- a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp @@ -378,11 +378,11 @@ bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx rpn_stack.emplace_back(true, false); } else - throw Exception("Unexpected function type in BloomFilterCondition::RPNElement", ErrorCodes::LOGICAL_ERROR); + throw Exception("Unexpected function type in KeyCondition::RPNElement", ErrorCodes::LOGICAL_ERROR); } if (rpn_stack.size() != 1) - throw Exception("Unexpected stack size in BloomFilterCondition::mayBeTrueOnGranule", ErrorCodes::LOGICAL_ERROR); + throw Exception("Unexpected stack size in KeyCondition::mayBeTrueInRange", ErrorCodes::LOGICAL_ERROR); return rpn_stack[0].can_be_true; } diff --git a/dbms/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp b/dbms/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp index 360e69eacc6..37c094db215 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp @@ -143,7 +143,7 @@ bool MergeTreeIndexConditionMinMax::mayBeTrueOnGranule(MergeTreeIndexGranulePtr for (const auto & range : granule->parallelogram) if (range.left.isNull() || range.right.isNull()) return true; - return condition.checkInParallelogram(granule->parallelogram, index.data_types).can_be_true; + return condition.mayBeTrueInParallelogram(granule->parallelogram, index.data_types); } From f8b7cc86931ab33362dcbe452db79bff408adf47 Mon Sep 17 00:00:00 2001 From: Yuriy Baranov Date: Tue, 24 Sep 2019 05:11:52 +0300 Subject: [PATCH 234/309] Revert "Updated MariaDB" --- contrib/CMakeLists.txt | 16 +- contrib/mariadb-connector-c | 2 +- .../mariadb-connector-c-cmake/CMakeLists.txt | 74 +++ .../common/include/mysql/mysql.h | 1 + .../common/include/mysql/mysqld_error.h | 1 + .../linux_x86_64/include/config.h | 269 ++++++++++ .../linux_x86_64/include/ma_config.h | 269 ++++++++++ .../linux_x86_64/include/mariadb_version.h | 36 ++ .../libmariadb/ma_client_plugin.c | 502 ++++++++++++++++++ libs/libmysqlxx/CMakeLists.txt | 3 +- libs/libmysqlxx/cmake/find_mysqlclient.cmake | 4 +- libs/libmysqlxx/src/Connection.cpp | 4 +- libs/libmysqlxx/src/Exception.cpp | 4 +- libs/libmysqlxx/src/Pool.cpp | 6 +- libs/libmysqlxx/src/Query.cpp | 4 +- libs/libmysqlxx/src/ResultBase.cpp | 4 +- libs/libmysqlxx/src/Row.cpp | 4 +- libs/libmysqlxx/src/StoreQueryResult.cpp | 4 +- libs/libmysqlxx/src/UseQueryResult.cpp | 4 +- 19 files changed, 1179 insertions(+), 32 deletions(-) create mode 100644 contrib/mariadb-connector-c-cmake/CMakeLists.txt create mode 100644 contrib/mariadb-connector-c-cmake/common/include/mysql/mysql.h create mode 100644 contrib/mariadb-connector-c-cmake/common/include/mysql/mysqld_error.h create mode 100644 contrib/mariadb-connector-c-cmake/linux_x86_64/include/config.h create mode 100644 contrib/mariadb-connector-c-cmake/linux_x86_64/include/ma_config.h create mode 100644 contrib/mariadb-connector-c-cmake/linux_x86_64/include/mariadb_version.h create mode 100644 contrib/mariadb-connector-c-cmake/linux_x86_64/libmariadb/ma_client_plugin.c diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 5e6f90b6a59..0833614594d 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -65,7 +65,7 @@ if (USE_INTERNAL_ZLIB_LIBRARY) endif () add_subdirectory (${INTERNAL_ZLIB_NAME}) - # TODO: make pull to Dead2/zlib-ng and remove: + # todo: make pull to Dead2/zlib-ng and remove: # We should use same defines when including zlib.h as used when zlib compiled 
target_compile_definitions (zlib PUBLIC ZLIB_COMPAT WITH_GZFILEOP) target_compile_definitions (zlibstatic PUBLIC ZLIB_COMPAT WITH_GZFILEOP) @@ -125,15 +125,11 @@ if (USE_INTERNAL_SSL_LIBRARY) endif () if (ENABLE_MYSQL AND USE_INTERNAL_MYSQL_LIBRARY) - set(CLIENT_PLUGIN_CACHING_SHA2_PASSWORD STATIC) - set(CLIENT_PLUGIN_SHA256_PASSWORD STATIC) - set(CLIENT_PLUGIN_REMOTE_IO OFF) - set(CLIENT_PLUGIN_DIALOG OFF) - set(CLIENT_PLUGIN_CLIENT_ED25519 OFF) - set(CLIENT_PLUGIN_MYSQL_CLEAR_PASSWORD OFF) - set(SKIP_TESTS 1) - set(LIBM glibc-compatibility) - add_subdirectory (mariadb-connector-c) + add_subdirectory (mariadb-connector-c-cmake) + target_include_directories(mysqlclient BEFORE PRIVATE ${ZLIB_INCLUDE_DIR}) + if(OPENSSL_INCLUDE_DIR) + target_include_directories(mysqlclient BEFORE PRIVATE ${OPENSSL_INCLUDE_DIR}) + endif() endif () if (USE_INTERNAL_RDKAFKA_LIBRARY) diff --git a/contrib/mariadb-connector-c b/contrib/mariadb-connector-c index 18016300b00..c6503d3acc8 160000 --- a/contrib/mariadb-connector-c +++ b/contrib/mariadb-connector-c @@ -1 +1 @@ -Subproject commit 18016300b00825a3fcbc6fb2aa37ac3e51416f71 +Subproject commit c6503d3acc85ca1a7f5e7e38b605d7c9410aac1e diff --git a/contrib/mariadb-connector-c-cmake/CMakeLists.txt b/contrib/mariadb-connector-c-cmake/CMakeLists.txt new file mode 100644 index 00000000000..2e80b0c325f --- /dev/null +++ b/contrib/mariadb-connector-c-cmake/CMakeLists.txt @@ -0,0 +1,74 @@ +set(MARIADB_CLIENT_SOURCE_DIR ${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c) +set(MARIADB_CLIENT_BINARY_DIR ${ClickHouse_BINARY_DIR}/contrib/mariadb-connector-c) + +set(SRCS +#${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/bmove_upp.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/get_password.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_alloc.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_array.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_charset.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_compress.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_context.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_default.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_dtoa.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_errmsg.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_hash.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_init.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_io.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_list.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_ll2str.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_loaddata.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_net.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_password.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_pvio.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/mariadb_async.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/mariadb_charset.c +#${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/mariadb_dyncol.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/mariadb_lib.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/mariadb_stmt.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_sha1.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_stmt_codec.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_string.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_time.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_tls.c +${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/secure/openssl_crypt.c +#${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/secure/gnutls.c +#${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/secure/ma_schannel.c +#${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/secure/schannel.c +#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/auth_gssapi_client.c +#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/dialog.c 
+#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/gssapi_client.c +#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/gssapi_errmsg.c +${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/mariadb_cleartext.c +${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/my_auth.c +${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/old_password.c +${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/sha256_pw.c +${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/caching_sha2_pw.c +#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/sspi_client.c +#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/sspi_errmsg.c +${MARIADB_CLIENT_SOURCE_DIR}/plugins/connection/aurora.c +${MARIADB_CLIENT_SOURCE_DIR}/plugins/connection/replication.c +#${MARIADB_CLIENT_SOURCE_DIR}/plugins/io/remote_io.c +#${MARIADB_CLIENT_SOURCE_DIR}/plugins/pvio/pvio_npipe.c +#${MARIADB_CLIENT_SOURCE_DIR}/plugins/pvio/pvio_shmem.c +${MARIADB_CLIENT_SOURCE_DIR}/plugins/pvio/pvio_socket.c +#${MARIADB_CLIENT_SOURCE_DIR}/plugins/trace/trace_example.c +${CMAKE_CURRENT_SOURCE_DIR}/linux_x86_64/libmariadb/ma_client_plugin.c +) + +if(OPENSSL_LIBRARIES) + list(APPEND SRCS ${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/secure/openssl.c) +endif() + +add_library(mysqlclient ${SRCS}) + +if(OPENSSL_LIBRARIES) + target_link_libraries(mysqlclient PRIVATE ${OPENSSL_LIBRARIES}) + target_compile_definitions(mysqlclient PRIVATE -D HAVE_OPENSSL -D HAVE_TLS) +endif() + +target_include_directories(mysqlclient PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/linux_x86_64/include) +target_include_directories(mysqlclient PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/common/include) +target_include_directories(mysqlclient PUBLIC ${MARIADB_CLIENT_SOURCE_DIR}/include) + +target_compile_definitions(mysqlclient PRIVATE -D THREAD) diff --git a/contrib/mariadb-connector-c-cmake/common/include/mysql/mysql.h b/contrib/mariadb-connector-c-cmake/common/include/mysql/mysql.h new file mode 100644 index 00000000000..741c7ba03c9 --- /dev/null +++ b/contrib/mariadb-connector-c-cmake/common/include/mysql/mysql.h @@ -0,0 +1 @@ +#include diff --git a/contrib/mariadb-connector-c-cmake/common/include/mysql/mysqld_error.h b/contrib/mariadb-connector-c-cmake/common/include/mysql/mysqld_error.h new file mode 100644 index 00000000000..95d26eef163 --- /dev/null +++ b/contrib/mariadb-connector-c-cmake/common/include/mysql/mysqld_error.h @@ -0,0 +1 @@ +#include diff --git a/contrib/mariadb-connector-c-cmake/linux_x86_64/include/config.h b/contrib/mariadb-connector-c-cmake/linux_x86_64/include/config.h new file mode 100644 index 00000000000..90c42c97df6 --- /dev/null +++ b/contrib/mariadb-connector-c-cmake/linux_x86_64/include/config.h @@ -0,0 +1,269 @@ + +/* + * Include file constants (processed in LibmysqlIncludeFiles.txt 1 + */ +#define HAVE_ALLOCA_H 1 +/* #undef HAVE_BIGENDIAN */ +#define HAVE_SETLOCALE 1 +#define HAVE_NL_LANGINFO 1 +#define HAVE_ARPA_INET_H 1 +#define HAVE_CRYPT_H 1 +#define HAVE_DIRENT_H 1 +#define HAVE_DLFCN_H 1 +#define HAVE_EXECINFO_H 1 +#define HAVE_FCNTL_H 1 +#define HAVE_FENV_H 1 +#define HAVE_FLOAT_H 1 +/* #undef HAVE_FPU_CONTROL_H */ +#define HAVE_GRP_H 1 +/* #undef HAVE_IEEEFP_H */ +#define HAVE_LIMITS_H 1 +#define HAVE_MALLOC_H 1 +#define HAVE_MEMORY_H 1 +#define HAVE_NETINET_IN_H 1 +#define HAVE_PATHS_H 1 +#define HAVE_PWD_H 1 +#define HAVE_SCHED_H 1 +/* #undef HAVE_SELECT_H */ +#define HAVE_STDDEF_H 1 +#define HAVE_STDINT_H 1 +#define HAVE_STDLIB_H 1 +#define HAVE_STRING_H 1 +#define HAVE_STRINGS_H 1 +/* #undef HAVE_SYNCH_H */ +/* #undef HAVE_SYS_FPU_H */ +#define HAVE_SYS_IOCTL_H 1 +#define HAVE_SYS_IPC_H 1 +#define HAVE_SYS_MMAN_H 1 +#define HAVE_SYS_PRCTL_H 1 
+#define HAVE_SYS_SELECT_H 1 +#define HAVE_SYS_SHM_H 1 +#define HAVE_SYS_SOCKET_H 1 +#define HAVE_SYS_STAT_H 1 +/* #undef HAVE_SYS_STREAM_H */ +#define HAVE_SYS_TIMEB_H 1 +#define HAVE_SYS_TYPES_H 1 +#define HAVE_SYS_UN_H 1 +/* #undef HAVE_SYSENT_H */ +#define HAVE_TERMIO_H 1 +#define HAVE_TERMIOS_H 1 +#define HAVE_UNISTD_H 1 +#define HAVE_UTIME_H 1 +#define HAVE_UCONTEXT_H 1 + +/* + * function definitions - processed in LibmysqlFunctions.txt + */ +#define HAVE_ACCESS 1 +/* #undef HAVE_AIOWAIT */ +#define HAVE_ALARM 1 +/* #undef HAVE_ALLOCA */ +#define HAVE_BCMP 1 +/* #undef HAVE_BFILL */ +/* #undef HAVE_BMOVE */ +#define HAVE_BZERO 1 +#define HAVE_CLOCK_GETTIME 1 +/* #undef HAVE_COMPRESS */ +/* #undef HAVE_CRYPT */ +#define HAVE_DLERROR 1 +#define HAVE_DLOPEN 1 +#define HAVE_FCHMOD 1 +#define HAVE_FCNTL 1 +/* #undef HAVE_FCONVERT */ +#define HAVE_FDATASYNC 1 +#define HAVE_FESETROUND 1 +#define HAVE_FINITE 1 +#define HAVE_FSEEKO 1 +#define HAVE_FSYNC 1 +#define HAVE_GETADDRINFO 1 +#define HAVE_GETCWD 1 +#define HAVE_GETHOSTBYADDR_R 1 +#define HAVE_GETHOSTBYNAME_R 1 +/* #undef HAVE_GETHRTIME */ +#define HAVE_GETNAMEINFO 1 +#define HAVE_GETPAGESIZE 1 +#define HAVE_GETPASS 1 +/* #undef HAVE_GETPASSPHRASE */ +#define HAVE_GETPWNAM 1 +#define HAVE_GETPWUID 1 +#define HAVE_GETRLIMIT 1 +#define HAVE_GETRUSAGE 1 +#define HAVE_GETWD 1 +#define HAVE_GMTIME_R 1 +#define HAVE_INITGROUPS 1 +#define HAVE_LDIV 1 +#define HAVE_LOCALTIME_R 1 +#define HAVE_LOG2 1 +#define HAVE_LONGJMP 1 +#define HAVE_LSTAT 1 +#define HAVE_MADVISE 1 +#define HAVE_MALLINFO 1 +#define HAVE_MEMALIGN 1 +#define HAVE_MEMCPY 1 +#define HAVE_MEMMOVE 1 +#define HAVE_MKSTEMP 1 +#define HAVE_MLOCK 1 +#define HAVE_MLOCKALL 1 +#define HAVE_MMAP 1 +#define HAVE_MMAP64 1 +#define HAVE_PERROR 1 +#define HAVE_POLL 1 +#define HAVE_PREAD 1 +/* #undef HAVE_PTHREAD_ATTR_CREATE */ +#define HAVE_PTHREAD_ATTR_GETSTACKSIZE 1 +/* #undef HAVE_PTHREAD_ATTR_SETPRIO */ +#define HAVE_PTHREAD_ATTR_SETSCHEDPARAM 1 +#define HAVE_PTHREAD_ATTR_SETSCOPE 1 +#define HAVE_PTHREAD_ATTR_SETSTACKSIZE 1 +/* #undef HAVE_PTHREAD_CONDATTR_CREATE */ +/* #undef HAVE_PTHREAD_INIT */ +#define HAVE_PTHREAD_KEY_DELETE 1 +#define HAVE_PTHREAD_KILL 1 +#define HAVE_PTHREAD_RWLOCK_RDLOCK 1 +/* #undef HAVE_PTHREAD_SETPRIO_NP */ +#define HAVE_PTHREAD_SETSCHEDPARAM 1 +#define HAVE_PTHREAD_SIGMASK 1 +/* #undef HAVE_PTHREAD_THREADMASK */ +/* #undef HAVE_PTHREAD_YIELD_NP */ +#define HAVE_READDIR_R 1 +#define HAVE_READLINK 1 +#define HAVE_REALPATH 1 +#define HAVE_RENAME 1 +#define HAVE_SCHED_YIELD 1 +#define HAVE_SELECT 1 +/* #undef HAVE_SETFD */ +/* #undef HAVE_SETFILEPOINTER */ +#define HAVE_SIGNAL 1 +#define HAVE_SIGACTION 1 +/* #undef HAVE_SIGTHREADMASK */ +#define HAVE_SIGWAIT 1 +#define HAVE_SLEEP 1 +#define HAVE_SNPRINTF 1 +/* #undef HAVE_SQLITE */ +#define HAVE_STPCPY 1 +#define HAVE_STRERROR 1 +/* #undef HAVE_STRLCPY */ +#define HAVE_STRNLEN 1 +#define HAVE_STRPBRK 1 +#define HAVE_STRSEP 1 +#define HAVE_STRSTR 1 +#define HAVE_STRTOK_R 1 +#define HAVE_STRTOL 1 +#define HAVE_STRTOLL 1 +#define HAVE_STRTOUL 1 +#define HAVE_STRTOULL 1 +/* #undef HAVE_TELL */ +/* #undef HAVE_THR_SETCONCURRENCY */ +/* #undef HAVE_THR_YIELD */ +#define HAVE_VASPRINTF 1 +#define HAVE_VSNPRINTF 1 + +/* + * types and sizes + */ +/* Types we may use */ +#define SIZEOF_CHAR 1 +#if defined(SIZEOF_CHAR) +# define HAVE_CHAR 1 +#endif + +#define SIZEOF_CHARP 8 +#if defined(SIZEOF_CHARP) +# define HAVE_CHARP 1 +#endif + +#define SIZEOF_SHORT 2 +#if defined(SIZEOF_SHORT) +# define HAVE_SHORT 1 +#endif + +#define 
SIZEOF_INT 4 +#if defined(SIZEOF_INT) +# define HAVE_INT 1 +#endif + +#define SIZEOF_LONG 8 +#if defined(SIZEOF_LONG) +# define HAVE_LONG 1 +#endif + +#define SIZEOF_LONG_LONG 8 +#if defined(SIZEOF_LONG_LONG) +# define HAVE_LONG_LONG 1 +#endif + + +#define SIZEOF_SIGSET_T 128 +#if defined(SIZEOF_SIGSET_T) +# define HAVE_SIGSET_T 1 +#endif + +#define SIZEOF_SIZE_T 8 +#if defined(SIZEOF_SIZE_T) +# define HAVE_SIZE_T 1 +#endif + +/* #undef SIZEOF_UCHAR */ +#if defined(SIZEOF_UCHAR) +# define HAVE_UCHAR 1 +#endif + +#define SIZEOF_UINT 4 +#if defined(SIZEOF_UINT) +# define HAVE_UINT 1 +#endif + +#define SIZEOF_ULONG 8 +#if defined(SIZEOF_ULONG) +# define HAVE_ULONG 1 +#endif + +/* #undef SIZEOF_INT8 */ +#if defined(SIZEOF_INT8) +# define HAVE_INT8 1 +#endif +/* #undef SIZEOF_UINT8 */ +#if defined(SIZEOF_UINT8) +# define HAVE_UINT8 1 +#endif + +/* #undef SIZEOF_INT16 */ +#if defined(SIZEOF_INT16) +# define HAVE_INT16 1 +#endif +/* #undef SIZEOF_UINT16 */ +#if defined(SIZEOF_UINT16) +# define HAVE_UINT16 1 +#endif + +/* #undef SIZEOF_INT32 */ +#if defined(SIZEOF_INT32) +# define HAVE_INT32 1 +#endif +/* #undef SIZEOF_UINT32 */ +#if defined(SIZEOF_UINT32) +# define HAVE_UINT32 1 +#endif +/* #undef SIZEOF_U_INT32_T */ +#if defined(SIZEOF_U_INT32_T) +# define HAVE_U_INT32_T 1 +#endif + +/* #undef SIZEOF_INT64 */ +#if defined(SIZEOF_INT64) +# define HAVE_INT64 1 +#endif +/* #undef SIZEOF_UINT64 */ +#if defined(SIZEOF_UINT64) +# define HAVE_UINT64 1 +#endif + +/* #undef SIZEOF_SOCKLEN_T */ +#if defined(SIZEOF_SOCKLEN_T) +# define HAVE_SOCKLEN_T 1 +#endif + +#define SOCKET_SIZE_TYPE socklen_t + +#define MARIADB_DEFAULT_CHARSET "latin1" + diff --git a/contrib/mariadb-connector-c-cmake/linux_x86_64/include/ma_config.h b/contrib/mariadb-connector-c-cmake/linux_x86_64/include/ma_config.h new file mode 100644 index 00000000000..90c42c97df6 --- /dev/null +++ b/contrib/mariadb-connector-c-cmake/linux_x86_64/include/ma_config.h @@ -0,0 +1,269 @@ + +/* + * Include file constants (processed in LibmysqlIncludeFiles.txt 1 + */ +#define HAVE_ALLOCA_H 1 +/* #undef HAVE_BIGENDIAN */ +#define HAVE_SETLOCALE 1 +#define HAVE_NL_LANGINFO 1 +#define HAVE_ARPA_INET_H 1 +#define HAVE_CRYPT_H 1 +#define HAVE_DIRENT_H 1 +#define HAVE_DLFCN_H 1 +#define HAVE_EXECINFO_H 1 +#define HAVE_FCNTL_H 1 +#define HAVE_FENV_H 1 +#define HAVE_FLOAT_H 1 +/* #undef HAVE_FPU_CONTROL_H */ +#define HAVE_GRP_H 1 +/* #undef HAVE_IEEEFP_H */ +#define HAVE_LIMITS_H 1 +#define HAVE_MALLOC_H 1 +#define HAVE_MEMORY_H 1 +#define HAVE_NETINET_IN_H 1 +#define HAVE_PATHS_H 1 +#define HAVE_PWD_H 1 +#define HAVE_SCHED_H 1 +/* #undef HAVE_SELECT_H */ +#define HAVE_STDDEF_H 1 +#define HAVE_STDINT_H 1 +#define HAVE_STDLIB_H 1 +#define HAVE_STRING_H 1 +#define HAVE_STRINGS_H 1 +/* #undef HAVE_SYNCH_H */ +/* #undef HAVE_SYS_FPU_H */ +#define HAVE_SYS_IOCTL_H 1 +#define HAVE_SYS_IPC_H 1 +#define HAVE_SYS_MMAN_H 1 +#define HAVE_SYS_PRCTL_H 1 +#define HAVE_SYS_SELECT_H 1 +#define HAVE_SYS_SHM_H 1 +#define HAVE_SYS_SOCKET_H 1 +#define HAVE_SYS_STAT_H 1 +/* #undef HAVE_SYS_STREAM_H */ +#define HAVE_SYS_TIMEB_H 1 +#define HAVE_SYS_TYPES_H 1 +#define HAVE_SYS_UN_H 1 +/* #undef HAVE_SYSENT_H */ +#define HAVE_TERMIO_H 1 +#define HAVE_TERMIOS_H 1 +#define HAVE_UNISTD_H 1 +#define HAVE_UTIME_H 1 +#define HAVE_UCONTEXT_H 1 + +/* + * function definitions - processed in LibmysqlFunctions.txt + */ +#define HAVE_ACCESS 1 +/* #undef HAVE_AIOWAIT */ +#define HAVE_ALARM 1 +/* #undef HAVE_ALLOCA */ +#define HAVE_BCMP 1 +/* #undef HAVE_BFILL */ +/* #undef HAVE_BMOVE */ +#define 
HAVE_BZERO 1 +#define HAVE_CLOCK_GETTIME 1 +/* #undef HAVE_COMPRESS */ +/* #undef HAVE_CRYPT */ +#define HAVE_DLERROR 1 +#define HAVE_DLOPEN 1 +#define HAVE_FCHMOD 1 +#define HAVE_FCNTL 1 +/* #undef HAVE_FCONVERT */ +#define HAVE_FDATASYNC 1 +#define HAVE_FESETROUND 1 +#define HAVE_FINITE 1 +#define HAVE_FSEEKO 1 +#define HAVE_FSYNC 1 +#define HAVE_GETADDRINFO 1 +#define HAVE_GETCWD 1 +#define HAVE_GETHOSTBYADDR_R 1 +#define HAVE_GETHOSTBYNAME_R 1 +/* #undef HAVE_GETHRTIME */ +#define HAVE_GETNAMEINFO 1 +#define HAVE_GETPAGESIZE 1 +#define HAVE_GETPASS 1 +/* #undef HAVE_GETPASSPHRASE */ +#define HAVE_GETPWNAM 1 +#define HAVE_GETPWUID 1 +#define HAVE_GETRLIMIT 1 +#define HAVE_GETRUSAGE 1 +#define HAVE_GETWD 1 +#define HAVE_GMTIME_R 1 +#define HAVE_INITGROUPS 1 +#define HAVE_LDIV 1 +#define HAVE_LOCALTIME_R 1 +#define HAVE_LOG2 1 +#define HAVE_LONGJMP 1 +#define HAVE_LSTAT 1 +#define HAVE_MADVISE 1 +#define HAVE_MALLINFO 1 +#define HAVE_MEMALIGN 1 +#define HAVE_MEMCPY 1 +#define HAVE_MEMMOVE 1 +#define HAVE_MKSTEMP 1 +#define HAVE_MLOCK 1 +#define HAVE_MLOCKALL 1 +#define HAVE_MMAP 1 +#define HAVE_MMAP64 1 +#define HAVE_PERROR 1 +#define HAVE_POLL 1 +#define HAVE_PREAD 1 +/* #undef HAVE_PTHREAD_ATTR_CREATE */ +#define HAVE_PTHREAD_ATTR_GETSTACKSIZE 1 +/* #undef HAVE_PTHREAD_ATTR_SETPRIO */ +#define HAVE_PTHREAD_ATTR_SETSCHEDPARAM 1 +#define HAVE_PTHREAD_ATTR_SETSCOPE 1 +#define HAVE_PTHREAD_ATTR_SETSTACKSIZE 1 +/* #undef HAVE_PTHREAD_CONDATTR_CREATE */ +/* #undef HAVE_PTHREAD_INIT */ +#define HAVE_PTHREAD_KEY_DELETE 1 +#define HAVE_PTHREAD_KILL 1 +#define HAVE_PTHREAD_RWLOCK_RDLOCK 1 +/* #undef HAVE_PTHREAD_SETPRIO_NP */ +#define HAVE_PTHREAD_SETSCHEDPARAM 1 +#define HAVE_PTHREAD_SIGMASK 1 +/* #undef HAVE_PTHREAD_THREADMASK */ +/* #undef HAVE_PTHREAD_YIELD_NP */ +#define HAVE_READDIR_R 1 +#define HAVE_READLINK 1 +#define HAVE_REALPATH 1 +#define HAVE_RENAME 1 +#define HAVE_SCHED_YIELD 1 +#define HAVE_SELECT 1 +/* #undef HAVE_SETFD */ +/* #undef HAVE_SETFILEPOINTER */ +#define HAVE_SIGNAL 1 +#define HAVE_SIGACTION 1 +/* #undef HAVE_SIGTHREADMASK */ +#define HAVE_SIGWAIT 1 +#define HAVE_SLEEP 1 +#define HAVE_SNPRINTF 1 +/* #undef HAVE_SQLITE */ +#define HAVE_STPCPY 1 +#define HAVE_STRERROR 1 +/* #undef HAVE_STRLCPY */ +#define HAVE_STRNLEN 1 +#define HAVE_STRPBRK 1 +#define HAVE_STRSEP 1 +#define HAVE_STRSTR 1 +#define HAVE_STRTOK_R 1 +#define HAVE_STRTOL 1 +#define HAVE_STRTOLL 1 +#define HAVE_STRTOUL 1 +#define HAVE_STRTOULL 1 +/* #undef HAVE_TELL */ +/* #undef HAVE_THR_SETCONCURRENCY */ +/* #undef HAVE_THR_YIELD */ +#define HAVE_VASPRINTF 1 +#define HAVE_VSNPRINTF 1 + +/* + * types and sizes + */ +/* Types we may use */ +#define SIZEOF_CHAR 1 +#if defined(SIZEOF_CHAR) +# define HAVE_CHAR 1 +#endif + +#define SIZEOF_CHARP 8 +#if defined(SIZEOF_CHARP) +# define HAVE_CHARP 1 +#endif + +#define SIZEOF_SHORT 2 +#if defined(SIZEOF_SHORT) +# define HAVE_SHORT 1 +#endif + +#define SIZEOF_INT 4 +#if defined(SIZEOF_INT) +# define HAVE_INT 1 +#endif + +#define SIZEOF_LONG 8 +#if defined(SIZEOF_LONG) +# define HAVE_LONG 1 +#endif + +#define SIZEOF_LONG_LONG 8 +#if defined(SIZEOF_LONG_LONG) +# define HAVE_LONG_LONG 1 +#endif + + +#define SIZEOF_SIGSET_T 128 +#if defined(SIZEOF_SIGSET_T) +# define HAVE_SIGSET_T 1 +#endif + +#define SIZEOF_SIZE_T 8 +#if defined(SIZEOF_SIZE_T) +# define HAVE_SIZE_T 1 +#endif + +/* #undef SIZEOF_UCHAR */ +#if defined(SIZEOF_UCHAR) +# define HAVE_UCHAR 1 +#endif + +#define SIZEOF_UINT 4 +#if defined(SIZEOF_UINT) +# define HAVE_UINT 1 +#endif + +#define SIZEOF_ULONG 8 +#if 
defined(SIZEOF_ULONG) +# define HAVE_ULONG 1 +#endif + +/* #undef SIZEOF_INT8 */ +#if defined(SIZEOF_INT8) +# define HAVE_INT8 1 +#endif +/* #undef SIZEOF_UINT8 */ +#if defined(SIZEOF_UINT8) +# define HAVE_UINT8 1 +#endif + +/* #undef SIZEOF_INT16 */ +#if defined(SIZEOF_INT16) +# define HAVE_INT16 1 +#endif +/* #undef SIZEOF_UINT16 */ +#if defined(SIZEOF_UINT16) +# define HAVE_UINT16 1 +#endif + +/* #undef SIZEOF_INT32 */ +#if defined(SIZEOF_INT32) +# define HAVE_INT32 1 +#endif +/* #undef SIZEOF_UINT32 */ +#if defined(SIZEOF_UINT32) +# define HAVE_UINT32 1 +#endif +/* #undef SIZEOF_U_INT32_T */ +#if defined(SIZEOF_U_INT32_T) +# define HAVE_U_INT32_T 1 +#endif + +/* #undef SIZEOF_INT64 */ +#if defined(SIZEOF_INT64) +# define HAVE_INT64 1 +#endif +/* #undef SIZEOF_UINT64 */ +#if defined(SIZEOF_UINT64) +# define HAVE_UINT64 1 +#endif + +/* #undef SIZEOF_SOCKLEN_T */ +#if defined(SIZEOF_SOCKLEN_T) +# define HAVE_SOCKLEN_T 1 +#endif + +#define SOCKET_SIZE_TYPE socklen_t + +#define MARIADB_DEFAULT_CHARSET "latin1" + diff --git a/contrib/mariadb-connector-c-cmake/linux_x86_64/include/mariadb_version.h b/contrib/mariadb-connector-c-cmake/linux_x86_64/include/mariadb_version.h new file mode 100644 index 00000000000..821a7f8add2 --- /dev/null +++ b/contrib/mariadb-connector-c-cmake/linux_x86_64/include/mariadb_version.h @@ -0,0 +1,36 @@ +/* Copyright Abandoned 1996, 1999, 2001 MySQL AB + This file is public domain and comes with NO WARRANTY of any kind */ + +/* Version numbers for protocol & mysqld */ + +#ifndef _mariadb_version_h_ +#define _mariadb_version_h_ + +#ifdef _CUSTOMCONFIG_ +#include +#else +#define PROTOCOL_VERSION 10 +#define MARIADB_CLIENT_VERSION_STR "10.3.6" +#define MARIADB_BASE_VERSION "mariadb-10.3" +#define MARIADB_VERSION_ID 100306 +#define MYSQL_VERSION_ID 100306 +#define MARIADB_PORT 3306 +#define MARIADB_UNIX_ADDR "/var/run/mysqld/mysqld.sock" +#define MYSQL_CONFIG_NAME "my" + +#define MARIADB_PACKAGE_VERSION "3.0.6" +#define MARIADB_PACKAGE_VERSION_ID 30006 +#define MARIADB_SYSTEM_TYPE "Linux" +#define MARIADB_MACHINE_TYPE "x86_64" +#define MARIADB_PLUGINDIR "lib/mariadb/plugin" + +/* mysqld compile time options */ +#ifndef MYSQL_CHARSET +#define MYSQL_CHARSET "" +#endif +#endif + +/* Source information */ +#define CC_SOURCE_REVISION "a0fd36cc5a5313414a5a2ebe9322577a29b4782a" + +#endif /* _mariadb_version_h_ */ diff --git a/contrib/mariadb-connector-c-cmake/linux_x86_64/libmariadb/ma_client_plugin.c b/contrib/mariadb-connector-c-cmake/linux_x86_64/libmariadb/ma_client_plugin.c new file mode 100644 index 00000000000..434a4b3f4c3 --- /dev/null +++ b/contrib/mariadb-connector-c-cmake/linux_x86_64/libmariadb/ma_client_plugin.c @@ -0,0 +1,502 @@ +/* Copyright (C) 2010 - 2012 Sergei Golubchik and Monty Program Ab + 2015-2016 MariaDB Corporation AB + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. 
+ + You should have received a copy of the GNU Library General Public + License along with this library; if not see + or write to the Free Software Foundation, Inc., + 51 Franklin St., Fifth Floor, Boston, MA 02110, USA */ + +/** + @file + + Support code for the client side (libmariadb) plugins + + Client plugins are somewhat different from server plugins, they are simpler. + + They do not need to be installed or in any way explicitly loaded on the + client, they are loaded automatically on demand. + One client plugin per shared object, soname *must* match the plugin name. + + There is no reference counting and no unloading either. +*/ + +#if _MSC_VER +/* Silence warnings about variable 'unused' being used. */ +#define FORCE_INIT_OF_VARS 1 +#endif + +#include +#include +#include +#include +#include + +#include "errmsg.h" +#include + +struct st_client_plugin_int { + struct st_client_plugin_int *next; + void *dlhandle; + struct st_mysql_client_plugin *plugin; +}; + +static my_bool initialized= 0; +static MA_MEM_ROOT mem_root; + +static uint valid_plugins[][2]= { + {MYSQL_CLIENT_AUTHENTICATION_PLUGIN, MYSQL_CLIENT_AUTHENTICATION_PLUGIN_INTERFACE_VERSION}, + {MARIADB_CLIENT_PVIO_PLUGIN, MARIADB_CLIENT_PVIO_PLUGIN_INTERFACE_VERSION}, + {MARIADB_CLIENT_TRACE_PLUGIN, MARIADB_CLIENT_TRACE_PLUGIN_INTERFACE_VERSION}, + {MARIADB_CLIENT_CONNECTION_PLUGIN, MARIADB_CLIENT_CONNECTION_PLUGIN_INTERFACE_VERSION}, + {0, 0} +}; + +/* + Loaded plugins are stored in a linked list. + The list is append-only, the elements are added to the head (like in a stack). + The elements are added under a mutex, but the list can be read and traversed + without any mutex because once an element is added to the list, it stays + there. The main purpose of a mutex is to prevent two threads from + loading the same plugin twice in parallel. 
+*/ + + +struct st_client_plugin_int *plugin_list[MYSQL_CLIENT_MAX_PLUGINS + MARIADB_CLIENT_MAX_PLUGINS]; +#ifdef THREAD +static pthread_mutex_t LOCK_load_client_plugin; +#endif + +extern struct st_mysql_client_plugin mysql_native_password_client_plugin; +extern struct st_mysql_client_plugin mysql_old_password_client_plugin; +extern struct st_mysql_client_plugin pvio_socket_client_plugin; +extern struct st_mysql_client_plugin sha256_password_client_plugin; +extern struct st_mysql_client_plugin caching_sha2_password_client_plugin; + + +struct st_mysql_client_plugin *mysql_client_builtins[]= +{ + (struct st_mysql_client_plugin *)&mysql_native_password_client_plugin, + (struct st_mysql_client_plugin *)&mysql_old_password_client_plugin, + (struct st_mysql_client_plugin *)&pvio_socket_client_plugin, + (struct st_mysql_client_plugin *)&sha256_password_client_plugin, + (struct st_mysql_client_plugin *)&caching_sha2_password_client_plugin, + 0 +}; + + +static int is_not_initialized(MYSQL *mysql, const char *name) +{ + if (initialized) + return 0; + + my_set_error(mysql, CR_AUTH_PLUGIN_CANNOT_LOAD, + SQLSTATE_UNKNOWN, ER(CR_AUTH_PLUGIN_CANNOT_LOAD), + name, "not initialized"); + return 1; +} + +static int get_plugin_nr(uint type) +{ + uint i= 0; + for(; valid_plugins[i][1]; i++) + if (valid_plugins[i][0] == type) + return i; + return -1; +} + +static const char *check_plugin_version(struct st_mysql_client_plugin *plugin, unsigned int version) +{ + if (plugin->interface_version < version || + (plugin->interface_version >> 8) > (version >> 8)) + return "Incompatible client plugin interface"; + return 0; +} + +/** + finds a plugin in the list + + @param name plugin name to search for + @param type plugin type + + @note this does NOT necessarily need a mutex, take care! 
+ + @retval a pointer to a found plugin or 0 +*/ +static struct st_mysql_client_plugin *find_plugin(const char *name, int type) +{ + struct st_client_plugin_int *p; + int plugin_nr= get_plugin_nr(type); + + DBUG_ASSERT(initialized); + if (plugin_nr == -1) + return 0; + + if (!name) + return plugin_list[plugin_nr]->plugin; + + for (p= plugin_list[plugin_nr]; p; p= p->next) + { + if (strcmp(p->plugin->name, name) == 0) + return p->plugin; + } + return NULL; +} + + +/** + verifies the plugin and adds it to the list + + @param mysql MYSQL structure (for error reporting) + @param plugin plugin to install + @param dlhandle a handle to the shared object (returned by dlopen) + or 0 if the plugin was not dynamically loaded + @param argc number of arguments in the 'va_list args' + @param args arguments passed to the plugin initialization function + + @retval a pointer to an installed plugin or 0 +*/ + +static struct st_mysql_client_plugin * +add_plugin(MYSQL *mysql, struct st_mysql_client_plugin *plugin, void *dlhandle, + int argc, va_list args) +{ + const char *errmsg; + struct st_client_plugin_int plugin_int, *p; + char errbuf[1024]; + int plugin_nr; + + DBUG_ASSERT(initialized); + + plugin_int.plugin= plugin; + plugin_int.dlhandle= dlhandle; + + if ((plugin_nr= get_plugin_nr(plugin->type)) == -1) + { + errmsg= "Unknown client plugin type"; + goto err1; + } + if ((errmsg= check_plugin_version(plugin, valid_plugins[plugin_nr][1]))) + goto err1; + + /* Call the plugin initialization function, if any */ + if (plugin->init && plugin->init(errbuf, sizeof(errbuf), argc, args)) + { + errmsg= errbuf; + goto err1; + } + + p= (struct st_client_plugin_int *) + ma_memdup_root(&mem_root, (char *)&plugin_int, sizeof(plugin_int)); + + if (!p) + { + errmsg= "Out of memory"; + goto err2; + } + +#ifdef THREAD + safe_mutex_assert_owner(&LOCK_load_client_plugin); +#endif + + p->next= plugin_list[plugin_nr]; + plugin_list[plugin_nr]= p; + + return plugin; + +err2: + if (plugin->deinit) + plugin->deinit(); +err1: + my_set_error(mysql, CR_AUTH_PLUGIN_CANNOT_LOAD, SQLSTATE_UNKNOWN, + ER(CR_AUTH_PLUGIN_CANNOT_LOAD), plugin->name, errmsg); + if (dlhandle) + (void)dlclose(dlhandle); + return NULL; +} + + +/** + Loads plugins which are specified in the environment variable + LIBMYSQL_PLUGINS. + + Multiple plugins must be separated by semicolon. This function doesn't + return or log an error. + + The function is be called by mysql_client_plugin_init + + @todo + Support extended syntax, passing parameters to plugins, for example + LIBMYSQL_PLUGINS="plugin1(param1,param2);plugin2;..." + or + LIBMYSQL_PLUGINS="plugin1=int:param1,str:param2;plugin2;..." +*/ + +static void load_env_plugins(MYSQL *mysql) +{ + char *plugs, *free_env, *s= getenv("LIBMYSQL_PLUGINS"); + + if (ma_check_env_str(s)) + return; + + free_env= strdup(s); + plugs= s= free_env; + + do { + if ((s= strchr(plugs, ';'))) + *s= '\0'; + mysql_load_plugin(mysql, plugs, -1, 0); + plugs= s + 1; + } while (s); + + free(free_env); +} + +/********** extern functions to be used by libmariadb *********************/ + +/** + Initializes the client plugin layer. + + This function must be called before any other client plugin function. 
+ + @retval 0 successful + @retval != 0 error occurred +*/ + +int mysql_client_plugin_init() +{ + MYSQL mysql; + struct st_mysql_client_plugin **builtin; + va_list unused; + LINT_INIT_STRUCT(unused); + + if (initialized) + return 0; + + memset(&mysql, 0, sizeof(mysql)); /* dummy mysql for set_mysql_extended_error */ + + pthread_mutex_init(&LOCK_load_client_plugin, MY_MUTEX_INIT_SLOW); + ma_init_alloc_root(&mem_root, 128, 128); + + memset(&plugin_list, 0, sizeof(plugin_list)); + + initialized= 1; + + pthread_mutex_lock(&LOCK_load_client_plugin); + for (builtin= mysql_client_builtins; *builtin; builtin++) + add_plugin(&mysql, *builtin, 0, 0, unused); + + pthread_mutex_unlock(&LOCK_load_client_plugin); + + load_env_plugins(&mysql); + + return 0; +} + + +/** + Deinitializes the client plugin layer. + + Unloades all client plugins and frees any associated resources. +*/ + +void mysql_client_plugin_deinit() +{ + int i; + struct st_client_plugin_int *p; + + if (!initialized) + return; + + for (i=0; i < MYSQL_CLIENT_MAX_PLUGINS; i++) + for (p= plugin_list[i]; p; p= p->next) + { + if (p->plugin->deinit) + p->plugin->deinit(); + if (p->dlhandle) + (void)dlclose(p->dlhandle); + } + + memset(&plugin_list, 0, sizeof(plugin_list)); + initialized= 0; + ma_free_root(&mem_root, MYF(0)); + pthread_mutex_destroy(&LOCK_load_client_plugin); +} + +/************* public facing functions, for client consumption *********/ + +/* see for a full description */ +struct st_mysql_client_plugin * STDCALL +mysql_client_register_plugin(MYSQL *mysql, + struct st_mysql_client_plugin *plugin) +{ + va_list unused; + LINT_INIT_STRUCT(unused); + + if (is_not_initialized(mysql, plugin->name)) + return NULL; + + pthread_mutex_lock(&LOCK_load_client_plugin); + + /* make sure the plugin wasn't loaded meanwhile */ + if (find_plugin(plugin->name, plugin->type)) + { + my_set_error(mysql, CR_AUTH_PLUGIN_CANNOT_LOAD, + SQLSTATE_UNKNOWN, ER(CR_AUTH_PLUGIN_CANNOT_LOAD), + plugin->name, "it is already loaded"); + plugin= NULL; + } + else + plugin= add_plugin(mysql, plugin, 0, 0, unused); + + pthread_mutex_unlock(&LOCK_load_client_plugin); + return plugin; +} + + +/* see for a full description */ +struct st_mysql_client_plugin * STDCALL +mysql_load_plugin_v(MYSQL *mysql, const char *name, int type, + int argc, va_list args) +{ + const char *errmsg; +#ifdef _WIN32 + char errbuf[1024]; +#endif + char dlpath[FN_REFLEN+1]; + void *sym, *dlhandle = NULL; + struct st_mysql_client_plugin *plugin; + char *env_plugin_dir= getenv("MARIADB_PLUGIN_DIR"); + + CLEAR_CLIENT_ERROR(mysql); + if (is_not_initialized(mysql, name)) + return NULL; + + pthread_mutex_lock(&LOCK_load_client_plugin); + + /* make sure the plugin wasn't loaded meanwhile */ + if (type >= 0 && find_plugin(name, type)) + { + errmsg= "it is already loaded"; + goto err; + } + + /* Compile dll path */ + snprintf(dlpath, sizeof(dlpath) - 1, "%s/%s%s", + mysql->options.extension && mysql->options.extension->plugin_dir ? + mysql->options.extension->plugin_dir : (env_plugin_dir) ? 
env_plugin_dir : + MARIADB_PLUGINDIR, name, SO_EXT); + + /* Open new dll handle */ + if (!(dlhandle= dlopen((const char *)dlpath, RTLD_NOW))) + { +#ifdef _WIN32 + char winmsg[255]; + size_t len; + winmsg[0] = 0; + FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, + NULL, + GetLastError(), + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + winmsg, 255, NULL); + len= strlen(winmsg); + while (len > 0 && (winmsg[len - 1] == '\n' || winmsg[len - 1] == '\r')) + len--; + if (len) + winmsg[len] = 0; + snprintf(errbuf, sizeof(errbuf), "%s Library path is '%s'", winmsg, dlpath); + errmsg= errbuf; +#else + errmsg= dlerror(); +#endif + goto err; + } + + + if (!(sym= dlsym(dlhandle, plugin_declarations_sym))) + { + errmsg= "not a plugin"; + (void)dlclose(dlhandle); + goto err; + } + + plugin= (struct st_mysql_client_plugin*)sym; + + if (type >=0 && type != plugin->type) + { + errmsg= "type mismatch"; + goto err; + } + + if (strcmp(name, plugin->name)) + { + errmsg= "name mismatch"; + goto err; + } + + if (type < 0 && find_plugin(name, plugin->type)) + { + errmsg= "it is already loaded"; + goto err; + } + + plugin= add_plugin(mysql, plugin, dlhandle, argc, args); + + pthread_mutex_unlock(&LOCK_load_client_plugin); + + return plugin; + +err: + if (dlhandle) + dlclose(dlhandle); + pthread_mutex_unlock(&LOCK_load_client_plugin); + my_set_error(mysql, CR_AUTH_PLUGIN_CANNOT_LOAD, SQLSTATE_UNKNOWN, + ER(CR_AUTH_PLUGIN_CANNOT_LOAD), name, errmsg); + return NULL; +} + + +/* see for a full description */ +struct st_mysql_client_plugin * STDCALL +mysql_load_plugin(MYSQL *mysql, const char *name, int type, int argc, ...) +{ + struct st_mysql_client_plugin *p; + va_list args; + va_start(args, argc); + p= mysql_load_plugin_v(mysql, name, type, argc, args); + va_end(args); + return p; +} + +/* see for a full description */ +struct st_mysql_client_plugin * STDCALL +mysql_client_find_plugin(MYSQL *mysql, const char *name, int type) +{ + struct st_mysql_client_plugin *p; + int plugin_nr= get_plugin_nr(type); + + if (is_not_initialized(mysql, name)) + return NULL; + + if (plugin_nr == -1) + { + my_set_error(mysql, CR_AUTH_PLUGIN_CANNOT_LOAD, SQLSTATE_UNKNOWN, + ER(CR_AUTH_PLUGIN_CANNOT_LOAD), name, "invalid type"); + } + + if ((p= find_plugin(name, type))) + return p; + + /* not found, load it */ + return mysql_load_plugin(mysql, name, type, 0); +} + diff --git a/libs/libmysqlxx/CMakeLists.txt b/libs/libmysqlxx/CMakeLists.txt index 2d2ad75628d..263a031d7b0 100644 --- a/libs/libmysqlxx/CMakeLists.txt +++ b/libs/libmysqlxx/CMakeLists.txt @@ -29,8 +29,7 @@ add_library (mysqlxx target_include_directories (mysqlxx PUBLIC include) if (USE_INTERNAL_MYSQL_LIBRARY) - target_include_directories (mysqlxx PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c/include) - target_include_directories (mysqlxx PUBLIC ${ClickHouse_BINARY_DIR}/contrib/mariadb-connector-c/include) + else () set(PLATFORM_LIBRARIES ${CMAKE_DL_LIBS}) diff --git a/libs/libmysqlxx/cmake/find_mysqlclient.cmake b/libs/libmysqlxx/cmake/find_mysqlclient.cmake index e07ebe2304d..98b42a0a9b4 100644 --- a/libs/libmysqlxx/cmake/find_mysqlclient.cmake +++ b/libs/libmysqlxx/cmake/find_mysqlclient.cmake @@ -6,14 +6,14 @@ if(ENABLE_MYSQL) option(USE_INTERNAL_MYSQL_LIBRARY "Set to FALSE to use system mysqlclient library instead of bundled" OFF) endif() - if(USE_INTERNAL_MYSQL_LIBRARY AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c/README") + if(USE_INTERNAL_MYSQL_LIBRARY AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c/README.md") 
message(WARNING "submodule contrib/mariadb-connector-c is missing. to fix try run: \n git submodule update --init --recursive") set(USE_INTERNAL_MYSQL_LIBRARY 0) endif() if (USE_INTERNAL_MYSQL_LIBRARY) - set (MYSQLCLIENT_LIBRARIES mariadbclient) + set (MYSQLCLIENT_LIBRARIES mysqlclient) set (USE_MYSQL 1) set (MYSQLXX_LIBRARY mysqlxx) else () diff --git a/libs/libmysqlxx/src/Connection.cpp b/libs/libmysqlxx/src/Connection.cpp index 7ba14c9baba..0e7d7bd5d3e 100644 --- a/libs/libmysqlxx/src/Connection.cpp +++ b/libs/libmysqlxx/src/Connection.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/Exception.cpp b/libs/libmysqlxx/src/Exception.cpp index b065d17ed51..dadd37e29e7 100644 --- a/libs/libmysqlxx/src/Exception.cpp +++ b/libs/libmysqlxx/src/Exception.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/Pool.cpp b/libs/libmysqlxx/src/Pool.cpp index 410ac062039..a17246e5d6d 100644 --- a/libs/libmysqlxx/src/Pool.cpp +++ b/libs/libmysqlxx/src/Pool.cpp @@ -1,6 +1,6 @@ -#if __has_include() -#include -#include +#if __has_include() +#include +#include #else #include #include diff --git a/libs/libmysqlxx/src/Query.cpp b/libs/libmysqlxx/src/Query.cpp index dc5c3274641..6f275c918a5 100644 --- a/libs/libmysqlxx/src/Query.cpp +++ b/libs/libmysqlxx/src/Query.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/ResultBase.cpp b/libs/libmysqlxx/src/ResultBase.cpp index eac1e22ca3d..b03f92e38f2 100644 --- a/libs/libmysqlxx/src/ResultBase.cpp +++ b/libs/libmysqlxx/src/ResultBase.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/Row.cpp b/libs/libmysqlxx/src/Row.cpp index aecec46e519..e4baa681d69 100644 --- a/libs/libmysqlxx/src/Row.cpp +++ b/libs/libmysqlxx/src/Row.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/StoreQueryResult.cpp b/libs/libmysqlxx/src/StoreQueryResult.cpp index a09986a3014..05ad4299e17 100644 --- a/libs/libmysqlxx/src/StoreQueryResult.cpp +++ b/libs/libmysqlxx/src/StoreQueryResult.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/UseQueryResult.cpp b/libs/libmysqlxx/src/UseQueryResult.cpp index 19daca90b15..c5c52ffcb9c 100644 --- a/libs/libmysqlxx/src/UseQueryResult.cpp +++ b/libs/libmysqlxx/src/UseQueryResult.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif From 38f65a6a2120d2e76bcf71131068f41195149dfc Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 24 Sep 2019 11:32:01 +0300 Subject: [PATCH 235/309] Fix smoke test image --- docker/test/split_build_smoke_test/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/split_build_smoke_test/run.sh b/docker/test/split_build_smoke_test/run.sh index 2772ead5dfe..63cd5fada5b 100755 --- a/docker/test/split_build_smoke_test/run.sh +++ b/docker/test/split_build_smoke_test/run.sh @@ -10,7 +10,7 @@ install_and_run_server() { } run_client() { - LD_LIBRARY_PATH=/unpacked /unpacked/clickhouse-client --query \"select 'OK'\" 2>/var/log/clickhouse-server/clientstderr.log || echo 'FAIL' + LD_LIBRARY_PATH=/unpacked /unpacked/clickhouse-client --query "select 'OK'" 
2>/var/log/clickhouse-server/clientstderr.log || echo "FAIL" } install_and_run_server From c051f423842b316fc487fca4d90c950c60ea85e8 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Tue, 24 Sep 2019 10:58:42 +0000 Subject: [PATCH 236/309] Fixes. --- dbms/src/TableFunctions/TableFunctionS3.cpp | 3 ++- dbms/tests/integration/test_storage_s3/test.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/dbms/src/TableFunctions/TableFunctionS3.cpp b/dbms/src/TableFunctions/TableFunctionS3.cpp index 31a66a91af2..849836b0498 100644 --- a/dbms/src/TableFunctions/TableFunctionS3.cpp +++ b/dbms/src/TableFunctions/TableFunctionS3.cpp @@ -10,7 +10,8 @@ StoragePtr TableFunctionS3::getStorage( const String & source, const String & format, const ColumnsDescription & columns, Context & global_context, const std::string & table_name) const { Poco::URI uri(source); - return StorageS3::create(uri, getDatabaseName(), table_name, format, columns, ConstraintsDescription{}, global_context); + UInt64 min_upload_part_size = global_context.getSettingsRef().s3_min_upload_part_size; + return StorageS3::create(uri, getDatabaseName(), table_name, format, min_upload_part_size, columns, ConstraintsDescription{}, global_context); } void registerTableFunctionS3(TableFunctionFactory & factory) diff --git a/dbms/tests/integration/test_storage_s3/test.py b/dbms/tests/integration/test_storage_s3/test.py index 14ad78d4a4a..c5e7d2a7cf1 100644 --- a/dbms/tests/integration/test_storage_s3/test.py +++ b/dbms/tests/integration/test_storage_s3/test.py @@ -64,9 +64,9 @@ def started_cluster(): cluster.shutdown() -def run_query(instance, query, stdin=None): +def run_query(instance, query, stdin=None, settings=None): logging.info("Running query '{}'...".format(query)) - result = instance.query(query, stdin=stdin) + result = instance.query(query, stdin=stdin, settings=settings) logging.info("Query finished") return result @@ -150,8 +150,8 @@ def test_multipart_put(started_cluster): put_communication_data(started_cluster, "=== Multipart test ===") long_data = [[i, i+1, i+2] for i in range(100000)] long_values = "".join([ "{},{},{}\n".format(x,y,z) for x, y, z in long_data ]) - put_query = "set s3_min_upload_part_size = 1000000; insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') format CSV".format(started_cluster.mock_host, started_cluster.multipart_preserving_data_port, started_cluster.bucket, format) - run_query(instance, put_query, stdin=long_values) + put_query = "insert into table function s3('http://{}:{}/{}/test.csv', 'CSV', '{}') format CSV".format(started_cluster.mock_host, started_cluster.multipart_preserving_data_port, started_cluster.bucket, format) + run_query(instance, put_query, stdin=long_values, settings={'s3_min_upload_part_size': 1000000}) data = get_communication_data(started_cluster) assert "multipart_received_data" in data received_data = data["multipart_received_data"] From ec86a9b9335e2008b1f8753aa826fde87075145b Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 24 Sep 2019 14:46:58 +0300 Subject: [PATCH 237/309] Auto version update to [19.15.1.1398] [54426] --- dbms/cmake/version.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dbms/cmake/version.cmake b/dbms/cmake/version.cmake index a2decdff605..086e835146a 100644 --- a/dbms/cmake/version.cmake +++ b/dbms/cmake/version.cmake @@ -3,9 +3,9 @@ set(VERSION_REVISION 54426) set(VERSION_MAJOR 19) set(VERSION_MINOR 15) set(VERSION_PATCH 1) -set(VERSION_GITHASH 
6f1a8c37abe6ee4e7ee74c0b5cb9c05a87417b61) -set(VERSION_DESCRIBE v19.15.1.1-prestable) -set(VERSION_STRING 19.15.1.1) +set(VERSION_GITHASH 38f65a6a2120d2e76bcf71131068f41195149dfc) +set(VERSION_DESCRIBE v19.15.1.1398-prestable) +set(VERSION_STRING 19.15.1.1398) # end of autochange set(VERSION_EXTRA "" CACHE STRING "") From 06e1a9d78e63294ed90158c5c36d86a2918bf456 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 24 Sep 2019 14:47:32 +0300 Subject: [PATCH 238/309] Auto version update to [19.16.1.1] [54427] --- dbms/cmake/version.cmake | 8 ++-- .../StorageSystemContributors.generated.cpp | 38 +++++++++++++++++++ debian/changelog | 4 +- docker/client/Dockerfile | 2 +- docker/server/Dockerfile | 2 +- docker/test/Dockerfile | 2 +- 6 files changed, 47 insertions(+), 9 deletions(-) diff --git a/dbms/cmake/version.cmake b/dbms/cmake/version.cmake index 086e835146a..8dcdcf64a7a 100644 --- a/dbms/cmake/version.cmake +++ b/dbms/cmake/version.cmake @@ -1,11 +1,11 @@ # This strings autochanged from release_lib.sh: -set(VERSION_REVISION 54426) +set(VERSION_REVISION 54427) set(VERSION_MAJOR 19) -set(VERSION_MINOR 15) +set(VERSION_MINOR 16) set(VERSION_PATCH 1) set(VERSION_GITHASH 38f65a6a2120d2e76bcf71131068f41195149dfc) -set(VERSION_DESCRIBE v19.15.1.1398-prestable) -set(VERSION_STRING 19.15.1.1398) +set(VERSION_DESCRIBE v19.16.1.1-prestable) +set(VERSION_STRING 19.16.1.1) # end of autochange set(VERSION_EXTRA "" CACHE STRING "") diff --git a/dbms/src/Storages/System/StorageSystemContributors.generated.cpp b/dbms/src/Storages/System/StorageSystemContributors.generated.cpp index debd1fe2dc6..3822b648842 100644 --- a/dbms/src/Storages/System/StorageSystemContributors.generated.cpp +++ b/dbms/src/Storages/System/StorageSystemContributors.generated.cpp @@ -2,10 +2,12 @@ const char * auto_contributors[] { "0xflotus", "821008736@qq.com", + "Akazz", "Alberto", "Aleksandra (Ася)", "Alex Bocharov", "Alex Krash", + "Alex Ryndin", "Alex Zatelepin", "Alexander Avdonkin", "Alexander Ermolaev", @@ -18,9 +20,11 @@ const char * auto_contributors[] { "Alexander Lukin", "Alexander Makarov", "Alexander Marshalov", + "Alexander Mezhov", "Alexander Millin", "Alexander Mochalin", "Alexander Prudaev", + "Alexander Rodin", "Alexander Sapin", "Alexander Tokmakov", "Alexander Tretiakov", @@ -69,6 +73,7 @@ const char * auto_contributors[] { "Bakhtiyor Ruziev", "BanyRule", "BayoNet", + "Big Elephant", "BlahGeek", "Bogdan", "Bogdan Voronin", @@ -96,6 +101,8 @@ const char * auto_contributors[] { "Dmitry Petukhov", "Dmitry Rubashkin", "Dmitry S..ky / skype: dvska-at-skype", + "Doge", + "Eldar Zaitov", "Elghazal Ahmed", "Emmanuel Donin de Rosière", "Eric", @@ -105,9 +112,11 @@ const char * auto_contributors[] { "Evgeniy Gatov", "Evgeniy Udodov", "Evgeny Konkov", + "Fabian Stäber", "Fadi Hadzh", "FeehanG", "Flowyi", + "Francisco Barón", "Fruit of Eden", "Gary Dotzler", "George", @@ -121,7 +130,10 @@ const char * auto_contributors[] { "Hiroaki Nakamura", "Igor", "Igor Hatarist", + "Igor Mineev", "Igor Strykhar", + "Igr", + "Igr Mineev", "Ildar Musin", "Ildus Kurbangaliev", "Ilya", @@ -235,6 +247,8 @@ const char * auto_contributors[] { "Pawel Rog", "Persiyanov Dmitriy Andreevich", "Quid37", + "Rafael David Tinoco", + "Ramazan Polat", "Ravengg", "Reto Kromer", "Roman Lipovsky", @@ -246,6 +260,7 @@ const char * auto_contributors[] { "SaltTan", "Samuel Chou", "Serge Rider", + "Sergei Bocharov", "Sergei Semin", "Sergei Tsetlin (rekub)", "Sergey Elantsev", @@ -263,6 +278,7 @@ const char * auto_contributors[] { "Stanislav Pavlovichev", 
"Stas Pavlovichev", "Stefan Thies", + "Stepan Herold", "Stupnikov Andrey", "SuperBot", "Sébastien Launay", @@ -271,6 +287,7 @@ const char * auto_contributors[] { "The-Alchemist", "Tobias Adamson", "Tsarkova Anastasia", + "VDimir", "Vadim", "Vadim Plakhtinskiy", "Vadim Skipin", @@ -284,6 +301,7 @@ const char * auto_contributors[] { "Victor Tarnavsky", "Vitaliy Karnienko", "Vitaliy Lyudvichenko", + "Vitaliy Zakaznikov", "Vitaly Baranov", "Vitaly Samigullin", "Vivien Maisonneuve", @@ -296,6 +314,7 @@ const char * auto_contributors[] { "Vladislav Smirnov", "Vojtech Splichal", "Vsevolod Orlov", + "Vxider", "Vyacheslav Alipov", "Weiqing Xu", "William Shallum", @@ -312,9 +331,11 @@ const char * auto_contributors[] { "abdrakhmanov", "abyss7", "achulkov2", + "akazz", "akonyaev", "akuzm", "alesapin", + "alex-zaitsev", "alexander kozhikhov", "alexey-milovidov", "andrewsg", @@ -336,9 +357,13 @@ const char * auto_contributors[] { "chertus", "coraxster", "daoready", + "dasmfm", "davydovska", "decaseal", + "dependabot[bot]", "dimarub2000", + "dmitrii", + "dmitriiut", "dmitry kuzmin", "eejoin", "egatov", @@ -346,6 +371,7 @@ const char * auto_contributors[] { "ezhaka", "f1yegor", "felixoid", + "fenglv", "fessmage", "filimonov", "flow", @@ -363,8 +389,10 @@ const char * auto_contributors[] { "javi", "javi santana", "kmeaw", + "kreuzerkrieg", "ks1322", "kshvakov", + "l", "leozhang", "levushkin aleksej", "levysh", @@ -375,10 +403,13 @@ const char * auto_contributors[] { "lomberts", "luc1ph3r", "maiha", + "malkfilipp", + "maqroll", "maxkuzn", "mf5137", "mfridental", "miha-g", + "millb", "morty", "moscas", "never lee", @@ -388,7 +419,9 @@ const char * auto_contributors[] { "ogorbacheva", "olegkv", "orantius", + "palasonicq", "peshkurov", + "philip.han", "proller", "pyos", "qianlixiang", @@ -399,6 +432,8 @@ const char * auto_contributors[] { "santaux", "sdk2", "serebrserg", + "sev7e0", + "sfod", "shangshujie", "shedx", "simon-says", @@ -408,6 +443,7 @@ const char * auto_contributors[] { "sundyli", "svladykin", "tai", + "tavplubix", "topvisor", "unknown", "urgordeadbeef", @@ -427,4 +463,6 @@ const char * auto_contributors[] { "张健", "张风啸", "谢磊", + "黄朝晖", + "박현우", nullptr}; diff --git a/debian/changelog b/debian/changelog index 563be7c48eb..131741b202f 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,5 +1,5 @@ -clickhouse (19.15.1.1) unstable; urgency=low +clickhouse (19.16.1.1) unstable; urgency=low * Modified source code - -- clickhouse-release Fri, 06 Sep 2019 17:58:30 +0300 + -- clickhouse-release Tue, 24 Sep 2019 14:47:28 +0300 diff --git a/docker/client/Dockerfile b/docker/client/Dockerfile index 9fde85b9fb0..3134686b0c0 100644 --- a/docker/client/Dockerfile +++ b/docker/client/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:18.04 ARG repository="deb http://repo.yandex.ru/clickhouse/deb/stable/ main/" -ARG version=19.15.1.* +ARG version=19.16.1.* RUN apt-get update \ && apt-get install --yes --no-install-recommends \ diff --git a/docker/server/Dockerfile b/docker/server/Dockerfile index c65a0ddb550..4b5420a3e5a 100644 --- a/docker/server/Dockerfile +++ b/docker/server/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:18.04 ARG repository="deb http://repo.yandex.ru/clickhouse/deb/stable/ main/" -ARG version=19.15.1.* +ARG version=19.16.1.* ARG gosu_ver=1.10 RUN apt-get update \ diff --git a/docker/test/Dockerfile b/docker/test/Dockerfile index 0106d877feb..189e76d6c6b 100644 --- a/docker/test/Dockerfile +++ b/docker/test/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:18.04 ARG repository="deb 
http://repo.yandex.ru/clickhouse/deb/stable/ main/" -ARG version=19.15.1.* +ARG version=19.16.1.* RUN apt-get update && \ apt-get install -y apt-transport-https dirmngr && \ From 80d902a4010090605ca5378931a392db541b4e3a Mon Sep 17 00:00:00 2001 From: chertus Date: Tue, 24 Sep 2019 16:45:59 +0300 Subject: [PATCH 239/309] fix case with duplicated right keys --- dbms/src/Interpreters/MergeJoin.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index 5168b9e13c4..bd7e7cfe078 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -133,7 +133,7 @@ public: return getNextEqualRangeImpl(rhs); } - int intersect(const Block & right_block, const Block & right_table_keys) + int intersect(const Block & right_block, const Block & right_table_keys, const Names & key_names) { const Block min_max = extractMinMax(right_block, right_table_keys); if (end() == 0 || min_max.rows() != 2) @@ -146,7 +146,7 @@ public: for (size_t i = 0; i < impl.sort_columns.size(); ++i) { auto & left_column = *impl.sort_columns[i]; - auto & right_column = *min_max.getByPosition(i).column; + auto & right_column = *min_max.getByName(key_names[i]).column; /// cannot get by position cause of possible duplicates if (!first_vs_max) first_vs_max = nullableCompareAt(left_column, right_column, position(), 1); @@ -426,7 +426,7 @@ void MergeJoin::joinBlock(Block & block) if (skip_not_intersected) { - int intersection = left_cursor.intersect(*it, right_table_keys); + int intersection = left_cursor.intersect(*it, right_table_keys, table_join->keyNamesRight()); if (intersection < 0) break; /// (left) ... (right) if (intersection > 0) @@ -452,7 +452,7 @@ void MergeJoin::joinBlock(Block & block) if (skip_not_intersected) { - int intersection = left_cursor.intersect(*it, right_table_keys); + int intersection = left_cursor.intersect(*it, right_table_keys, table_join->keyNamesRight()); if (intersection < 0) break; /// (left) ... 
(right) if (intersection > 0) From dac5889bc4c810b3772f6678d87c116c679ce7fa Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Tue, 24 Sep 2019 17:43:07 +0300 Subject: [PATCH 240/309] Add build instruction --- docs/en/development/build.md | 10 +++--- docs/en/development/build_cross.md | 53 ++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 6 deletions(-) create mode 100644 docs/en/development/build_cross.md diff --git a/docs/en/development/build.md b/docs/en/development/build.md index 02cea936c70..f67ff45786a 100644 --- a/docs/en/development/build.md +++ b/docs/en/development/build.md @@ -10,7 +10,7 @@ sudo apt-get install git pbuilder debhelper lsb-release fakeroot sudo debian-arc ## Checkout ClickHouse Sources ```bash -git clone --recursive --branch stable https://github.com/yandex/ClickHouse.git +git clone --recursive --branch master https://github.com/ClickHouse/ClickHouse.git cd ClickHouse ``` @@ -55,7 +55,7 @@ sudo apt-get install gcc-9 g++-9 ### Install from Sources -Look at [utils/ci/build-gcc-from-sources.sh](https://github.com/yandex/ClickHouse/blob/master/utils/ci/build-gcc-from-sources.sh) +Look at [utils/ci/build-gcc-from-sources.sh](https://github.com/ClickHouse/ClickHouse/blob/master/utils/ci/build-gcc-from-sources.sh) ## Use GCC 9 for Builds @@ -73,14 +73,12 @@ sudo apt-get install libicu-dev libreadline-dev gperf ## Checkout ClickHouse Sources ```bash -git clone --recursive git@github.com:yandex/ClickHouse.git -# or: git clone --recursive https://github.com/yandex/ClickHouse.git +git clone --recursive git@github.com:ClickHouse/ClickHouse.git +# or: git clone --recursive https://github.com/ClickHouse/ClickHouse.git cd ClickHouse ``` -For the latest stable version, switch to the `stable` branch. - ## Build ClickHouse ```bash diff --git a/docs/en/development/build_cross.md b/docs/en/development/build_cross.md new file mode 100644 index 00000000000..ebbde15ec3f --- /dev/null +++ b/docs/en/development/build_cross.md @@ -0,0 +1,53 @@ +# How to Build ClickHouse on Linux for Mac OS X + +The cross-build for Mac OS X is based on the Build instructions, follow them first. + +# Install Clang-8 + +Follow the instructions from https://apt.llvm.org/ for your Ubuntu or Debian setup. +For example the commands for Bionic are like: + +```bash +sudo echo "deb [trusted=yes] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8 main" >> /etc/apt/sources.list +sudo apt-get install clang-8 +``` + +# Install Cross-Compilation Toolset + +```bash +mkdir cctools + +git clone https://github.com/tpoechtrager/apple-libtapi.git +cd apple-libtapi +INSTALLPREFIX=../cctools ./build.sh +./install.sh +cd .. + +git clone https://github.com/tpoechtrager/cctools-port.git +cd cctools-port/cctools +./configure --prefix=../cctools --with-libtapi=../cctools --target=x86_64-apple-darwin +make install +cd .. + +cd cctools +wget https://github.com/phracker/MacOSX-SDKs/releases/download/10.14-beta4/MacOSX10.14.sdk.tar.xz +tar xJf MacOSX10.14.sdk.tar.xz +``` + +Let's remember the path where we created `cctools` directory as ${CCTOOLS_PARENT} + +# Build ClickHouse + +```bash +cd ClickHouse +mkdir build-osx +CC=clang-8 CXX=clang++-8 cmake . 
-Bbuild-osx -DCMAKE_SYSTEM_NAME=Darwin \ + -DCMAKE_AR:FILEPATH=${CCTOOLS_PARENT}/cctools/bin/x86_64-apple-darwin-ar \ + -DCMAKE_RANLIB:FILEPATH=${CCTOOLS_PARENT}/cctools/bin/x86_64-apple-darwin-ranlib \ + -DLINKER_NAME=${CCTOOLS_PARENT}/cctools/bin/x86_64-apple-darwin-ld \ + -DSDK_PATH=${CCTOOLS_PARENT}/cctools/MacOSX10.14.sdk \ + -DUSE_SNAPPY=OFF -DENABLE_SSL=OFF -DENABLE_PROTOBUF=OFF -DENABLE_PARQUET=OFF -DENABLE_READLINE=OFF -DENABLE_ICU=OFF -DENABLE_FASTOPS=OFF +ninja -C build-osx +``` + +The resulting binary will have Mach-O executable format and can't be run on Linux. From f3bde19b74207707c0cc678c22682327f800bb07 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Tue, 20 Aug 2019 12:58:44 +0300 Subject: [PATCH 241/309] Do not use iterators in find() and emplace() methods of hash tables. Instead, these methods return a pointer to the required data as they are stored inside the hash table. The caller uses overloaded functions to get the key and "mapped" values from this pointer. Such an interface avoids the need for constructing iterator-like wrapper objects, which is especially important for compound hash tables such as the future StringHashMap. --- dbms/programs/obfuscator/Obfuscator.cpp | 8 +- .../AggregateFunctionGroupUniqArray.h | 4 +- dbms/src/Columns/ReverseIndex.h | 11 +- dbms/src/Common/ColumnsHashing.h | 9 +- dbms/src/Common/ColumnsHashingImpl.h | 39 ++-- dbms/src/Common/HashTable/ClearableHashMap.h | 11 +- dbms/src/Common/HashTable/ClearableHashSet.h | 8 + .../Common/HashTable/FixedClearableHashMap.h | 1 - .../Common/HashTable/FixedClearableHashSet.h | 4 +- dbms/src/Common/HashTable/FixedHashMap.h | 28 ++- dbms/src/Common/HashTable/FixedHashTable.h | 33 ++-- dbms/src/Common/HashTable/HashMap.h | 37 +++- dbms/src/Common/HashTable/HashSet.h | 8 + dbms/src/Common/HashTable/HashTable.h | 174 +++++++++++++----- dbms/src/Common/HashTable/TwoLevelHashMap.h | 9 +- dbms/src/Common/HashTable/TwoLevelHashTable.h | 40 ++-- dbms/src/Common/SpaceSaving.h | 4 +- dbms/src/Common/tests/auto_array.cpp | 6 +- dbms/src/Common/tests/hash_table.cpp | 12 +- .../src/Common/tests/parallel_aggregation.cpp | 30 +-- .../Common/tests/parallel_aggregation2.cpp | 14 +- dbms/src/Core/tests/string_pool.cpp | 6 +- dbms/src/DataTypes/DataTypeEnum.cpp | 12 +- dbms/src/DataTypes/DataTypeEnum.h | 4 +- .../Dictionaries/ComplexKeyCacheDictionary.h | 2 +- .../ComplexKeyHashedDictionary.cpp | 6 +- dbms/src/Dictionaries/HashedDictionary.cpp | 2 +- .../Dictionaries/RangeHashedDictionary.cpp | 16 +- dbms/src/Functions/addressToLine.cpp | 6 +- dbms/src/Functions/array/arrayDistinct.cpp | 6 +- dbms/src/Functions/transform.cpp | 32 ++-- dbms/src/Interpreters/Aggregator.cpp | 1 + dbms/src/Interpreters/SetVariants.h | 17 +- dbms/src/Interpreters/tests/hash_map.cpp | 18 +- .../Interpreters/tests/hash_map_lookup.cpp | 21 ++- .../Interpreters/tests/hash_map_string.cpp | 24 +-- .../Interpreters/tests/hash_map_string_2.cpp | 6 +- .../Interpreters/tests/hash_map_string_3.cpp | 6 +- .../tests/hash_map_string_small.cpp | 12 +- .../Interpreters/tests/two_level_hash_map.cpp | 12 +- .../Impl/JSONEachRowRowInputFormat.cpp | 12 +- .../Formats/Impl/JSONEachRowRowInputFormat.h | 2 +- .../Formats/Impl/TSKVRowInputFormat.cpp | 4 +- .../MergeTree/MergeTreeDataWriter.cpp | 6 +- 44 files changed, 440 insertions(+), 283 deletions(-) diff --git a/dbms/programs/obfuscator/Obfuscator.cpp b/dbms/programs/obfuscator/Obfuscator.cpp index febe2b28606..be6125d77bf 100644 --- a/dbms/programs/obfuscator/Obfuscator.cpp +++ 
b/dbms/programs/obfuscator/Obfuscator.cpp @@ -670,13 +670,13 @@ public: while (pos < end) { - Table::iterator it = table.end(); + Table::LookupResult it; size_t context_size = params.order; while (true) { it = table.find(hashContext(code_points.data() + code_points.size() - context_size, code_points.data() + code_points.size())); - if (table.end() != it && it->getSecond().total + it->getSecond().count_end != 0) + if (it && lookupResultGetMapped(it)->total + lookupResultGetMapped(it)->count_end != 0) break; if (context_size == 0) @@ -684,7 +684,7 @@ public: --context_size; } - if (table.end() == it) + if (!it) throw Exception("Logical error in markov model", ErrorCodes::LOGICAL_ERROR); size_t offset_from_begin_of_string = pos - data; @@ -710,7 +710,7 @@ public: if (num_bytes_after_desired_size > 0) end_probability_multiplier = std::pow(1.25, num_bytes_after_desired_size); - CodePoint code = it->getSecond().sample(determinator, end_probability_multiplier); + CodePoint code = lookupResultGetMapped(it)->sample(determinator, end_probability_multiplier); if (code == END) break; diff --git a/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h b/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h index f5dd37ca644..f4f9f0913d9 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionGroupUniqArray.h @@ -218,7 +218,7 @@ public: return; bool inserted; - State::Set::iterator it; + State::Set::LookupResult it; auto key_holder = getKeyHolder(*columns[0], row_num, *arena); set.emplace(key_holder, it, inserted); } @@ -229,7 +229,7 @@ public: auto & rhs_set = this->data(rhs).value; bool inserted; - State::Set::iterator it; + State::Set::LookupResult it; for (auto & rhs_elem : rhs_set) { if (limit_num_elems && cur_set.size() >= max_elems) diff --git a/dbms/src/Columns/ReverseIndex.h b/dbms/src/Columns/ReverseIndex.h index 2e017202741..1e80164ca05 100644 --- a/dbms/src/Columns/ReverseIndex.h +++ b/dbms/src/Columns/ReverseIndex.h @@ -151,6 +151,7 @@ namespace public: using Base::Base; using iterator = typename Base::iterator; + using LookupResult = typename Base::LookupResult; State & getState() { return *this; } @@ -168,7 +169,7 @@ namespace } template - void ALWAYS_INLINE reverseIndexEmplaceNonZero(const Key & key, iterator & it, + void ALWAYS_INLINE reverseIndexEmplaceNonZero(const Key & key, LookupResult & it, bool & inserted, size_t hash_value, const ObjectToCompareWith & object) { size_t place_value = reverseIndexFindCell(object, hash_value, @@ -184,10 +185,14 @@ namespace void ALWAYS_INLINE reverseIndexEmplace(Key key, iterator & it, bool & inserted, size_t hash_value, const ObjectToCompareWith& object) { - if (!this->emplaceIfZero(key, it, inserted, hash_value)) + LookupResult impl_it = nullptr; + + if (!this->emplaceIfZero(key, impl_it, inserted, hash_value)) { - reverseIndexEmplaceNonZero(key, it, inserted, hash_value, object); + reverseIndexEmplaceNonZero(key, impl_it, inserted, hash_value, object); } + assert(impl_it != nullptr); + it = iterator(this, impl_it); } template diff --git a/dbms/src/Common/ColumnsHashing.h b/dbms/src/Common/ColumnsHashing.h index c02d79d3648..28938bd43ac 100644 --- a/dbms/src/Common/ColumnsHashing.h +++ b/dbms/src/Common/ColumnsHashing.h @@ -349,7 +349,7 @@ struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod auto key_holder = getKeyHolder(row_, pool); bool inserted = false; - typename Data::iterator it; + typename Data::LookupResult it; if (saved_hash) 
data.emplace(key_holder, it, inserted, saved_hash[row]); else @@ -359,12 +359,13 @@ struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod if constexpr (has_mapped) { + auto & mapped = *lookupResultGetMapped(it); if (inserted) { - new (&it->getSecond()) Mapped(); + new (&mapped) Mapped(); } - mapped_cache[row] = it->getSecond(); - return EmplaceResult(it->getSecond(), mapped_cache[row], inserted); + mapped_cache[row] = mapped; + return EmplaceResult(mapped, mapped_cache[row], inserted); } else return EmplaceResult(inserted); diff --git a/dbms/src/Common/ColumnsHashingImpl.h b/dbms/src/Common/ColumnsHashingImpl.h index 0fed1160ba6..e204242d8fe 100644 --- a/dbms/src/Common/ColumnsHashingImpl.h +++ b/dbms/src/Common/ColumnsHashingImpl.h @@ -168,34 +168,41 @@ protected: } } - typename Data::iterator it; + typename Data::LookupResult it; bool inserted = false; data.emplace(key_holder, it, inserted); [[maybe_unused]] Mapped * cached = nullptr; if constexpr (has_mapped) - cached = &it->getSecond(); + cached = lookupResultGetMapped(it); if (inserted) { if constexpr (has_mapped) { - new(&it->getSecond()) Mapped(); + new(lookupResultGetMapped(it)) Mapped(); } } if constexpr (consecutive_keys_optimization) { - cache.value = it->getValue(); cache.found = true; cache.empty = false; if constexpr (has_mapped) + { + cache.value.first = *lookupResultGetKey(it); + cache.value.second = *lookupResultGetMapped(it); cached = &cache.value.second; + } + else + { + cache.value = *lookupResultGetKey(it); + } } if constexpr (has_mapped) - return EmplaceResult(it->getSecond(), *cached, inserted); + return EmplaceResult(*lookupResultGetMapped(it), *cached, inserted); else return EmplaceResult(inserted); } @@ -215,28 +222,30 @@ protected: } auto it = data.find(key); - bool found = it != data.end(); if constexpr (consecutive_keys_optimization) { - cache.found = found; + cache.found = it != nullptr; cache.empty = false; - if (found) - cache.value = it->getValue(); + if constexpr (has_mapped) + { + cache.value.first = key; + if (it) + { + cache.value.second = *lookupResultGetMapped(it); + } + } else { - if constexpr (has_mapped) - cache.value.first = key; - else - cache.value = key; + cache.value = key; } } if constexpr (has_mapped) - return FindResult(found ? &it->getSecond() : nullptr, found); + return FindResult(it ? 
lookupResultGetMapped(it) : nullptr, it != nullptr); else - return FindResult(found); + return FindResult(it != nullptr); } }; diff --git a/dbms/src/Common/HashTable/ClearableHashMap.h b/dbms/src/Common/HashTable/ClearableHashMap.h index c7084e56e4c..d1703394e14 100644 --- a/dbms/src/Common/HashTable/ClearableHashMap.h +++ b/dbms/src/Common/HashTable/ClearableHashMap.h @@ -14,6 +14,11 @@ struct ClearableHashMapCell : public ClearableHashTableCell +auto lookupResultGetKey(ClearableHashMapCell * cell) { return &cell->getFirst(); } + +template +auto lookupResultGetMapped(ClearableHashMapCell * cell) { return &cell->getSecond(); } template < @@ -32,14 +37,14 @@ public: mapped_type & operator[](Key x) { - typename ClearableHashMap::iterator it; + typename ClearableHashMap::LookupResult it; bool inserted; this->emplace(x, it, inserted); if (inserted) - new(&it->getSecond()) mapped_type(); + new(lookupResultGetMapped(it)) mapped_type(); - return it->getSecond(); + return *lookupResultGetMapped(it); } void clear() diff --git a/dbms/src/Common/HashTable/ClearableHashSet.h b/dbms/src/Common/HashTable/ClearableHashSet.h index e61504d025f..4f079eddc78 100644 --- a/dbms/src/Common/HashTable/ClearableHashSet.h +++ b/dbms/src/Common/HashTable/ClearableHashSet.h @@ -48,6 +48,11 @@ struct ClearableHashTableCell : public BaseCell ClearableHashTableCell(const Key & key_, const State & state) : BaseCell(key_, state), version(state.version) {} }; +template +auto lookupResultGetKey(ClearableHashTableCell * cell) { return &cell->key; } + +template +void * lookupResultGetMapped(ClearableHashTableCell *) { return nullptr; } template < @@ -62,6 +67,9 @@ public: using key_type = Key; using value_type = typename ClearableHashSet::cell_type::value_type; + using Base = HashTable>, Hash, Grower, Allocator>; + using typename Base::LookupResult; + void clear() { ++this->version; diff --git a/dbms/src/Common/HashTable/FixedClearableHashMap.h b/dbms/src/Common/HashTable/FixedClearableHashMap.h index ab808b56f34..e4a67b63446 100644 --- a/dbms/src/Common/HashTable/FixedClearableHashMap.h +++ b/dbms/src/Common/HashTable/FixedClearableHashMap.h @@ -23,7 +23,6 @@ struct FixedClearableHashMapCell bool isZero(const State & state) const { return version != state.version; } void setZero() { version = 0; } static constexpr bool need_zero_value_storage = false; - void setMapped(const value_type & value) { mapped = value.getSecond(); } struct CellExt { diff --git a/dbms/src/Common/HashTable/FixedClearableHashSet.h b/dbms/src/Common/HashTable/FixedClearableHashSet.h index f22e41fcd4e..063798ae370 100644 --- a/dbms/src/Common/HashTable/FixedClearableHashSet.h +++ b/dbms/src/Common/HashTable/FixedClearableHashSet.h @@ -10,6 +10,7 @@ struct FixedClearableHashTableCell using State = ClearableHashSetState; using value_type = Key; + using mapped_type = void; UInt32 version; FixedClearableHashTableCell() {} @@ -18,7 +19,6 @@ struct FixedClearableHashTableCell bool isZero(const State & state) const { return version != state.version; } void setZero() { version = 0; } static constexpr bool need_zero_value_storage = false; - void setMapped(const value_type & /*value*/) {} struct CellExt { @@ -33,8 +33,10 @@ template class FixedClearableHashSet : public FixedHashTable, Allocator> { public: + using Base = FixedHashTable, Allocator>; using key_type = Key; using value_type = typename FixedClearableHashSet::cell_type::value_type; + using LookupResult = typename Base::LookupResult; void clear() { diff --git a/dbms/src/Common/HashTable/FixedHashMap.h 
b/dbms/src/Common/HashTable/FixedHashMap.h index 4e7686002eb..e0f1a2494e0 100644 --- a/dbms/src/Common/HashTable/FixedHashMap.h +++ b/dbms/src/Common/HashTable/FixedHashMap.h @@ -11,6 +11,8 @@ struct FixedHashMapCell using State = TState; using value_type = PairNoInit; + using mapped_type = TMapped; + Mapped mapped; bool full; @@ -23,7 +25,6 @@ struct FixedHashMapCell bool isZero(const State &) const { return !full; } void setZero() { full = false; } static constexpr bool need_zero_value_storage = false; - void setMapped(const value_type & value) { mapped = value.getSecond(); } /// Similar to FixedHashSetCell except that we need to contain a pointer to the Mapped field. /// Note that we have to assemble a continuous layout for the value_type on each call of getValue(). @@ -46,6 +47,11 @@ struct FixedHashMapCell }; }; +template +void * lookupResultGetKey(FixedHashMapCell *) { return nullptr; } + +template +auto lookupResultGetMapped(FixedHashMapCell * cell) { return &cell->getSecond(); } template class FixedHashMap : public FixedHashTable, Allocator> @@ -54,21 +60,23 @@ public: using Base = FixedHashTable, Allocator>; using Self = FixedHashMap; using key_type = Key; - using mapped_type = Mapped; using Cell = typename Base::cell_type; using value_type = typename Cell::value_type; + using mapped_type = typename Cell::Mapped; using Base::Base; + using LookupResult = typename Base::LookupResult; + template void ALWAYS_INLINE mergeToViaEmplace(Self & that, Func && func) { for (auto it = this->begin(), end = this->end(); it != end; ++it) { - decltype(it) res_it; + typename Self::LookupResult res_it; bool inserted; that.emplace(it->getFirst(), res_it, inserted, it.getHash()); - func(res_it->getSecond(), it->getSecond(), inserted); + func(*lookupResultGetMapped(res_it), it->getSecond(), inserted); } } @@ -77,11 +85,11 @@ public: { for (auto it = this->begin(), end = this->end(); it != end; ++it) { - decltype(it) res_it = that.find(it->getFirst(), it.getHash()); - if (res_it == that.end()) + auto res_it = that.find(it->getFirst(), it.getHash()); + if (!res_it) func(it->getSecond(), it->getSecond(), false); else - func(res_it->getSecond(), it->getSecond(), true); + func(*lookupResultGetMapped(res_it), it->getSecond(), true); } } @@ -101,12 +109,12 @@ public: mapped_type & ALWAYS_INLINE operator[](Key x) { - typename Base::iterator it; + typename Base::LookupResult it; bool inserted; this->emplace(x, it, inserted); if (inserted) - new (&it->getSecond()) mapped_type(); + new (it) mapped_type(); - return it->getSecond(); + return it; } }; diff --git a/dbms/src/Common/HashTable/FixedHashTable.h b/dbms/src/Common/HashTable/FixedHashTable.h index b673dbcea8f..aadce906dc2 100644 --- a/dbms/src/Common/HashTable/FixedHashTable.h +++ b/dbms/src/Common/HashTable/FixedHashTable.h @@ -8,6 +8,7 @@ struct FixedHashTableCell using State = TState; using value_type = Key; + using mapped_type = void; bool full; FixedHashTableCell() {} @@ -16,7 +17,6 @@ struct FixedHashTableCell bool isZero(const State &) const { return !full; } void setZero() { full = false; } static constexpr bool need_zero_value_storage = false; - void setMapped(const value_type & /*value*/) {} /// This Cell is only stored inside an iterator. 
It's used to accomodate the fact /// that the iterator based API always provide a reference to a continuous memory @@ -141,6 +141,11 @@ protected: public: using key_type = Key; using value_type = typename Cell::value_type; + using mapped_type = typename Cell::mapped_type; + + using LookupResult = Cell *; + using ConstLookupResult = const Cell *; + size_t hash(const Key & x) const { return x; } @@ -263,9 +268,9 @@ public: public: /// The last parameter is unused but exists for compatibility with HashTable interface. - void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted, size_t /* hash */ = 0) + void ALWAYS_INLINE emplace(Key x, LookupResult & it, bool & inserted, size_t /* hash */ = 0) { - it = iterator(this, &buf[x]); + it = &buf[x]; if (!buf[x].isZero(*this)) { @@ -278,34 +283,34 @@ public: ++m_size; } - std::pair ALWAYS_INLINE insert(const value_type & x) + std::pair ALWAYS_INLINE insert(const value_type & x) { - std::pair res; + std::pair res; emplace(Cell::getKey(x), res.first, res.second); if (res.second) - res.first.ptr->setMapped(x); + insertSetMapped(lookupResultGetMapped(res.first), x); return res; } - iterator ALWAYS_INLINE find(Key x) + LookupResult ALWAYS_INLINE find(Key x) { - return !buf[x].isZero(*this) ? iterator(this, &buf[x]) : end(); + return !buf[x].isZero(*this) ? &buf[x] : nullptr; } - const_iterator ALWAYS_INLINE find(Key x) const + ConstLookupResult ALWAYS_INLINE find(Key x) const { - return !buf[x].isZero(*this) ? const_iterator(this, &buf[x]) : end(); + return const_cast *>(this)->find(x); } - iterator ALWAYS_INLINE find(Key, size_t hash_value) + LookupResult ALWAYS_INLINE find(Key, size_t hash_value) { - return !buf[hash_value].isZero(*this) ? iterator(this, &buf[hash_value]) : end(); + return !buf[hash_value].isZero(*this) ? &buf[hash_value] : nullptr; } - const_iterator ALWAYS_INLINE find(Key, size_t hash_value) const + ConstLookupResult ALWAYS_INLINE find(Key key, size_t hash_value) const { - return !buf[hash_value].isZero(*this) ? 
const_iterator(this, &buf[hash_value]) : end(); + return const_cast *>(this)->find(key, hash_value); } bool ALWAYS_INLINE has(Key x) const { return !buf[x].isZero(*this); } diff --git a/dbms/src/Common/HashTable/HashMap.h b/dbms/src/Common/HashTable/HashMap.h index 55a4d91dd84..1d9ca29b77a 100644 --- a/dbms/src/Common/HashTable/HashMap.h +++ b/dbms/src/Common/HashTable/HashMap.h @@ -43,6 +43,9 @@ struct HashMapCell using State = TState; using value_type = PairNoInit; + using mapped_type = Mapped; + using key_type = Key; + value_type value; HashMapCell() {} @@ -107,6 +110,12 @@ struct HashMapCell } }; +template +auto lookupResultGetKey(HashMapCell * cell) { return &cell->getFirst(); } + +template +auto lookupResultGetMapped(HashMapCell * cell) { return &cell->getSecond(); } + template struct HashMapCellWithSavedHash : public HashMapCell @@ -125,6 +134,12 @@ struct HashMapCellWithSavedHash : public HashMapCell size_t getHash(const Hash & /*hash_function*/) const { return saved_hash; } }; +template +auto lookupResultGetKey(HashMapCellWithSavedHash * cell) { return &cell->getFirst(); } + +template +auto lookupResultGetMapped(HashMapCellWithSavedHash * cell) { return &cell->getSecond(); } + template < typename Key, @@ -136,9 +151,13 @@ class HashMapTable : public HashTable { public: using Self = HashMapTable; + using Base = HashTable; + using key_type = Key; - using mapped_type = typename Cell::Mapped; using value_type = typename Cell::value_type; + using mapped_type = typename Cell::Mapped; + + using LookupResult = typename Base::LookupResult; using HashTable::HashTable; @@ -153,10 +172,10 @@ public: { for (auto it = this->begin(), end = this->end(); it != end; ++it) { - decltype(it) res_it; + typename Self::LookupResult res_it; bool inserted; that.emplace(it->getFirst(), res_it, inserted, it.getHash()); - func(res_it->getSecond(), it->getSecond(), inserted); + func(*lookupResultGetMapped(res_it), it->getSecond(), inserted); } } @@ -170,11 +189,11 @@ public: { for (auto it = this->begin(), end = this->end(); it != end; ++it) { - decltype(it) res_it = that.find(it->getFirst(), it.getHash()); - if (res_it == that.end()) + auto res_it = that.find(it->getFirst(), it.getHash()); + if (!res_it) func(it->getSecond(), it->getSecond(), false); else - func(res_it->getSecond(), it->getSecond(), true); + func(*lookupResultGetMapped(res_it), it->getSecond(), true); } } @@ -196,7 +215,7 @@ public: mapped_type & ALWAYS_INLINE operator[](Key x) { - typename HashMapTable::iterator it; + typename HashMapTable::LookupResult it; bool inserted; this->emplace(x, it, inserted); @@ -215,9 +234,9 @@ public: * the compiler can not guess about this, and generates the `load`, `increment`, `store` code. 
*/ if (inserted) - new(&it->getSecond()) mapped_type(); + new(lookupResultGetMapped(it)) mapped_type(); - return it->getSecond(); + return *lookupResultGetMapped(it); } }; diff --git a/dbms/src/Common/HashTable/HashSet.h b/dbms/src/Common/HashTable/HashSet.h index f506197f343..9c25f7f906b 100644 --- a/dbms/src/Common/HashTable/HashSet.h +++ b/dbms/src/Common/HashTable/HashSet.h @@ -30,6 +30,9 @@ public: using Self = HashSetTable; using Cell = TCell; + using Base = HashTable; + using typename Base::LookupResult; + void merge(const Self & rhs) { if (!this->hasZero() && rhs.hasZero()) @@ -81,6 +84,11 @@ struct HashSetCellWithSavedHash : public HashTableCell size_t getHash(const Hash & /*hash_function*/) const { return saved_hash; } }; +template +auto lookupResultGetKey(HashSetCellWithSavedHash * cell) { return &cell->key; } + +template +void * lookupResultGetMapped(HashSetCellWithSavedHash *) { return nullptr; } template < diff --git a/dbms/src/Common/HashTable/HashTable.h b/dbms/src/Common/HashTable/HashTable.h index ade51969bea..03822996361 100644 --- a/dbms/src/Common/HashTable/HashTable.h +++ b/dbms/src/Common/HashTable/HashTable.h @@ -77,6 +77,67 @@ void set(T & x) { x = 0; } } +/** + * lookupResultGetKey/Mapped -- functions to get key/"mapped" values from the + * LookupResult returned by find() and emplace() methods of HashTable. + * Must not be called for a null LookupResult. + * + * We don't use iterators for lookup result to avoid creating temporary + * objects. Instead, LookupResult is a pointer of some kind. There are global + * functions lookupResultGetKey/Mapped, overloaded for this pointer type, that + * return pointers to key/"mapped" values. They are implemented as global + * functions and not as methods, because they have to be overloaded for POD + * types, e.g. in StringHashTable where different components have different + * Cell format. + * + * Different hash table implementations support this interface to a varying + * degree: + * + * 1) Hash tables that store neither the key in its original form, nor a + * "mapped" value: FixedHashTable or StringHashTable. + * Neither GetKey nor GetMapped are supported, the only valid operation is + * checking LookupResult for null. + * + * 2) Hash maps that do not store the key, e.g. FixedHashMap or StringHashMap. + * Only GetMapped is supported. + * + * 3) Hash tables that store the key and do not have a "mapped" value, e.g. the + * normal HashTable. + * GetKey returns the key, and GetMapped returns a zero void pointer. This + * simplifies generic code that works with mapped values: it can overload + * on the return type of GetMapped(), and doesn't need other parameters. One + * example is insertSetMapped() function. + * + * 4) Hash tables that store both the key and the "mapped" value, e.g. HashMap. + * Both GetKey and GetMapped are supported. + * + * The implementation side goes as follows: + * for (1), LookupResult = void *, no getters; + * for (2), LookupResult = Mapped *, GetMapped is a default implementation that + * takes any pointer-like object; + * for (3) and (4), LookupResult = Cell *, and both getters are implemented. + * They have to be specialized for each particular Cell class to supersede the + * default verision that takes a generic pointer-like object. + */ + +/** + * The default implementation of GetMapped that is used for the above case (2). 
+ */ +template +inline auto lookupResultGetMapped(PointerLike && ptr) { return &*ptr; } + +/** + * Generic const wrapper for lookupResultGetMapped, that calls a non-const + * version. Should be safe, given that these functions only do pointer + * arithmetics. + */ +template +auto lookupResultGetMapped(const T * obj) +{ + auto mapped_ptr = lookupResultGetMapped(const_cast(obj)); + const auto const_mapped_ptr = mapped_ptr; + return const_mapped_ptr; +} /** Compile-time interface for cell of the hash table. * Different cell types are used to implement different hash tables. @@ -89,7 +150,10 @@ struct HashTableCell { using State = TState; + using key_type = Key; using value_type = Key; + using mapped_type = void; + Key key; HashTableCell() {} @@ -143,6 +207,22 @@ struct HashTableCell void readText(DB::ReadBuffer & rb) { DB::readDoubleQuoted(key, rb); } }; +template +auto lookupResultGetKey(HashTableCell * cell) { return &cell->key; } + +template +void * lookupResultGetMapped(HashTableCell *) { return nullptr; } + +/** + * A helper function for HashTable::insert() to set the "mapped" value. + * Overloaded on the mapped type, does nothing if it's void. + */ +template +void insertSetMapped(void * /* dest */, const ValueType & /* src */) {} + +template +void insertSetMapped(MappedType * dest, const ValueType & src) { *dest = src.second; } + /** Determines the size of the hash table, and when and how much it should be resized. */ @@ -476,6 +556,23 @@ protected: { return container->grower.place((ptr - container->buf) - container->grower.place(getHash())); } + + /** + * A hack for HashedDictionary. + * + * The problem: std-like find() returns an iterator, which has to be + * compared to end(). On the other hand, HashMap::find() returns + * LookupResult, which is compared to nullptr. HashedDictionary has to + * support both hash maps with the same code, hence the need for this + * hack. + * + * The proper way would be to remove iterator interface from our + * HashMap completely, change all its users to the existing internal + * iteration interface, and redefine end() to return LookupResult for + * compatibility with std find(). Unfortunately, now is not the time to + * do this. + */ + operator Cell * () const { return nullptr; } }; @@ -483,6 +580,10 @@ public: using key_type = Key; using value_type = typename Cell::value_type; + // Use lookupResultGetMapped/Key to work with these values. + using LookupResult = Cell *; + using ConstLookupResult = const Cell *; + size_t hash(const Key & x) const { return Hash::operator()(x); } @@ -642,7 +743,7 @@ protected: /// If the key is zero, insert it into a special place and return true. /// We don't have to persist a zero key, because it's not actually inserted. /// That's why we just take a Key by value, an not a key holder. - bool ALWAYS_INLINE emplaceIfZero(Key x, iterator & it, bool & inserted, size_t hash_value) + bool ALWAYS_INLINE emplaceIfZero(Key x, LookupResult & it, bool & inserted, size_t hash_value) { /// If it is claimed that the zero key can not be inserted into the table. 
if (!Cell::need_zero_value_storage) @@ -650,12 +751,13 @@ protected: if (Cell::isZero(x, *this)) { - it = iteratorToZero(); + it = this->zeroValue(); + if (!this->hasZero()) { ++m_size; this->setHasZero(); - it.ptr->setHash(hash_value); + this->zeroValue()->setHash(hash_value); inserted = true; } else @@ -669,9 +771,9 @@ protected: template void ALWAYS_INLINE emplaceNonZeroImpl(size_t place_value, KeyHolder && key_holder, - iterator & it, bool & inserted, size_t hash_value) + LookupResult & it, bool & inserted, size_t hash_value) { - it = iterator(this, &buf[place_value]); + it = &buf[place_value]; if (!buf[place_value].isZero(*this)) { @@ -705,13 +807,16 @@ protected: throw; } - it = find(keyHolderGetKey(key_holder), hash_value); + // The hash table was rehashed, so we have to re-find the key. + size_t new_place = findCell(key, hash_value, grower.place(hash_value)); + assert(!buf[new_place].isZero(*this)); + it = &buf[new_place]; } } /// Only for non-zero keys. Find the right place, insert the key there, if it does not already exist. Set iterator to the cell in output parameter. template - void ALWAYS_INLINE emplaceNonZero(KeyHolder && key_holder, iterator & it, + void ALWAYS_INLINE emplaceNonZero(KeyHolder && key_holder, LookupResult & it, bool & inserted, size_t hash_value) { const auto & key = keyHolderGetKey(key_holder); @@ -722,9 +827,9 @@ protected: public: /// Insert a value. In the case of any more complex values, it is better to use the `emplace` function. - std::pair ALWAYS_INLINE insert(const value_type & x) + std::pair ALWAYS_INLINE insert(const value_type & x) { - std::pair res; + std::pair res; size_t hash_value = hash(Cell::getKey(x)); if (!emplaceIfZero(Cell::getKey(x), res.first, res.second, hash_value)) @@ -733,7 +838,7 @@ public: } if (res.second) - res.first.ptr->setMapped(x); + insertSetMapped(lookupResultGetMapped(res.first), x); return res; } @@ -746,9 +851,10 @@ public: } - /** Insert the key, - * return an iterator to a position that can be used for `placement new` of value, - * as well as the flag - whether a new key was inserted. + /** Insert the key. + * Return values: + * 'it' -- a LookupResult pointing to the corresponding key/mapped pair. + * 'inserted' -- whether a new key was inserted. * * You have to make `placement new` of value if you inserted a new key, * since when destroying a hash table, it will call the destructor! @@ -762,14 +868,14 @@ public: * new(&it->second) Mapped(value); */ template - void ALWAYS_INLINE emplace(KeyHolder && key_holder, iterator & it, bool & inserted) + void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, bool & inserted) { const auto & key = keyHolderGetKey(key_holder); emplace(key_holder, it, inserted, hash(key)); } template - void ALWAYS_INLINE emplace(KeyHolder && key_holder, iterator & it, + void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, bool & inserted, size_t hash_value) { const auto & key = keyHolderGetKey(key_holder); @@ -789,48 +895,30 @@ public: resize(); } - iterator ALWAYS_INLINE find(Key x) + LookupResult ALWAYS_INLINE find(Key x) { if (Cell::isZero(x, *this)) - return this->hasZero() ? iteratorToZero() : end(); + return this->hasZero() ? this->zeroValue() : nullptr; size_t hash_value = hash(x); size_t place_value = findCell(x, hash_value, grower.place(hash_value)); - return !buf[place_value].isZero(*this) ? iterator(this, &buf[place_value]) : end(); + return !buf[place_value].isZero(*this) ? 
&buf[place_value] : nullptr; } + ConstLookupResult ALWAYS_INLINE find(Key x) const + { + return const_cast *>(this)->find(x); + } - const_iterator ALWAYS_INLINE find(Key x) const + LookupResult ALWAYS_INLINE find(Key x, size_t hash_value) { if (Cell::isZero(x, *this)) - return this->hasZero() ? iteratorToZero() : end(); - - size_t hash_value = hash(x); - size_t place_value = findCell(x, hash_value, grower.place(hash_value)); - return !buf[place_value].isZero(*this) ? const_iterator(this, &buf[place_value]) : end(); - } - - - iterator ALWAYS_INLINE find(Key x, size_t hash_value) - { - if (Cell::isZero(x, *this)) - return this->hasZero() ? iteratorToZero() : end(); + return this->hasZero() ? this->zeroValue() : nullptr; size_t place_value = findCell(x, hash_value, grower.place(hash_value)); - return !buf[place_value].isZero(*this) ? iterator(this, &buf[place_value]) : end(); + return !buf[place_value].isZero(*this) ? &buf[place_value] : nullptr; } - - const_iterator ALWAYS_INLINE find(Key x, size_t hash_value) const - { - if (Cell::isZero(x, *this)) - return this->hasZero() ? iteratorToZero() : end(); - - size_t place_value = findCell(x, hash_value, grower.place(hash_value)); - return !buf[place_value].isZero(*this) ? const_iterator(this, &buf[place_value]) : end(); - } - - bool ALWAYS_INLINE has(Key x) const { if (Cell::isZero(x, *this)) diff --git a/dbms/src/Common/HashTable/TwoLevelHashMap.h b/dbms/src/Common/HashTable/TwoLevelHashMap.h index cd08de702d4..f90cb6d2306 100644 --- a/dbms/src/Common/HashTable/TwoLevelHashMap.h +++ b/dbms/src/Common/HashTable/TwoLevelHashMap.h @@ -20,6 +20,9 @@ public: using mapped_type = typename Cell::Mapped; using value_type = typename Cell::value_type; + using Impl = ImplTable; + using LookupResult = typename Impl::LookupResult; + using TwoLevelHashTable>::TwoLevelHashTable; template @@ -31,14 +34,14 @@ public: mapped_type & ALWAYS_INLINE operator[](Key x) { - typename TwoLevelHashMapTable::iterator it; + typename TwoLevelHashMapTable::LookupResult it; bool inserted; this->emplace(x, it, inserted); if (inserted) - new(&it->getSecond()) mapped_type(); + new(lookupResultGetMapped(it)) mapped_type(); - return it->getSecond(); + return *lookupResultGetMapped(it); } }; diff --git a/dbms/src/Common/HashTable/TwoLevelHashTable.h b/dbms/src/Common/HashTable/TwoLevelHashTable.h index 19f7954038c..988fa139caa 100644 --- a/dbms/src/Common/HashTable/TwoLevelHashTable.h +++ b/dbms/src/Common/HashTable/TwoLevelHashTable.h @@ -84,6 +84,9 @@ public: using key_type = typename Impl::key_type; using value_type = typename Impl::value_type; + using LookupResult = typename Impl::LookupResult; + using ConstLookupResult = typename Impl::ConstLookupResult; + Impl impls[NUM_BUCKETS]; @@ -206,15 +209,15 @@ public: /// Insert a value. In the case of any more complex values, it is better to use the `emplace` function. 
- std::pair ALWAYS_INLINE insert(const value_type & x) + std::pair ALWAYS_INLINE insert(const value_type & x) { size_t hash_value = hash(Cell::getKey(x)); - std::pair res; + std::pair res; emplace(Cell::getKey(x), res.first, res.second, hash_value); if (res.second) - res.first.getPtr()->setMapped(x); + insertSetMapped(lookupResultGetMapped(res.first), x); return res; } @@ -236,7 +239,7 @@ public: * new(&it->second) Mapped(value); */ template - void ALWAYS_INLINE emplace(KeyHolder && key_holder, iterator & it, bool & inserted) + void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, bool & inserted) { size_t hash_value = hash(keyHolderGetKey(key_holder)); emplace(key_holder, it, inserted, hash_value); @@ -245,40 +248,27 @@ public: /// Same, but with a precalculated values of hash function. template - void ALWAYS_INLINE emplace(KeyHolder && key_holder, iterator & it, + void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, bool & inserted, size_t hash_value) { size_t buck = getBucketFromHash(hash_value); - typename Impl::iterator impl_it; - impls[buck].emplace(key_holder, impl_it, inserted, hash_value); - it = iterator(this, buck, impl_it); + impls[buck].emplace(key_holder, it, inserted, hash_value); } - - iterator ALWAYS_INLINE find(Key x, size_t hash_value) + LookupResult ALWAYS_INLINE find(Key x, size_t hash_value) { size_t buck = getBucketFromHash(hash_value); - - typename Impl::iterator found = impls[buck].find(x, hash_value); - return found != impls[buck].end() - ? iterator(this, buck, found) - : end(); + return impls[buck].find(x, hash_value); } - - const_iterator ALWAYS_INLINE find(Key x, size_t hash_value) const + ConstLookupResult ALWAYS_INLINE find(Key x, size_t hash_value) const { - size_t buck = getBucketFromHash(hash_value); - - typename Impl::const_iterator found = impls[buck].find(x, hash_value); - return found != impls[buck].end() - ? 
const_iterator(this, buck, found) - : end(); + return const_cast *>(this)->find(x, hash_value); } + LookupResult ALWAYS_INLINE find(Key x) { return find(x, hash(x)); } - iterator ALWAYS_INLINE find(Key x) { return find(x, hash(x)); } - const_iterator ALWAYS_INLINE find(Key x) const { return find(x, hash(x)); } + ConstLookupResult ALWAYS_INLINE find(Key x) const { return find(x, hash(x)); } void write(DB::WriteBuffer & wb) const diff --git a/dbms/src/Common/SpaceSaving.h b/dbms/src/Common/SpaceSaving.h index da7e9293723..93ddfee6b19 100644 --- a/dbms/src/Common/SpaceSaving.h +++ b/dbms/src/Common/SpaceSaving.h @@ -366,10 +366,10 @@ private: Counter * findCounter(const TKey & key, size_t hash) { auto it = counter_map.find(key, hash); - if (it == counter_map.end()) + if (!it) return nullptr; - return it->getSecond(); + return *lookupResultGetMapped(it); } void rebuildCounterMap() diff --git a/dbms/src/Common/tests/auto_array.cpp b/dbms/src/Common/tests/auto_array.cpp index 11a4b79aa3a..fd23afc0236 100644 --- a/dbms/src/Common/tests/auto_array.cpp +++ b/dbms/src/Common/tests/auto_array.cpp @@ -149,16 +149,16 @@ int main(int argc, char ** argv) Map map; for (size_t i = 0; i < map_size; ++i) { - Map::iterator it; + Map::LookupResult it; bool inserted; map.emplace(rand(), it, inserted); if (inserted) { - new(&it->getSecond()) Arr(n); + new(lookupResultGetMapped(it)) Arr(n); for (size_t j = 0; j < n; ++j) - it->getSecond()[j] = field; + (*lookupResultGetMapped(it))[j] = field; } } diff --git a/dbms/src/Common/tests/hash_table.cpp b/dbms/src/Common/tests/hash_table.cpp index 291b7e7167e..b2464c7cbd3 100644 --- a/dbms/src/Common/tests/hash_table.cpp +++ b/dbms/src/Common/tests/hash_table.cpp @@ -17,14 +17,14 @@ int main(int, char **) cont.insert(1); cont.insert(2); - Cont::iterator it; + Cont::LookupResult it; bool inserted; + int key = 3; + cont.emplace(key, it, inserted); + std::cerr << inserted << ", " << key << std::endl; - cont.emplace(3, it, inserted); - std::cerr << inserted << ", " << it->getValue() << std::endl; - - cont.emplace(3, it, inserted); - std::cerr << inserted << ", " << it->getValue() << std::endl; + cont.emplace(key, it, inserted); + std::cerr << inserted << ", " << key << std::endl; for (auto x : cont) std::cerr << x.getValue() << std::endl; diff --git a/dbms/src/Common/tests/parallel_aggregation.cpp b/dbms/src/Common/tests/parallel_aggregation.cpp index 36bbe6e66d5..4b3cc3006d7 100644 --- a/dbms/src/Common/tests/parallel_aggregation.cpp +++ b/dbms/src/Common/tests/parallel_aggregation.cpp @@ -76,20 +76,20 @@ void aggregate1(Map & map, Source::const_iterator begin, Source::const_iterator void aggregate12(Map & map, Source::const_iterator begin, Source::const_iterator end) { - Map::iterator found; + Map::LookupResult found = nullptr; auto prev_it = end; for (auto it = begin; it != end; ++it) { - if (*it == *prev_it) + if (prev_it != end && *it == *prev_it) { - ++found->getSecond(); + ++*lookupResultGetMapped(found); continue; } prev_it = it; bool inserted; map.emplace(*it, found, inserted); - ++found->getSecond(); + ++*lookupResultGetMapped(found); } } @@ -101,20 +101,20 @@ void aggregate2(MapTwoLevel & map, Source::const_iterator begin, Source::const_i void aggregate22(MapTwoLevel & map, Source::const_iterator begin, Source::const_iterator end) { - MapTwoLevel::iterator found; + MapTwoLevel::LookupResult found = nullptr; auto prev_it = end; for (auto it = begin; it != end; ++it) { if (*it == *prev_it) { - ++found->getSecond(); + ++*lookupResultGetMapped(found); continue; } 
prev_it = it; bool inserted; map.emplace(*it, found, inserted); - ++found->getSecond(); + ++*lookupResultGetMapped(found); } } @@ -135,10 +135,10 @@ void aggregate3(Map & local_map, Map & global_map, Mutex & mutex, Source::const_ for (auto it = begin; it != end; ++it) { - Map::iterator found = local_map.find(*it); + auto found = local_map.find(*it); - if (found != local_map.end()) - ++found->getSecond(); + if (found) + ++*lookupResultGetMapped(found); else if (local_map.size() < threshold) ++local_map[*it]; /// TODO You could do one lookup, not two. else @@ -160,10 +160,10 @@ void aggregate33(Map & local_map, Map & global_map, Mutex & mutex, Source::const for (auto it = begin; it != end; ++it) { - Map::iterator found; + Map::LookupResult found; bool inserted; local_map.emplace(*it, found, inserted); - ++found->getSecond(); + ++*lookupResultGetMapped(found); if (inserted && local_map.size() == threshold) { @@ -195,10 +195,10 @@ void aggregate4(Map & local_map, MapTwoLevel & global_map, Mutex * mutexes, Sour { for (; it != block_end; ++it) { - Map::iterator found = local_map.find(*it); + auto found = local_map.find(*it); - if (found != local_map.end()) - ++found->getSecond(); + if (found) + ++*lookupResultGetMapped(found); else { size_t hash_value = global_map.hash(*it); diff --git a/dbms/src/Common/tests/parallel_aggregation2.cpp b/dbms/src/Common/tests/parallel_aggregation2.cpp index a2b26e82420..7df230c5651 100644 --- a/dbms/src/Common/tests/parallel_aggregation2.cpp +++ b/dbms/src/Common/tests/parallel_aggregation2.cpp @@ -46,14 +46,14 @@ struct AggregateIndependent { for (auto it = begin; it != end; ++it) { - typename Map::iterator place; + typename Map::LookupResult place; bool inserted; map.emplace(*it, place, inserted); if (inserted) - creator(place->getSecond()); + creator(*lookupResultGetMapped(place)); else - updater(place->getSecond()); + updater(*lookupResultGetMapped(place)); } }); } @@ -87,13 +87,13 @@ struct AggregateIndependentWithSequentialKeysOptimization pool.schedule([&, begin, end]() { - typename Map::iterator place; + typename Map::LookupResult place = nullptr; Key prev_key {}; for (auto it = begin; it != end; ++it) { if (it != begin && *it == prev_key) { - updater(place->getSecond()); + updater(*lookupResultGetMapped(place)); continue; } prev_key = *it; @@ -102,9 +102,9 @@ struct AggregateIndependentWithSequentialKeysOptimization map.emplace(*it, place, inserted); if (inserted) - creator(place->getSecond()); + creator(*lookupResultGetMapped(place)); else - updater(place->getSecond()); + updater(*lookupResultGetMapped(place)); } }); } diff --git a/dbms/src/Core/tests/string_pool.cpp b/dbms/src/Core/tests/string_pool.cpp index 4f792860029..2db1233e8fe 100644 --- a/dbms/src/Core/tests/string_pool.cpp +++ b/dbms/src/Core/tests/string_pool.cpp @@ -209,9 +209,9 @@ int main(int argc, char ** argv) for (Vec::iterator it = vec.begin(); it != vec.end(); ++it) { - RefsHashMap::iterator inserted_it; + RefsHashMap::LookupResult inserted_it; bool inserted; - set.emplace(StringRef(*it), inserted_it, inserted); + set.emplace(StringRef(*lookupResultGetMapped(it)), inserted_it, inserted); } std::cerr << "Inserted refs into HashMap in " << watch.elapsedSeconds() << " sec, " @@ -236,7 +236,7 @@ int main(int argc, char ** argv) for (Vec::iterator it = vec.begin(); it != vec.end(); ++it) { - RefsHashMap::iterator inserted_it; + RefsHashMap::LookupResult inserted_it; bool inserted; set.emplace(StringRef(pool.insert(it->data(), it->size()), it->size()), inserted_it, inserted); } diff --git 
a/dbms/src/DataTypes/DataTypeEnum.cpp b/dbms/src/DataTypes/DataTypeEnum.cpp index add7052195a..d933b9a61d6 100644 --- a/dbms/src/DataTypes/DataTypeEnum.cpp +++ b/dbms/src/DataTypes/DataTypeEnum.cpp @@ -70,20 +70,20 @@ void DataTypeEnum::fillMaps() { for (const auto & name_and_value : values) { - const auto name_to_value_pair = name_to_value_map.insert( + const auto inserted_value = name_to_value_map.insert( { StringRef{name_and_value.first}, name_and_value.second }); - if (!name_to_value_pair.second) + if (!inserted_value.second) throw Exception{"Duplicate names in enum: '" + name_and_value.first + "' = " + toString(name_and_value.second) - + " and '" + name_to_value_pair.first->getFirst().toString() + "' = " + toString(name_to_value_pair.first->getSecond()), + + " and " + toString(*lookupResultGetMapped(inserted_value.first)), ErrorCodes::SYNTAX_ERROR}; - const auto value_to_name_pair = value_to_name_map.insert( + const auto inserted_name = value_to_name_map.insert( { name_and_value.second, StringRef{name_and_value.first} }); - if (!value_to_name_pair.second) + if (!inserted_name.second) throw Exception{"Duplicate values in enum: '" + name_and_value.first + "' = " + toString(name_and_value.second) - + " and '" + value_to_name_pair.first->second.toString() + "' = " + toString(value_to_name_pair.first->first), + + " and '" + toString((*inserted_name.first).first) + "'", ErrorCodes::SYNTAX_ERROR}; } } diff --git a/dbms/src/DataTypes/DataTypeEnum.h b/dbms/src/DataTypes/DataTypeEnum.h index 0fd9a898fb0..ac96c08dc75 100644 --- a/dbms/src/DataTypes/DataTypeEnum.h +++ b/dbms/src/DataTypes/DataTypeEnum.h @@ -78,10 +78,10 @@ public: FieldType getValue(StringRef field_name) const { const auto it = name_to_value_map.find(field_name); - if (it == std::end(name_to_value_map)) + if (!it) throw Exception{"Unknown element '" + field_name.toString() + "' for type " + getName(), ErrorCodes::LOGICAL_ERROR}; - return it->getSecond(); + return *lookupResultGetMapped(it); } Field castToName(const Field & value_or_name) const override; diff --git a/dbms/src/Dictionaries/ComplexKeyCacheDictionary.h b/dbms/src/Dictionaries/ComplexKeyCacheDictionary.h index 7c2ba75ba17..6837bd9eab4 100644 --- a/dbms/src/Dictionaries/ComplexKeyCacheDictionary.h +++ b/dbms/src/Dictionaries/ComplexKeyCacheDictionary.h @@ -469,7 +469,7 @@ private: { const StringRef key = keys_array[row]; const auto it = map.find(key); - const auto string_ref = it != std::end(map) ? it->getSecond() : get_default(row); + const auto string_ref = it ? *lookupResultGetMapped(it) : get_default(row); out->insertData(string_ref.data, string_ref.size); } } diff --git a/dbms/src/Dictionaries/ComplexKeyHashedDictionary.cpp b/dbms/src/Dictionaries/ComplexKeyHashedDictionary.cpp index 586fc5e89f9..5e08ce3295e 100644 --- a/dbms/src/Dictionaries/ComplexKeyHashedDictionary.cpp +++ b/dbms/src/Dictionaries/ComplexKeyHashedDictionary.cpp @@ -357,7 +357,7 @@ void ComplexKeyHashedDictionary::updateData() { const auto s_key = placeKeysInPool(i, saved_key_column_ptrs, keys, temp_key_pool); auto it = update_key_hash.find(s_key); - if (it != std::end(update_key_hash)) + if (it) filter[i] = 0; else filter[i] = 1; @@ -561,7 +561,7 @@ void ComplexKeyHashedDictionary::getItemsImpl( const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool); const auto it = attr.find(key); - set_value(i, it != attr.end() ? static_cast(it->getSecond()) : get_default(i)); + set_value(i, it ? 
static_cast(*lookupResultGetMapped(it)) : get_default(i)); /// free memory allocated for the key temporary_keys_pool.rollback(key.size); @@ -672,7 +672,7 @@ void ComplexKeyHashedDictionary::has(const Attribute & attribute, const Columns const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool); const auto it = attr.find(key); - out[i] = it != attr.end(); + out[i] = static_cast(it); /// free memory allocated for the key temporary_keys_pool.rollback(key.size); diff --git a/dbms/src/Dictionaries/HashedDictionary.cpp b/dbms/src/Dictionaries/HashedDictionary.cpp index 7946c87dff8..7fe5dd7abf2 100644 --- a/dbms/src/Dictionaries/HashedDictionary.cpp +++ b/dbms/src/Dictionaries/HashedDictionary.cpp @@ -696,7 +696,7 @@ void HashedDictionary::has(const Attribute & attribute, const PaddedPODArraygetSecond(); + const auto & ranges_and_values = *lookupResultGetMapped(it); const auto val_it = std::find_if(std::begin(ranges_and_values), std::end(ranges_and_values), [date](const Value & v) { @@ -395,10 +395,10 @@ void RangeHashedDictionary::getItemsImpl( for (const auto i : ext::range(0, ids.size())) { const auto it = attr.find(ids[i]); - if (it != std::end(attr)) + if (it) { const auto date = dates[i]; - const auto & ranges_and_values = it->getSecond(); + const auto & ranges_and_values = *lookupResultGetMapped(it); const auto val_it = std::find_if(std::begin(ranges_and_values), std::end(ranges_and_values), [date](const Value & v) { @@ -423,9 +423,9 @@ void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const K auto & map = *std::get>(attribute.maps); const auto it = map.find(id); - if (it != map.end()) + if (it) { - auto & values = it->getSecond(); + auto & values = *lookupResultGetMapped(it); const auto insert_it = std::lower_bound(std::begin(values), std::end(values), range, [](const Value & lhs, const Range & rhs_range) @@ -496,9 +496,9 @@ void RangeHashedDictionary::setAttributeValue(Attribute & attribute, const Key i const auto it = map.find(id); - if (it != map.end()) + if (it) { - auto & values = it->getSecond(); + auto & values = *lookupResultGetMapped(it); const auto insert_it = std::lower_bound( std::begin(values), std::end(values), range, [](const Value & lhs, const Range & rhs_range) diff --git a/dbms/src/Functions/addressToLine.cpp b/dbms/src/Functions/addressToLine.cpp index 498ab6e7a12..cd7e374c27b 100644 --- a/dbms/src/Functions/addressToLine.cpp +++ b/dbms/src/Functions/addressToLine.cpp @@ -135,13 +135,13 @@ private: StringRef implCached(uintptr_t addr) { - Map::iterator it; + Map::LookupResult it; bool inserted; std::lock_guard lock(mutex); map.emplace(addr, it, inserted); if (inserted) - it->getSecond() = impl(addr); - return it->getSecond(); + *lookupResultGetMapped(it) = impl(addr); + return *lookupResultGetMapped(it); } }; diff --git a/dbms/src/Functions/array/arrayDistinct.cpp b/dbms/src/Functions/array/arrayDistinct.cpp index aa4e8aec4f1..4bcd5100b26 100644 --- a/dbms/src/Functions/array/arrayDistinct.cpp +++ b/dbms/src/Functions/array/arrayDistinct.cpp @@ -173,7 +173,7 @@ bool FunctionArrayDistinct::executeNumber( if (nullable_col && (*src_null_map)[j]) continue; - if (set.find(values[j]) == set.end()) + if (!set.find(values[j])) { res_data.emplace_back(values[j]); set.insert(values[j]); @@ -229,7 +229,7 @@ bool FunctionArrayDistinct::executeString( StringRef str_ref = src_data_concrete->getDataAt(j); - if (set.find(str_ref) == set.end()) + if (!set.find(str_ref)) { set.insert(str_ref); res_data_column_string.insertData(str_ref.data, 
str_ref.size); @@ -279,7 +279,7 @@ void FunctionArrayDistinct::executeHashed( src_data.updateHashWithValue(j, hash_function); hash_function.get128(reinterpret_cast(&hash)); - if (set.find(hash) == set.end()) + if (!set.find(hash)) { set.insert(hash); res_data_col.insertFrom(src_data, j); diff --git a/dbms/src/Functions/transform.cpp b/dbms/src/Functions/transform.cpp index bac5adbb7b9..df3daa62f95 100644 --- a/dbms/src/Functions/transform.cpp +++ b/dbms/src/Functions/transform.cpp @@ -507,8 +507,8 @@ private: for (size_t i = 0; i < size; ++i) { auto it = table.find(src[i]); - if (it != table.end()) - memcpy(&dst[i], &it->getSecond(), sizeof(dst[i])); /// little endian. + if (it) + memcpy(&dst[i], lookupResultGetMapped(it), sizeof(dst[i])); /// little endian. else dst[i] = dst_default; } @@ -523,8 +523,8 @@ private: for (size_t i = 0; i < size; ++i) { auto it = table.find(src[i]); - if (it != table.end()) - memcpy(&dst[i], &it->getSecond(), sizeof(dst[i])); /// little endian. + if (it) + memcpy(&dst[i], lookupResultGetMapped(it), sizeof(dst[i])); /// little endian. else dst[i] = dst_default[i]; } @@ -539,8 +539,8 @@ private: for (size_t i = 0; i < size; ++i) { auto it = table.find(src[i]); - if (it != table.end()) - memcpy(&dst[i], &it->getSecond(), sizeof(dst[i])); + if (it) + memcpy(&dst[i], lookupResultGetMapped(it), sizeof(dst[i])); else dst[i] = src[i]; } @@ -557,7 +557,7 @@ private: for (size_t i = 0; i < size; ++i) { auto it = table.find(src[i]); - StringRef ref = it != table.end() ? it->getSecond() : dst_default; + StringRef ref = it ? *lookupResultGetMapped(it) : dst_default; dst_data.resize(current_dst_offset + ref.size); memcpy(&dst_data[current_dst_offset], ref.data, ref.size); current_dst_offset += ref.size; @@ -580,8 +580,8 @@ private: auto it = table.find(src[i]); StringRef ref; - if (it != table.end()) - ref = it->getSecond(); + if (it) + ref = *lookupResultGetMapped(it); else { ref.data = reinterpret_cast(&dst_default_data[current_dst_default_offset]); @@ -610,8 +610,8 @@ private: StringRef ref{&src_data[current_src_offset], src_offsets[i] - current_src_offset}; current_src_offset = src_offsets[i]; auto it = table.find(ref); - if (it != table.end()) - memcpy(&dst[i], &it->getSecond(), sizeof(dst[i])); + if (it) + memcpy(&dst[i], lookupResultGetMapped(it), sizeof(dst[i])); else dst[i] = dst_default; } @@ -631,8 +631,8 @@ private: StringRef ref{&src_data[current_src_offset], src_offsets[i] - current_src_offset}; current_src_offset = src_offsets[i]; auto it = table.find(ref); - if (it != table.end()) - memcpy(&dst[i], &it->getSecond(), sizeof(dst[i])); + if (it) + memcpy(&dst[i], lookupResultGetMapped(it), sizeof(dst[i])); else dst[i] = dst_default[i]; } @@ -655,7 +655,7 @@ private: auto it = table.find(src_ref); - StringRef dst_ref = it != table.end() ? it->getSecond() : (with_default ? dst_default : src_ref); + StringRef dst_ref = it ? *lookupResultGetMapped(it) : (with_default ? 
dst_default : src_ref); dst_data.resize(current_dst_offset + dst_ref.size); memcpy(&dst_data[current_dst_offset], dst_ref.data, dst_ref.size); current_dst_offset += dst_ref.size; @@ -696,8 +696,8 @@ private: auto it = table.find(src_ref); StringRef dst_ref; - if (it != table.end()) - dst_ref = it->getSecond(); + if (it) + dst_ref = *lookupResultGetMapped(it); else { dst_ref.data = reinterpret_cast(&dst_default_data[current_dst_default_offset]); diff --git a/dbms/src/Interpreters/Aggregator.cpp b/dbms/src/Interpreters/Aggregator.cpp index 91941791afc..6b83246b83a 100644 --- a/dbms/src/Interpreters/Aggregator.cpp +++ b/dbms/src/Interpreters/Aggregator.cpp @@ -486,6 +486,7 @@ void NO_INLINE Aggregator::executeImplBatch( aggregate_data = emplace_result.getMapped(); places[i] = aggregate_data; + assert(places[i] != nullptr); } /// Add values to the aggregate functions. diff --git a/dbms/src/Interpreters/SetVariants.h b/dbms/src/Interpreters/SetVariants.h index 2d84e80156c..bdb5e212751 100644 --- a/dbms/src/Interpreters/SetVariants.h +++ b/dbms/src/Interpreters/SetVariants.h @@ -22,7 +22,7 @@ namespace DB /// For the case where there is one numeric key. -template /// UInt8/16/32/64 for any types with corresponding bit width. +template /// UInt8/16/32/64 for any types with corresponding bit width. struct SetMethodOneNumber { using Data = TData; @@ -30,7 +30,8 @@ struct SetMethodOneNumber Data data; - using State = ColumnsHashing::HashMethodOneNumber; + using State = ColumnsHashing::HashMethodOneNumber; }; /// For the case where there is one string key. @@ -183,8 +184,12 @@ struct SetMethodHashed */ struct NonClearableSet { - std::unique_ptr>> key8; - std::unique_ptr>> key16; + /* + * As in Aggregator, using consecutive keys cache doesn't improve performance + * for FixedHashTables. + */ + std::unique_ptr, false /* use_cache */>> key8; + std::unique_ptr, false /* use_cache */>> key16; /** Also for the experiment was tested the ability to use SmallSet, * as long as the number of elements in the set is small (and, if necessary, converted to a full-fledged HashSet). 
@@ -209,8 +214,8 @@ struct NonClearableSet struct ClearableSet { - std::unique_ptr>> key8; - std::unique_ptr>> key16; + std::unique_ptr, false /* use_cache */>> key8; + std::unique_ptr, false /*use_cache */>> key16; std::unique_ptr>>> key32; std::unique_ptr>>> key64; diff --git a/dbms/src/Interpreters/tests/hash_map.cpp b/dbms/src/Interpreters/tests/hash_map.cpp index 0bbabab8632..910bf2c0649 100644 --- a/dbms/src/Interpreters/tests/hash_map.cpp +++ b/dbms/src/Interpreters/tests/hash_map.cpp @@ -154,7 +154,7 @@ int main(int argc, char ** argv) Stopwatch watch; HashMap map; - HashMap::iterator it; + HashMap::LookupResult it; bool inserted; for (size_t i = 0; i < n; ++i) @@ -162,8 +162,8 @@ int main(int argc, char ** argv) map.emplace(data[i], it, inserted); if (inserted) { - new(&it->getSecond()) Value; - std::swap(it->getSecond(), value); + new(lookupResultGetMapped(it)) Value; + std::swap(*lookupResultGetMapped(it), value); INIT } } @@ -185,7 +185,7 @@ int main(int argc, char ** argv) using Map = HashMap; Map map; - Map::iterator it; + Map::LookupResult it; bool inserted; for (size_t i = 0; i < n; ++i) @@ -193,8 +193,8 @@ int main(int argc, char ** argv) map.emplace(data[i], it, inserted); if (inserted) { - new(&it->getSecond()) Value; - std::swap(it->getSecond(), value); + new(lookupResultGetMapped(it)) Value; + std::swap(*lookupResultGetMapped(it), value); INIT } } @@ -217,7 +217,7 @@ int main(int argc, char ** argv) using Map = HashMap; Map map; - Map::iterator it; + Map::LookupResult it; bool inserted; for (size_t i = 0; i < n; ++i) @@ -225,8 +225,8 @@ int main(int argc, char ** argv) map.emplace(data[i], it, inserted); if (inserted) { - new(&it->getSecond()) Value; - std::swap(it->getSecond(), value); + new(lookupResultGetMapped(it)) Value; + std::swap(*lookupResultGetMapped(it), value); INIT } } diff --git a/dbms/src/Interpreters/tests/hash_map_lookup.cpp b/dbms/src/Interpreters/tests/hash_map_lookup.cpp index 1aceec7b18f..b34c23e6c41 100644 --- a/dbms/src/Interpreters/tests/hash_map_lookup.cpp +++ b/dbms/src/Interpreters/tests/hash_map_lookup.cpp @@ -46,23 +46,24 @@ template void NO_INLINE bench(const std::vector & data, const char * name) { Map map; - typename Map::iterator it; - bool inserted; Stopwatch watch; for (size_t i = 0, size = data.size(); i < size; ++i) { + typename Map::LookupResult it; + bool inserted; + map.emplace(data[i], it, inserted); if (inserted) - it->getSecond() = 1; + *lookupResultGetMapped(it) = 1; else - ++it->getSecond(); + ++*lookupResultGetMapped(it); } for (size_t i = 0, size = data.size(); i < size; ++i) { - it = map.find(data[i]); - ++it->getSecond(); + auto it = map.find(data[i]); + ++*lookupResultGetMapped(it); } watch.stop(); std::cerr << std::fixed << std::setprecision(2) << "HashMap (" << name << "). 
Size: " << map.size() @@ -77,13 +78,13 @@ template void insert(Map & map, StringRef & k) { bool inserted; - typename Map::iterator it; + typename Map::LookupResult it; map.emplace(k, it, inserted, nullptr); if (inserted) - *it = 1; + *lookupResultGetMapped(it) = 1; else - ++*it; - std::cout << *map.find(k) << std::endl; + ++*lookupResultGetMapped(it); + std::cout << *lookupResultGetMapped(map.find(k))<< std::endl; } int main(int argc, char ** argv) diff --git a/dbms/src/Interpreters/tests/hash_map_string.cpp b/dbms/src/Interpreters/tests/hash_map_string.cpp index cad701b49ca..61980a614ab 100644 --- a/dbms/src/Interpreters/tests/hash_map_string.cpp +++ b/dbms/src/Interpreters/tests/hash_map_string.cpp @@ -330,15 +330,15 @@ int main(int argc, char ** argv) using Map = HashMapWithSavedHash, Grower>; Map map; - Map::iterator it; + Map::LookupResult it; bool inserted; for (size_t i = 0; i < n; ++i) { map.emplace(data[i], it, inserted); if (inserted) - it->getSecond() = 0; - ++it->getSecond(); + *lookupResultGetMapped(it) = 0; + ++*lookupResultGetMapped(it); } watch.stop(); @@ -359,15 +359,15 @@ int main(int argc, char ** argv) using Map = HashMapWithSavedHash; Map map; - Map::iterator it; + Map::LookupResult it; bool inserted; for (size_t i = 0; i < n; ++i) { map.emplace(data[i], it, inserted); if (inserted) - it->getSecond() = 0; - ++it->getSecond(); + *lookupResultGetMapped(it) = 0; + ++*lookupResultGetMapped(it); } watch.stop(); @@ -389,15 +389,15 @@ int main(int argc, char ** argv) using Map = HashMapWithSavedHash; Map map; - Map::iterator it; + Map::LookupResult it; bool inserted; for (size_t i = 0; i < n; ++i) { map.emplace(data[i], it, inserted); if (inserted) - it->getSecond() = 0; - ++it->getSecond(); + *lookupResultGetMapped(it) = 0; + ++*lookupResultGetMapped(it); } watch.stop(); @@ -419,15 +419,15 @@ int main(int argc, char ** argv) using Map = HashMapWithSavedHash; Map map; - Map::iterator it; + Map::LookupResult it; bool inserted; for (size_t i = 0; i < n; ++i) { map.emplace(data[i], it, inserted); if (inserted) - it->getSecond() = 0; - ++it->getSecond(); + *lookupResultGetMapped(it) = 0; + ++*lookupResultGetMapped(it); } watch.stop(); diff --git a/dbms/src/Interpreters/tests/hash_map_string_2.cpp b/dbms/src/Interpreters/tests/hash_map_string_2.cpp index da9619c638d..66a087d6824 100644 --- a/dbms/src/Interpreters/tests/hash_map_string_2.cpp +++ b/dbms/src/Interpreters/tests/hash_map_string_2.cpp @@ -588,15 +588,15 @@ void NO_INLINE bench(const std::vector & data, const char * name) using Map = HashMapWithSavedHash>; Map map; - typename Map::iterator it; + typename Map::LookupResult it; bool inserted; for (size_t i = 0, size = data.size(); i < size; ++i) { map.emplace(static_cast(data[i]), it, inserted); if (inserted) - it->getSecond() = 0; - ++it->getSecond(); + *lookupResultGetMapped(it) = 0; + ++*lookupResultGetMapped(it); } watch.stop(); diff --git a/dbms/src/Interpreters/tests/hash_map_string_3.cpp b/dbms/src/Interpreters/tests/hash_map_string_3.cpp index 2309a29c531..7bd27bc6785 100644 --- a/dbms/src/Interpreters/tests/hash_map_string_3.cpp +++ b/dbms/src/Interpreters/tests/hash_map_string_3.cpp @@ -435,15 +435,15 @@ void NO_INLINE bench(const std::vector & data, const char * name) using Map = HashMapWithSavedHash; Map map; - typename Map::iterator it; + typename Map::LookupResult it; bool inserted; for (size_t i = 0, size = data.size(); i < size; ++i) { map.emplace(static_cast(data[i]), it, inserted); if (inserted) - it->getSecond() = 0; - ++it->getSecond(); + 
*lookupResultGetMapped(it) = 0; + ++*lookupResultGetMapped(it); } watch.stop(); diff --git a/dbms/src/Interpreters/tests/hash_map_string_small.cpp b/dbms/src/Interpreters/tests/hash_map_string_small.cpp index 730d88f0a12..529cef13c11 100644 --- a/dbms/src/Interpreters/tests/hash_map_string_small.cpp +++ b/dbms/src/Interpreters/tests/hash_map_string_small.cpp @@ -137,15 +137,15 @@ int main(int argc, char ** argv) using Map = HashMapWithSavedHash; Map map; - Map::iterator it; + Map::LookupResult it; bool inserted; for (size_t i = 0; i < n; ++i) { map.emplace(data[i], it, inserted); if (inserted) - it->getSecond() = 0; - ++it->getSecond(); + *lookupResultGetMapped(it) = 0; + ++*lookupResultGetMapped(it); } watch.stop(); @@ -166,15 +166,15 @@ int main(int argc, char ** argv) using Map = HashMapWithSavedHash; Map map; - Map::iterator it; + Map::LookupResult it; bool inserted; for (size_t i = 0; i < n; ++i) { map.emplace(SmallStringRef(data[i].data, data[i].size), it, inserted); if (inserted) - it->getSecond() = 0; - ++it->getSecond(); + *lookupResultGetMapped(it) = 0; + ++*lookupResultGetMapped(it); } watch.stop(); diff --git a/dbms/src/Interpreters/tests/two_level_hash_map.cpp b/dbms/src/Interpreters/tests/two_level_hash_map.cpp index 475475f9c7a..ed9df82d0b1 100644 --- a/dbms/src/Interpreters/tests/two_level_hash_map.cpp +++ b/dbms/src/Interpreters/tests/two_level_hash_map.cpp @@ -60,15 +60,15 @@ int main(int argc, char ** argv) using Map = TwoLevelHashTable>, DefaultHash, HashTableGrower<8>, HashTableAllocator>; Map map; - Map::iterator it; + Map::LookupResult it; bool inserted; for (size_t i = 0; i < n; ++i) { map.emplace(data[i], it, inserted); if (inserted) - it->getSecond() = 0; - ++it->getSecond(); + *lookupResultGetMapped(it) = 0; + ++*lookupResultGetMapped(it); } watch.stop(); @@ -96,15 +96,15 @@ int main(int argc, char ** argv) //using Map = HashMap; Map map; - Map::iterator it; + Map::LookupResult it; bool inserted; for (size_t i = 0; i < n; ++i) { map.emplace(i, it, inserted); if (inserted) - it->getSecond() = 0; - ++it->getSecond(); + *lookupResultGetMapped(it) = 0; + ++*lookupResultGetMapped(it); } watch.stop(); diff --git a/dbms/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index 9730ae3f3cc..20830d2eccf 100644 --- a/dbms/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -49,7 +49,7 @@ JSONEachRowRowInputFormat::JSONEachRowRowInputFormat( } } - prev_positions.assign(num_columns, name_map.end()); + prev_positions.resize(num_columns); } const String & JSONEachRowRowInputFormat::columnName(size_t i) const @@ -63,21 +63,21 @@ inline size_t JSONEachRowRowInputFormat::columnIndex(const StringRef & name, siz /// and a quick check to match the next expected field, instead of searching the hash table. 
if (prev_positions.size() > key_index - && prev_positions[key_index] != name_map.end() - && name == prev_positions[key_index]->getFirst()) + && prev_positions[key_index] + && name == *lookupResultGetKey(prev_positions[key_index])) { - return prev_positions[key_index]->getSecond(); + return *lookupResultGetMapped(prev_positions[key_index]); } else { const auto it = name_map.find(name); - if (name_map.end() != it) + if (it) { if (key_index < prev_positions.size()) prev_positions[key_index] = it; - return it->getSecond(); + return *lookupResultGetMapped(it); } else return UNKNOWN_FIELD; diff --git a/dbms/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h b/dbms/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h index 17711b5f27d..afa3c9f2ba1 100644 --- a/dbms/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h @@ -62,7 +62,7 @@ private: NameMap name_map; /// Cached search results for previous row (keyed as index in JSON object) - used as a hint. - std::vector prev_positions; + std::vector prev_positions; }; } diff --git a/dbms/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index 35a0b4b7a7c..8cf3702d3bf 100644 --- a/dbms/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -118,7 +118,7 @@ bool TSKVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ex /// and quickly checking for the next expected field, instead of searching the hash table. auto it = name_map.find(name_ref); - if (name_map.end() == it) + if (!it) { if (!format_settings.skip_unknown_fields) throw Exception("Unknown field found while parsing TSKV format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA); @@ -129,7 +129,7 @@ bool TSKVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ex } else { - index = it->getSecond(); + index = *lookupResultGetMapped(it); if (read_columns[index]) throw Exception("Duplicate field found while parsing TSKV format: " + name_ref.toString(), ErrorCodes::INCORRECT_DATA); diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataWriter.cpp index d7afe4098af..e29ae01b3b3 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -47,7 +47,7 @@ void buildScatterSelector( for (size_t i = 0; i < num_rows; ++i) { Data::key_type key = hash128(i, columns.size(), columns); - typename Data::iterator it; + typename Data::LookupResult it; bool inserted; partitions_map.emplace(key, it, inserted); @@ -57,7 +57,7 @@ void buildScatterSelector( throw Exception("Too many partitions for single INSERT block (more than " + toString(max_parts) + "). The limit is controlled by 'max_partitions_per_insert_block' setting. Large number of partitions is a common misconception. It will lead to severe negative performance impact, including slow server startup, slow INSERT queries and slow SELECT queries. Recommended total number of partitions for a table is under 1000..10000. Please note, that partitioning is not intended to speed up SELECT queries (ORDER BY key is sufficient to make range queries fast). 
Partitions are intended for data manipulation (DROP PARTITION, etc).", ErrorCodes::TOO_MANY_PARTS); partition_num_to_first_row.push_back(i); - it->getSecond() = partitions_count; + *lookupResultGetMapped(it) = partitions_count; ++partitions_count; @@ -70,7 +70,7 @@ void buildScatterSelector( } if (partitions_count > 1) - selector[i] = it->getSecond(); + selector[i] = *lookupResultGetMapped(it); } } From e197cc8a49d01f2d9d4bea7a901e5dfc30e72dc6 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 24 Sep 2019 17:25:22 +0300 Subject: [PATCH 242/309] read settings from file --- dbms/src/Core/Settings.h | 5 +- dbms/src/Formats/FormatFactory.cpp | 12 +- dbms/src/Formats/FormatSchemaInfo.cpp | 31 ++--- dbms/src/Formats/FormatSchemaInfo.h | 2 +- dbms/src/Formats/FormatSettings.h | 2 +- .../Formats/ParsedTemplateFormatString.cpp | 9 +- dbms/src/Formats/ParsedTemplateFormatString.h | 3 +- .../Formats/Impl/CapnProtoRowInputFormat.cpp | 3 +- .../Formats/Impl/ProtobufRowInputFormat.cpp | 4 +- .../Formats/Impl/ProtobufRowOutputFormat.cpp | 3 +- .../Impl/TemplateBlockOutputFormat.cpp | 106 ++++++++++-------- .../Formats/Impl/TemplateBlockOutputFormat.h | 20 ++-- .../Formats/Impl/TemplateRowInputFormat.cpp | 59 ++++++---- .../Formats/Impl/TemplateRowInputFormat.h | 5 +- .../00937_template_output_format.sh | 22 ++++ .../00937_template_output_format.sql | 12 -- .../00938_template_input_format.sh | 24 ++-- 17 files changed, 194 insertions(+), 128 deletions(-) create mode 100755 dbms/tests/queries/0_stateless/00937_template_output_format.sh delete mode 100644 dbms/tests/queries/0_stateless/00937_template_output_format.sql diff --git a/dbms/src/Core/Settings.h b/dbms/src/Core/Settings.h index cacaf883fb7..b2646749ee5 100644 --- a/dbms/src/Core/Settings.h +++ b/dbms/src/Core/Settings.h @@ -217,8 +217,9 @@ struct Settings : public SettingsCollection M(SettingMilliseconds, stream_flush_interval_ms, 7500, "Timeout for flushing data from streaming storages.") \ M(SettingMilliseconds, stream_poll_timeout_ms, 500, "Timeout for polling data from/to streaming storages.") \ M(SettingString, format_schema, "", "Schema identifier (used by schema-based formats)") \ - M(SettingString, format_schema_rows, "", "Row format string for Template format") \ - M(SettingString, format_schema_rows_between_delimiter, "\n", "Delimiter between rows for Template format") \ + M(SettingString, format_template_resultset, "", "Path to file which contains format string for result set (for Template format)") \ + M(SettingString, format_template_row, "", "Path to file which contains format string for rows (for Template format)") \ + M(SettingString, format_template_rows_between_delimiter, "\n", "Delimiter between rows (for Template format)") \ M(SettingBool, insert_allow_materialized_columns, 0, "If setting is enabled, Allow materialized columns in INSERT.") \ M(SettingSeconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.") \ M(SettingSeconds, http_send_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP send timeout") \ diff --git a/dbms/src/Formats/FormatFactory.cpp b/dbms/src/Formats/FormatFactory.cpp index b3253d3cbf1..4875729a8d5 100644 --- a/dbms/src/Formats/FormatFactory.cpp +++ b/dbms/src/Formats/FormatFactory.cpp @@ -48,9 +48,9 @@ static FormatSettings getInputFormatSetting(const Settings & settings) format_settings.date_time_input_format = settings.date_time_input_format; format_settings.input_allow_errors_num = settings.input_format_allow_errors_num; 
format_settings.input_allow_errors_ratio = settings.input_format_allow_errors_ratio; - format_settings.template_settings.format = settings.format_schema; - format_settings.template_settings.row_format = settings.format_schema_rows; - format_settings.template_settings.row_between_delimiter = settings.format_schema_rows_between_delimiter; + format_settings.template_settings.resultset_format = settings.format_template_resultset; + format_settings.template_settings.row_format = settings.format_template_row; + format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter; return format_settings; } @@ -67,9 +67,9 @@ static FormatSettings getOutputFormatSetting(const Settings & settings) format_settings.pretty.max_rows = settings.output_format_pretty_max_rows; format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width; format_settings.pretty.color = settings.output_format_pretty_color; - format_settings.template_settings.format = settings.format_schema; - format_settings.template_settings.row_format = settings.format_schema_rows; - format_settings.template_settings.row_between_delimiter = settings.format_schema_rows_between_delimiter; + format_settings.template_settings.resultset_format = settings.format_template_resultset; + format_settings.template_settings.row_format = settings.format_template_row; + format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter; format_settings.write_statistics = settings.output_format_write_statistics; format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size; diff --git a/dbms/src/Formats/FormatSchemaInfo.cpp b/dbms/src/Formats/FormatSchemaInfo.cpp index f01dbe457db..fab8fc7fa63 100644 --- a/dbms/src/Formats/FormatSchemaInfo.cpp +++ b/dbms/src/Formats/FormatSchemaInfo.cpp @@ -26,28 +26,33 @@ namespace } -FormatSchemaInfo::FormatSchemaInfo(const Context & context, const String & format) +FormatSchemaInfo::FormatSchemaInfo(const Context & context, const String & format_schema, const String & format, bool require_message) { - String format_schema = context.getSettingsRef().format_schema.toString(); if (format_schema.empty()) throw Exception( - "The format " + format + " requires a schema. The 'format_schema' setting should be set", ErrorCodes::BAD_ARGUMENTS); + "The format " + format + " requires a schema. The corresponding setting should be set", ErrorCodes::BAD_ARGUMENTS); String default_file_extension = getFormatSchemaDefaultFileExtension(format); - size_t colon_pos = format_schema.find(':'); Poco::Path path; - if ((colon_pos == String::npos) || (colon_pos == 0) || (colon_pos == format_schema.length() - 1) - || path.assign(format_schema.substr(0, colon_pos)).makeFile().getFileName().empty()) + if (require_message) { - throw Exception( - "Format schema requires the 'format_schema' setting to have the 'schema_file:message_name' format" - + (default_file_extension.empty() ? "" : ", e.g. 'schema." + default_file_extension + ":Message'") + ". Got '" + format_schema - + "'", - ErrorCodes::BAD_ARGUMENTS); - } + size_t colon_pos = format_schema.find(':'); + if ((colon_pos == String::npos) || (colon_pos == 0) || (colon_pos == format_schema.length() - 1) + || path.assign(format_schema.substr(0, colon_pos)).makeFile().getFileName().empty()) + { + throw Exception( + "Format schema requires the 'format_schema' setting to have the 'schema_file:message_name' format" + + (default_file_extension.empty() ? "" : ", e.g. 
'schema." + default_file_extension + ":Message'") + + ". Got '" + format_schema + + "'", + ErrorCodes::BAD_ARGUMENTS); + } - message_name = format_schema.substr(colon_pos + 1); + message_name = format_schema.substr(colon_pos + 1); + } + else + path.assign(format_schema).makeFile().getFileName(); auto default_schema_directory = [&context]() { diff --git a/dbms/src/Formats/FormatSchemaInfo.h b/dbms/src/Formats/FormatSchemaInfo.h index f7921b5f8e3..3360698c81f 100644 --- a/dbms/src/Formats/FormatSchemaInfo.h +++ b/dbms/src/Formats/FormatSchemaInfo.h @@ -10,7 +10,7 @@ class Context; class FormatSchemaInfo { public: - FormatSchemaInfo(const Context & context, const String & format); + FormatSchemaInfo(const Context & context, const String & format_schema, const String & format, bool require_message); /// Returns path to the schema file. const String & schemaPath() const { return schema_path; } diff --git a/dbms/src/Formats/FormatSettings.h b/dbms/src/Formats/FormatSettings.h index 43cf7c8f5e7..bd1a30e5e9f 100644 --- a/dbms/src/Formats/FormatSettings.h +++ b/dbms/src/Formats/FormatSettings.h @@ -52,7 +52,7 @@ struct FormatSettings struct Template { - String format; + String resultset_format; String row_format; String row_between_delimiter; }; diff --git a/dbms/src/Formats/ParsedTemplateFormatString.cpp b/dbms/src/Formats/ParsedTemplateFormatString.cpp index c8b8a655475..4e9612717f2 100644 --- a/dbms/src/Formats/ParsedTemplateFormatString.cpp +++ b/dbms/src/Formats/ParsedTemplateFormatString.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include namespace DB { @@ -11,11 +13,14 @@ namespace ErrorCodes extern const int INVALID_TEMPLATE_FORMAT; } -ParsedTemplateFormatString::ParsedTemplateFormatString(const String & format_string, const ColumnIdxGetter & idx_by_name) +ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name) { try { - parse(format_string, idx_by_name); + ReadBufferFromFile schema_file(schema.absoluteSchemaPath()); + WriteBufferFromOwnString format_string; + copyData(schema_file, format_string); + parse(format_string.str(), idx_by_name); } catch (DB::Exception & e) { diff --git a/dbms/src/Formats/ParsedTemplateFormatString.h b/dbms/src/Formats/ParsedTemplateFormatString.h index 5353f336f64..a1f6831c3fe 100644 --- a/dbms/src/Formats/ParsedTemplateFormatString.h +++ b/dbms/src/Formats/ParsedTemplateFormatString.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB { @@ -34,7 +35,7 @@ struct ParsedTemplateFormatString typedef std::function(const String &)> ColumnIdxGetter; ParsedTemplateFormatString() = default; - ParsedTemplateFormatString(const String & format_string, const ColumnIdxGetter & idx_by_name); + ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name); void parse(const String & format_string, const ColumnIdxGetter & idx_by_name); diff --git a/dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp index a45d83052c2..63614ea23eb 100644 --- a/dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp @@ -303,7 +303,8 @@ void registerInputFormatProcessorCapnProto(FormatFactory & factory) "CapnProto", [](ReadBuffer & buf, const Block & sample, const Context & context, IRowInputFormat::Params params, const FormatSettings &) { - return std::make_shared(buf, sample, std::move(params), FormatSchemaInfo(context, 
"CapnProto")); + return std::make_shared(buf, sample, std::move(params), + FormatSchemaInfo(context, context.getSettingsRef().format_schema, "CapnProto", true)); }); } diff --git a/dbms/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp index 1cd9d329c9d..f4569377ea6 100644 --- a/dbms/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB @@ -74,7 +75,8 @@ void registerInputFormatProcessorProtobuf(FormatFactory & factory) IRowInputFormat::Params params, const FormatSettings &) { - return std::make_shared(buf, sample, std::move(params), FormatSchemaInfo(context, "Protobuf")); + return std::make_shared(buf, sample, std::move(params), + FormatSchemaInfo(context, context.getSettingsRef().format_schema, "Protobuf", true)); }); } diff --git a/dbms/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp index 96b19337b3d..d1007492203 100644 --- a/dbms/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp @@ -54,7 +54,8 @@ void registerOutputFormatProcessorProtobuf(FormatFactory & factory) FormatFactory::WriteCallback callback, const FormatSettings &) { - return std::make_shared(buf, header, callback, FormatSchemaInfo(context, "Protobuf")); + return std::make_shared(buf, header, callback, + FormatSchemaInfo(context, context.getSettingsRef().format_schema, "Protobuf", true)); }); } diff --git a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index 08d97c3c0d6..12078e33172 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -12,8 +12,9 @@ namespace ErrorCodes extern const int SYNTAX_ERROR; } -TemplateBlockOutputFormat::TemplateBlockOutputFormat(const Block & header_, WriteBuffer & out_, const FormatSettings & settings_) - : IOutputFormat(header_, out_), settings(settings_) +TemplateBlockOutputFormat::TemplateBlockOutputFormat(const Block & header_, WriteBuffer & out_, const FormatSettings & settings_, + ParsedTemplateFormatString format_, ParsedTemplateFormatString row_format_) + : IOutputFormat(header_, out_), settings(settings_), format(std::move(format_)), row_format(std::move(row_format_)) { auto & sample = getPort(PortKind::Main).getHeader(); size_t columns = sample.columns(); @@ -21,14 +22,6 @@ TemplateBlockOutputFormat::TemplateBlockOutputFormat(const Block & header_, Writ for (size_t i = 0; i < columns; ++i) types[i] = sample.safeGetByPosition(i).type; - /// Parse format string for whole output - static const String default_format("${data}"); - const String & format_str = settings.template_settings.format.empty() ? 
default_format : settings.template_settings.format; - format = ParsedTemplateFormatString(format_str, [&](const String & partName) - { - return static_cast(stringToOutputPart(partName)); - }); - /// Validate format string for whole output size_t data_idx = format.format_idx_to_column_idx.size() + 1; for (size_t i = 0; i < format.format_idx_to_column_idx.size(); ++i) @@ -37,20 +30,20 @@ TemplateBlockOutputFormat::TemplateBlockOutputFormat(const Block & header_, Writ format.throwInvalidFormat("Output part name cannot be empty.", i); switch (*format.format_idx_to_column_idx[i]) { - case static_cast(OutputPart::Data): + case static_cast(ResultsetPart::Data): data_idx = i; [[fallthrough]]; - case static_cast(OutputPart::Totals): - case static_cast(OutputPart::ExtremesMin): - case static_cast(OutputPart::ExtremesMax): + case static_cast(ResultsetPart::Totals): + case static_cast(ResultsetPart::ExtremesMin): + case static_cast(ResultsetPart::ExtremesMax): if (format.formats[i] != ColumnFormat::None) format.throwInvalidFormat("Serialization type for data, totals, min and max must be empty or None", i); break; - case static_cast(OutputPart::Rows): - case static_cast(OutputPart::RowsBeforeLimit): - case static_cast(OutputPart::TimeElapsed): - case static_cast(OutputPart::RowsRead): - case static_cast(OutputPart::BytesRead): + case static_cast(ResultsetPart::Rows): + case static_cast(ResultsetPart::RowsBeforeLimit): + case static_cast(ResultsetPart::TimeElapsed): + case static_cast(ResultsetPart::RowsRead): + case static_cast(ResultsetPart::BytesRead): if (format.formats[i] == ColumnFormat::None) format.throwInvalidFormat("Serialization type for output part rows, rows_before_limit, time, " "rows_read or bytes_read is not specified", i); @@ -62,12 +55,6 @@ TemplateBlockOutputFormat::TemplateBlockOutputFormat(const Block & header_, Writ if (data_idx != 0) format.throwInvalidFormat("${data} must be the first output part", 0); - /// Parse format string for rows - row_format = ParsedTemplateFormatString(settings.template_settings.row_format, [&](const String & colName) - { - return sample.getPositionByName(colName); - }); - /// Validate format string for rows if (row_format.delimiters.size() == 1) row_format.throwInvalidFormat("No columns specified", 0); @@ -83,26 +70,26 @@ TemplateBlockOutputFormat::TemplateBlockOutputFormat(const Block & header_, Writ } } -TemplateBlockOutputFormat::OutputPart TemplateBlockOutputFormat::stringToOutputPart(const String & part) +TemplateBlockOutputFormat::ResultsetPart TemplateBlockOutputFormat::stringToResultsetPart(const String & part) { if (part == "data") - return OutputPart::Data; + return ResultsetPart::Data; else if (part == "totals") - return OutputPart::Totals; + return ResultsetPart::Totals; else if (part == "min") - return OutputPart::ExtremesMin; + return ResultsetPart::ExtremesMin; else if (part == "max") - return OutputPart::ExtremesMax; + return ResultsetPart::ExtremesMax; else if (part == "rows") - return OutputPart::Rows; + return ResultsetPart::Rows; else if (part == "rows_before_limit") - return OutputPart::RowsBeforeLimit; + return ResultsetPart::RowsBeforeLimit; else if (part == "time") - return OutputPart::TimeElapsed; + return ResultsetPart::TimeElapsed; else if (part == "rows_read") - return OutputPart::RowsRead; + return ResultsetPart::RowsRead; else if (part == "bytes_read") - return OutputPart::BytesRead; + return ResultsetPart::BytesRead; else throw Exception("Unknown output part " + part, ErrorCodes::SYNTAX_ERROR); } @@ -193,38 +180,38 @@ void 
TemplateBlockOutputFormat::finalize() { auto type = std::make_shared(); ColumnWithTypeAndName col(type->createColumnConst(1, row_count), type, String("tmp")); - switch (static_cast(*format.format_idx_to_column_idx[i])) + switch (static_cast(*format.format_idx_to_column_idx[i])) { - case OutputPart::Totals: + case ResultsetPart::Totals: if (!totals) format.throwInvalidFormat("Cannot print totals for this request", i); writeRow(totals, 0); break; - case OutputPart::ExtremesMin: + case ResultsetPart::ExtremesMin: if (!extremes) format.throwInvalidFormat("Cannot print extremes for this request", i); writeRow(extremes, 0); break; - case OutputPart::ExtremesMax: + case ResultsetPart::ExtremesMax: if (!extremes) format.throwInvalidFormat("Cannot print extremes for this request", i); writeRow(extremes, 1); break; - case OutputPart::Rows: + case ResultsetPart::Rows: writeValue(row_count, format.formats[i]); break; - case OutputPart::RowsBeforeLimit: + case ResultsetPart::RowsBeforeLimit: if (!rows_before_limit_set) format.throwInvalidFormat("Cannot print rows_before_limit for this request", i); writeValue(rows_before_limit, format.formats[i]); break; - case OutputPart::TimeElapsed: + case ResultsetPart::TimeElapsed: writeValue(watch.elapsedSeconds(), format.formats[i]); break; - case OutputPart::RowsRead: + case ResultsetPart::RowsRead: writeValue(progress.read_rows.load(), format.formats[i]); break; - case OutputPart::BytesRead: + case ResultsetPart::BytesRead: writeValue(progress.read_bytes.load(), format.formats[i]); break; default: @@ -242,11 +229,38 @@ void registerOutputFormatProcessorTemplate(FormatFactory & factory) factory.registerOutputFormatProcessor("Template", []( WriteBuffer & buf, const Block & sample, - const Context &, + const Context & context, FormatFactory::WriteCallback, const FormatSettings & settings) { - return std::make_shared(sample, buf, settings); + ParsedTemplateFormatString resultset_format; + if (settings.template_settings.resultset_format.empty()) + { + /// Default format string: "${data}" + resultset_format.delimiters.resize(2); + resultset_format.formats.emplace_back(ParsedTemplateFormatString::ColumnFormat::None); + resultset_format.format_idx_to_column_idx.emplace_back(0); + resultset_format.column_names.emplace_back("data"); + } + else + { + /// Read format string from file + resultset_format = ParsedTemplateFormatString( + FormatSchemaInfo(context, settings.template_settings.resultset_format, "Template", false), + [&](const String & partName) + { + return static_cast(TemplateBlockOutputFormat::stringToResultsetPart(partName)); + }); + } + + ParsedTemplateFormatString row_format = ParsedTemplateFormatString( + FormatSchemaInfo(context, settings.template_settings.row_format, "Template", false), + [&](const String & colName) + { + return sample.getPositionByName(colName); + }); + + return std::make_shared(sample, buf, settings, resultset_format, row_format); }); } } diff --git a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h index 9eb5f61d4e7..25a6a832bc8 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h +++ b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.h @@ -14,7 +14,8 @@ class TemplateBlockOutputFormat : public IOutputFormat { using ColumnFormat = ParsedTemplateFormatString::ColumnFormat; public: - TemplateBlockOutputFormat(const Block & header_, WriteBuffer & out_, const FormatSettings & settings_); + TemplateBlockOutputFormat(const Block & 
header_, WriteBuffer & out_, const FormatSettings & settings_, + ParsedTemplateFormatString format_, ParsedTemplateFormatString row_format_); String getName() const override { return "TemplateBlockOutputFormat"; } @@ -23,13 +24,7 @@ public: void setRowsBeforeLimit(size_t rows_before_limit_) override { rows_before_limit = rows_before_limit_; rows_before_limit_set = true; } void onProgress(const Progress & progress_) override { progress.incrementPiecewiseAtomically(progress_); } -protected: - void consume(Chunk chunk) override; - void consumeTotals(Chunk chunk) override { totals = std::move(chunk); } - void consumeExtremes(Chunk chunk) override { extremes = std::move(chunk); } - void finalize() override; - - enum class OutputPart : size_t + enum class ResultsetPart : size_t { Data, Totals, @@ -42,7 +37,14 @@ protected: BytesRead }; - OutputPart stringToOutputPart(const String & part); + static ResultsetPart stringToResultsetPart(const String & part); + +protected: + void consume(Chunk chunk) override; + void consumeTotals(Chunk chunk) override { totals = std::move(chunk); } + void consumeExtremes(Chunk chunk) override { extremes = std::move(chunk); } + void finalize() override; + void writeRow(const Chunk & chunk, size_t row_num); void serializeField(const IColumn & column, const IDataType & type, size_t row_num, ColumnFormat format); template void writeValue(U value, ColumnFormat col_format); diff --git a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index 617e9124e83..9f4b2147452 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB { @@ -18,21 +19,13 @@ extern const int SYNTAX_ERROR; TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_, - const FormatSettings & settings_, bool ignore_spaces_) + const FormatSettings & settings_, bool ignore_spaces_, + ParsedTemplateFormatString format_, ParsedTemplateFormatString row_format_) : RowInputFormatWithDiagnosticInfo(header_, buf, params_), buf(in_), data_types(header_.getDataTypes()), - settings(settings_), ignore_spaces(ignore_spaces_) + settings(settings_), ignore_spaces(ignore_spaces_), + format(std::move(format_)), row_format(std::move(row_format_)) { - /// Parse format string for whole input - static const String default_format("${data}"); - const String & format_str = settings.template_settings.format.empty() ? 
default_format : settings.template_settings.format; - format = ParsedTemplateFormatString(format_str, [&](const String & partName) -> std::optional - { - if (partName == "data") - return 0; - throw Exception("Unknown input part " + partName, ErrorCodes::SYNTAX_ERROR); - }); - - /// Validate format string for whole input + /// Validate format string for result set bool has_data = false; for (size_t i = 0; i < format.columnsCount(); ++i) { @@ -54,12 +47,6 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer } } - /// Parse format string for rows - row_format = ParsedTemplateFormatString(settings.template_settings.row_format, [&](const String & colName) -> std::optional - { - return header_.getPositionByName(colName); - }); - /// Validate format string for rows std::vector column_in_format(header_.columns(), false); for (size_t i = 0; i < row_format.columnsCount(); ++i) @@ -494,11 +481,41 @@ void registerInputFormatProcessorTemplate(FormatFactory & factory) factory.registerInputFormatProcessor(ignore_spaces ? "TemplateIgnoreSpaces" : "Template", [=]( ReadBuffer & buf, const Block & sample, - const Context &, + const Context & context, IRowInputFormat::Params params, const FormatSettings & settings) { - return std::make_shared(sample, buf, params, settings, ignore_spaces); + ParsedTemplateFormatString resultset_format; + if (settings.template_settings.resultset_format.empty()) + { + /// Default format string: "${data}" + resultset_format.delimiters.resize(2); + resultset_format.formats.emplace_back(ParsedTemplateFormatString::ColumnFormat::None); + resultset_format.format_idx_to_column_idx.emplace_back(0); + resultset_format.column_names.emplace_back("data"); + } + else + { + /// Read format string from file + resultset_format = ParsedTemplateFormatString( + FormatSchemaInfo(context, settings.template_settings.resultset_format, "Template", false), + [&](const String & partName) -> std::optional + { + if (partName == "data") + return 0; + throw Exception("Unknown input part " + partName, + ErrorCodes::SYNTAX_ERROR); + }); + } + + ParsedTemplateFormatString row_format = ParsedTemplateFormatString( + FormatSchemaInfo(context, settings.template_settings.row_format, "Template", false), + [&](const String & colName) -> std::optional + { + return sample.getPositionByName(colName); + }); + + return std::make_shared(sample, buf, params, settings, ignore_spaces, resultset_format, row_format); }); } } diff --git a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h index ff7b2adc34a..3ad80f48207 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -16,7 +16,8 @@ class TemplateRowInputFormat : public RowInputFormatWithDiagnosticInfo using ColumnFormat = ParsedTemplateFormatString::ColumnFormat; public: TemplateRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_, - const FormatSettings & settings_, bool ignore_spaces_); + const FormatSettings & settings_, bool ignore_spaces_, + ParsedTemplateFormatString format_, ParsedTemplateFormatString row_format_); String getName() const override { return "TemplateRowInputFormat"; } @@ -50,9 +51,9 @@ private: DataTypes data_types; FormatSettings settings; + const bool ignore_spaces; ParsedTemplateFormatString format; ParsedTemplateFormatString row_format; - const bool ignore_spaces; size_t format_data_idx; bool end_of_stream = false; diff --git 
a/dbms/tests/queries/0_stateless/00937_template_output_format.sh b/dbms/tests/queries/0_stateless/00937_template_output_format.sh new file mode 100755 index 00000000000..239f7d672d0 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00937_template_output_format.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +. $CURDIR/../shell_config.sh + +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS template"; +$CLICKHOUSE_CLIENT --query="CREATE TABLE template (s1 String, s2 String, \`s 3\` String, \"s 4\" String, n UInt64, d Date) ENGINE = Memory"; +$CLICKHOUSE_CLIENT --query="INSERT INTO template VALUES +('qwe,rty', 'as\"df''gh', '', 'zx\ncv\tbn m', 123, '2016-01-01'),\ +('as\"df''gh', '', 'zx\ncv\tbn m', 'qwe,rty', 456, '2016-01-02'),\ +('', 'zx\ncv\tbn m', 'qwe,rty', 'as\"df''gh', 9876543210, '2016-01-03'),\ +('zx\ncv\tbn m', 'qwe,rty', 'as\"df''gh', '', 789, '2016-01-04')"; + +echo -ne '{prefix} \n${data:None}\n------\n${totals:}\n------\n${min}\n------\n${max}\n${rows:Escaped} rows\nbefore limit ${rows_before_limit:XML}\nread ${rows_read:Escaped} $$ suffix $$' > $CURDIR/00937_template_output_format_resultset.tmp +echo -ne 'n:\t${n:JSON}, s1:\t${0:Escaped}, s2:\t${s2:Quoted}, s3:\t${`s 3`:JSON}, s4:\t${"s 4":CSV}, d:\t${d:Escaped}, n:\t${n:Raw}\t' > $CURDIR/00937_template_output_format_row.tmp + +$CLICKHOUSE_CLIENT --query="SELECT * FROM template GROUP BY s1, s2, \`s 3\`, \"s 4\", n, d WITH TOTALS ORDER BY n LIMIT 4 FORMAT Template SETTINGS extremes = 1,\ +format_template_resultset = '$CURDIR/00937_template_output_format_resultset.tmp', \ +format_template_row = '$CURDIR/00937_template_output_format_row.tmp', \ +format_template_rows_between_delimiter = ';\n'"; + +$CLICKHOUSE_CLIENT --query="DROP TABLE template"; diff --git a/dbms/tests/queries/0_stateless/00937_template_output_format.sql b/dbms/tests/queries/0_stateless/00937_template_output_format.sql deleted file mode 100644 index 8c3865ad0fa..00000000000 --- a/dbms/tests/queries/0_stateless/00937_template_output_format.sql +++ /dev/null @@ -1,12 +0,0 @@ -DROP TABLE IF EXISTS template; -CREATE TABLE template (s1 String, s2 String, `s 3` String, "s 4" String, n UInt64, d Date) ENGINE = Memory; -INSERT INTO template VALUES -('qwe,rty', 'as"df''gh', '', 'zx\ncv\tbn m', 123, '2016-01-01'),('as"df''gh', '', 'zx\ncv\tbn m', 'qwe,rty', 456, '2016-01-02'),('', 'zx\ncv\tbn m', 'qwe,rty', 'as"df''gh', 9876543210, '2016-01-03'),('zx\ncv\tbn m', 'qwe,rty', 'as"df''gh', '', 789, '2016-01-04'); - -SELECT * FROM template GROUP BY s1, s2, `s 3`, "s 4", n, d WITH TOTALS ORDER BY n LIMIT 4 FORMAT Template SETTINGS -extremes = 1, -format_schema = '{prefix} \n${data:None}\n------\n${totals:}\n------\n${min}\n------\n${max}\n${rows:Escaped} rows\nbefore limit ${rows_before_limit:XML}\nread ${rows_read:Escaped} $$ suffix $$', -format_schema_rows = 'n:\t${n:JSON}, s1:\t${0:Escaped}, s2:\t${s2:Quoted}, s3:\t${`s 3`:JSON}, s4:\t${"s 4":CSV}, d:\t${d:Escaped}, n:\t${n:Raw}\t', -format_schema_rows_between_delimiter = ';\n'; - -DROP TABLE template; diff --git a/dbms/tests/queries/0_stateless/00938_template_input_format.sh b/dbms/tests/queries/0_stateless/00938_template_input_format.sh index 63297ab3850..998fe195203 100755 --- a/dbms/tests/queries/0_stateless/00938_template_input_format.sh +++ b/dbms/tests/queries/0_stateless/00938_template_input_format.sh @@ -9,6 +9,8 @@ $CLICKHOUSE_CLIENT --query="CREATE TABLE template1 (s1 String, s2 String, s3 Str $CLICKHOUSE_CLIENT --query="CREATE TABLE template2 (s1 String, s2 String, 
s3 String, s4 String, n UInt64, d Date) ENGINE = Memory"; echo "==== check escaping ====" +echo -ne '{prefix} \n${data}\n $$ suffix $$\n' > $CURDIR/00938_template_input_format_resultset.tmp +echo -ne 'n:\t${n:Escaped}, s1:\t${0:Escaped}\t, s2:\t${1:Quoted}, s3:\t${s3:JSON}, s4:\t${3:CSV}, d:\t${d:Escaped}\t' > $CURDIR/00938_template_input_format_row.tmp echo "{prefix} n: 123, s1: qwe,rty , s2: 'as\"df\\'gh', s3: \"\", s4: \"zx @@ -17,28 +19,32 @@ n: 456, s1: as\"df\\'gh , s2: '', s3: \"zx\\ncv\\tbn m\", s4: \"qwe,rty\", d: 20 n: 9876543210, s1: , s2: 'zx\\ncv\\tbn m', s3: \"qwe,rty\", s4: \"as\"\"df'gh\", d: 2016-01-03 ; n: 789, s1: zx\\ncv\\tbn m , s2: 'qwe,rty', s3: \"as\\\"df'gh\", s4: \"\", d: 2016-01-04 $ suffix $" | $CLICKHOUSE_CLIENT --query="INSERT INTO template1 FORMAT Template SETTINGS \ -format_schema = '{prefix} \n\${data}\n \$\$ suffix \$\$\n', \ -format_schema_rows = 'n:\t\${n:Escaped}, s1:\t\${0:Escaped}\t, s2:\t\${1:Quoted}, s3:\t\${s3:JSON}, s4:\t\${3:CSV}, d:\t\${d:Escaped}\t', \ -format_schema_rows_between_delimiter = ';\n'"; +format_template_resultset = '$CURDIR/00938_template_input_format_resultset.tmp', \ +format_template_row = '$CURDIR/00938_template_input_format_row.tmp', \ +format_template_rows_between_delimiter = ';\n'"; $CLICKHOUSE_CLIENT --query="SELECT * FROM template1 ORDER BY n FORMAT CSV"; echo "==== parse json (sophisticated template) ====" +echo -ne '{${:}"meta"${:}:${:}[${:}{${:}"name"${:}:${:}"s1"${:},${:}"type"${:}:${:}"String"${:}}${:},${:}{${:}"name"${:}:${:}"s2"${:},${:}"type"${:}:${:}"String"${:}}${:},${:}{${:}"name"${:}:${:}"s3"${:},${:}"type"${:}:${:}"String"${:}}${:},${:}{${:}"name"${:}:${:}"s4"${:},${:}"type"${:}:${:}"String"${:}}${:},${:}{${:}"name"${:}:${:}"n"${:},${:}"type"${:}:${:}"UInt64"${:}}${:},${:}{${:}"name"${:}:${:}"d"${:},${:}"type"${:}:${:}"Date"${:}}${:}]${:},${:}"data"${:}:${:}[${data}]${:},${:}"rows"${:}:${:}${:CSV}${:},${:}"statistics"${:}:${:}{${:}"elapsed"${:}:${:}${:CSV}${:},${:}"rows_read"${:}:${:}${:CSV}${:},${:}"bytes_read"${:}:${:}${:CSV}${:}}${:}}' > $CURDIR/00938_template_input_format_resultset.tmp +echo -ne '{${:}"s1"${:}:${:}${s1:JSON}${:},${:}"s2"${:}:${:}${s2:JSON}${:},${:}"s3"${:}:${:}${s3:JSON}${:},${:}"s4"${:}:${:}${s4:JSON}${:},${:}"n"${:}:${:}${n:JSON}${:},${:}"d"${:}:${:}${d:JSON}${:}${:}}' > $CURDIR/00938_template_input_format_row.tmp $CLICKHOUSE_CLIENT --query="SELECT * FROM template1 ORDER BY n FORMAT JSON" | $CLICKHOUSE_CLIENT --query="INSERT INTO template2 FORMAT TemplateIgnoreSpaces SETTINGS \ -format_schema = '{\${:}\"meta\"\${:}:\${:}[\${:}{\${:}\"name\"\${:}:\${:}\"s1\"\${:},\${:}\"type\"\${:}:\${:}\"String\"\${:}}\${:},\${:}{\${:}\"name\"\${:}:\${:}\"s2\"\${:},\${:}\"type\"\${:}:\${:}\"String\"\${:}}\${:},\${:}{\${:}\"name\"\${:}:\${:}\"s3\"\${:},\${:}\"type\"\${:}:\${:}\"String\"\${:}}\${:},\${:}{\${:}\"name\"\${:}:\${:}\"s4\"\${:},\${:}\"type\"\${:}:\${:}\"String\"\${:}}\${:},\${:}{\${:}\"name\"\${:}:\${:}\"n\"\${:},\${:}\"type\"\${:}:\${:}\"UInt64\"\${:}}\${:},\${:}{\${:}\"name\"\${:}:\${:}\"d\"\${:},\${:}\"type\"\${:}:\${:}\"Date\"\${:}}\${:}]\${:},\${:}\"data\"\${:}:\${:}[\${data}]\${:},\${:}\"rows\"\${:}:\${:}\${:CSV}\${:},\${:}\"statistics\"\${:}:\${:}{\${:}\"elapsed\"\${:}:\${:}\${:CSV}\${:},\${:}\"rows_read\"\${:}:\${:}\${:CSV}\${:},\${:}\"bytes_read\"\${:}:\${:}\${:CSV}\${:}}\${:}}', \ -format_schema_rows = 
'{\${:}\"s1\"\${:}:\${:}\${s1:JSON}\${:},\${:}\"s2\"\${:}:\${:}\${s2:JSON}\${:},\${:}\"s3\"\${:}:\${:}\${s3:JSON}\${:},\${:}\"s4\"\${:}:\${:}\${s4:JSON}\${:},\${:}\"n\"\${:}:\${:}\${n:JSON}\${:},\${:}\"d\"\${:}:\${:}\${d:JSON}\${:}\${:}}', \ -format_schema_rows_between_delimiter = ','"; +format_template_resultset = '$CURDIR/00938_template_input_format_resultset.tmp', \ +format_template_row = '$CURDIR/00938_template_input_format_row.tmp', \ +format_template_rows_between_delimiter = ','"; $CLICKHOUSE_CLIENT --query="SELECT * FROM template2 ORDER BY n FORMAT CSV"; $CLICKHOUSE_CLIENT --query="TRUNCATE TABLE template2"; echo "==== parse json ====" +echo -ne '{${:}"meta"${:}:${:JSON},${:}"data"${:}:${:}[${data}]${:},${:}"rows"${:}:${:JSON},${:}"statistics"${:}:${:JSON}${:}}' > $CURDIR/00938_template_input_format_resultset.tmp +echo -ne '{${:}"s1"${:}:${:}${s3:JSON}${:},${:}"s2"${:}:${:}${:JSON}${:},${:}"s3"${:}:${:}${s1:JSON}${:},${:}"s4"${:}:${:}${:JSON}${:},${:}"n"${:}:${:}${n:JSON}${:},${:}"d"${:}:${:}${d:JSON}${:}${:}}' > $CURDIR/00938_template_input_format_row.tmp $CLICKHOUSE_CLIENT --query="SELECT * FROM template1 ORDER BY n FORMAT JSON" | $CLICKHOUSE_CLIENT --query="INSERT INTO template2 FORMAT TemplateIgnoreSpaces SETTINGS \ -format_schema = '{\${:}\"meta\"\${:}:\${:JSON},\${:}\"data\"\${:}:\${:}[\${data}]\${:},\${:}\"rows\"\${:}:\${:JSON},\${:}\"statistics\"\${:}:\${:JSON}\${:}}', \ -format_schema_rows = '{\${:}\"s1\"\${:}:\${:}\${s3:JSON}\${:},\${:}\"s2\"\${:}:\${:}\${:JSON}\${:},\${:}\"s3\"\${:}:\${:}\${s1:JSON}\${:},\${:}\"s4\"\${:}:\${:}\${:JSON}\${:},\${:}\"n\"\${:}:\${:}\${n:JSON}\${:},\${:}\"d\"\${:}:\${:}\${d:JSON}\${:}\${:}}', \ -format_schema_rows_between_delimiter = ','"; +format_template_resultset = '$CURDIR/00938_template_input_format_resultset.tmp', \ +format_template_row = '$CURDIR/00938_template_input_format_row.tmp', \ +format_template_rows_between_delimiter = ','"; $CLICKHOUSE_CLIENT --query="SELECT * FROM template2 ORDER BY n FORMAT CSV"; From def500f591f104a40e9981f7f4f6a56be1989ed2 Mon Sep 17 00:00:00 2001 From: Vasilyev Nikita Date: Tue, 24 Sep 2019 17:49:30 +0300 Subject: [PATCH 243/309] some impr --- dbms/src/Databases/DatabaseOrdinary.cpp | 34 +++---------------------- dbms/src/Databases/DatabaseOrdinary.h | 2 -- 2 files changed, 3 insertions(+), 33 deletions(-) diff --git a/dbms/src/Databases/DatabaseOrdinary.cpp b/dbms/src/Databases/DatabaseOrdinary.cpp index b988329127e..b5f0266720d 100644 --- a/dbms/src/Databases/DatabaseOrdinary.cpp +++ b/dbms/src/Databases/DatabaseOrdinary.cpp @@ -263,11 +263,8 @@ void DatabaseOrdinary::createTable( /// A race condition would be possible if a table with the same name is simultaneously created using CREATE and using ATTACH. /// But there is protection from it - see using DDLGuard in InterpreterCreateQuery. - { - std::lock_guard lock(mutex); - if (tables.find(table_name) != tables.end()) - throw Exception("Table " + name + "." + table_name + " already exists.", ErrorCodes::TABLE_ALREADY_EXISTS); - } + if (isTableExist(context, table_name)) + throw Exception("Table " + name + "." + table_name + " already exists.", ErrorCodes::TABLE_ALREADY_EXISTS); String table_metadata_path = getTableMetadataPath(table_name); String table_metadata_tmp_path = table_metadata_path + ".tmp"; @@ -288,11 +285,7 @@ void DatabaseOrdinary::createTable( try { /// Add a table to the map of known tables. - { - std::lock_guard lock(mutex); - if (!tables.emplace(table_name, table).second) - throw Exception("Table " + name + "." 
+ table_name + " already exists.", ErrorCodes::TABLE_ALREADY_EXISTS); - } + attachTable(table_name, table); /// If it was ATTACH query and file with table metadata already exist /// (so, ATTACH is done after DETACH), then rename atomically replaces old file with new one. @@ -489,27 +482,6 @@ ASTPtr DatabaseOrdinary::getCreateDatabaseQuery(const Context & /*context*/) con return ast; } - -void DatabaseOrdinary::shutdown() -{ - /// You can not hold a lock during shutdown. - /// Because inside `shutdown` function the tables can work with database, and mutex is not recursive. - - Tables tables_snapshot; - { - std::lock_guard lock(mutex); - tables_snapshot = tables; - } - - for (const auto & kv: tables_snapshot) - { - kv.second->shutdown(); - } - - std::lock_guard lock(mutex); - tables.clear(); -} - void DatabaseOrdinary::alterTable( const Context & context, const String & table_name, diff --git a/dbms/src/Databases/DatabaseOrdinary.h b/dbms/src/Databases/DatabaseOrdinary.h index e8895075768..d8f7e1b3612 100644 --- a/dbms/src/Databases/DatabaseOrdinary.h +++ b/dbms/src/Databases/DatabaseOrdinary.h @@ -68,8 +68,6 @@ public: void drop() override; - void shutdown() override; - private: const String metadata_path; const String data_path; From 0467229effb2d0ae6e8495a2ad3e1cdca80a091e Mon Sep 17 00:00:00 2001 From: chertus Date: Tue, 24 Sep 2019 21:21:57 +0300 Subject: [PATCH 244/309] fix all left partial merge join on block borders --- dbms/src/Interpreters/MergeJoin.cpp | 8 +- .../01010_pmj_one_row_blocks.reference | 188 ++++++++++++++++++ .../0_stateless/01010_pmj_one_row_blocks.sql | 106 ++++++++++ .../01010_pmj_skip_blocks.reference | 188 ++++++++++++++++++ .../0_stateless/01010_pmj_skip_blocks.sql | 107 ++++++++++ 5 files changed, 595 insertions(+), 2 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/01010_pmj_one_row_blocks.reference create mode 100644 dbms/tests/queries/0_stateless/01010_pmj_one_row_blocks.sql create mode 100644 dbms/tests/queries/0_stateless/01010_pmj_skip_blocks.reference create mode 100644 dbms/tests/queries/0_stateless/01010_pmj_skip_blocks.sql diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index bd7e7cfe078..2464500957b 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -476,10 +476,14 @@ void MergeJoin::leftJoin(MergeJoinCursor & left_cursor, const Block & left_block while (!left_cursor.atEnd() && !right_cursor.atEnd()) { - size_t left_position = left_cursor.position(); /// save inequal position + /// Not zero left_key_tail means there were equality for the last left key in previous leftJoin() call. + /// Do not join it twice: join only if it's equal with a first right key of current leftJoin() call and skip otherwise. 
+ size_t left_unequal_position = left_cursor.position() + left_key_tail; + left_key_tail = 0; + Range range = left_cursor.getNextEqualRange(right_cursor); - joinInequalsLeft(left_block, left_columns, right_columns, left_position, range.left_start, is_all); + joinInequalsLeft(left_block, left_columns, right_columns, left_unequal_position, range.left_start, is_all); if (range.empty()) break; diff --git a/dbms/tests/queries/0_stateless/01010_pmj_one_row_blocks.reference b/dbms/tests/queries/0_stateless/01010_pmj_one_row_blocks.reference new file mode 100644 index 00000000000..c5ef57bb882 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01010_pmj_one_row_blocks.reference @@ -0,0 +1,188 @@ +any left +0 0 0 +1 10 0 +2 20 2 +3 30 0 +4 40 4 +- +0 0 0 +1 10 0 +2 20 0 +3 30 0 +4 40 0 +- +0 0 0 +1 10 0 +2 20 2 +3 30 0 +4 40 4 +- +0 0 0 +1 10 0 +2 20 0 +3 30 0 +4 40 0 +all left +0 0 0 0 +1 10 0 0 +2 20 2 21 +2 20 2 22 +3 30 0 0 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +1 10 0 0 +2 20 0 0 +3 30 0 0 +4 40 0 0 +- +0 0 0 0 +1 10 0 0 +2 20 0 0 +3 30 0 0 +4 40 0 0 +- +0 0 0 0 +1 10 0 0 +2 20 2 21 +2 20 2 22 +3 30 0 0 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +1 10 0 0 +2 20 2 21 +2 20 2 22 +3 30 0 0 +4 40 4 41 +4 40 4 42 +any inner +0 0 0 +2 20 2 +4 40 4 +- +0 0 0 +- +0 0 0 +2 20 2 +4 40 4 +- +0 0 0 +all inner +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +- +0 0 0 0 +- +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 +any left +0 0 0 +1 10 \N +2 20 2 +3 30 \N +4 40 4 +- +0 0 0 +1 10 \N +2 20 \N +3 30 \N +4 40 \N +- +0 0 0 +1 10 \N +2 20 2 +3 30 \N +4 40 4 +- +0 0 0 +1 10 \N +2 20 \N +3 30 \N +4 40 \N +all left +0 0 0 0 +1 10 \N \N +2 20 2 21 +2 20 2 22 +3 30 \N \N +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +1 10 \N \N +2 20 \N \N +3 30 \N \N +4 40 \N \N +- +0 0 0 0 +1 10 \N \N +2 20 \N \N +3 30 \N \N +4 40 \N \N +- +0 0 0 0 +1 10 \N \N +2 20 2 21 +2 20 2 22 +3 30 \N \N +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +1 10 \N \N +2 20 2 21 +2 20 2 22 +3 30 \N \N +4 40 4 41 +4 40 4 42 +any inner +0 0 0 +2 20 2 +4 40 4 +- +0 0 0 +- +0 0 0 +2 20 2 +4 40 4 +- +0 0 0 +all inner +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +- +0 0 0 0 +- +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 diff --git a/dbms/tests/queries/0_stateless/01010_pmj_one_row_blocks.sql b/dbms/tests/queries/0_stateless/01010_pmj_one_row_blocks.sql new file mode 100644 index 00000000000..59aa7d7d4d7 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01010_pmj_one_row_blocks.sql @@ -0,0 +1,106 @@ +DROP TABLE IF EXISTS t0; +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; + +CREATE TABLE t0 (x UInt32, y UInt64) engine = MergeTree ORDER BY (x,y); +CREATE TABLE t1 (x UInt32, y UInt64) engine = MergeTree ORDER BY (x,y); +CREATE TABLE t2 (x UInt32, y UInt64) engine = MergeTree ORDER BY (x,y); + +SET partial_merge_join = 1; +SET partial_merge_join_rows_in_right_blocks = 1; +SET any_join_distinct_right_table_keys = 1; + +INSERT INTO t1 (x, y) VALUES (0, 0); +INSERT INTO t1 (x, y) VALUES (1, 10) (2, 20); +INSERT INTO t1 (x, y) VALUES (4, 40) (3, 30); + +INSERT INTO t2 (x, y) VALUES (4, 41) (2, 21) (2, 22); +INSERT INTO t2 (x, y) VALUES (0, 0) (5, 50) (4, 42); + +SET join_use_nulls = 0; + +SELECT 'any left'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x,y) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x) ORDER BY x; 
+SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x,y) ORDER BY x; + +SELECT 'all left'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND toUInt32(intDiv(t1.y,10)) = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND toUInt64(t1.x) = intDiv(t2.y,10) ORDER BY x, t2.y; + +SELECT 'any inner'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x,y) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x,y) ORDER BY x; + +SELECT 'all inner'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND toUInt32(intDiv(t1.y,10)) = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND toUInt64(t1.x) = intDiv(t2.y,10) ORDER BY x, t2.y; + +SET join_use_nulls = 1; + +SELECT 'any left'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x,y) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x,y) ORDER BY x; + +SELECT 'all left'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND toUInt32(intDiv(t1.y,10)) = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND toUInt64(t1.x) = intDiv(t2.y,10) ORDER BY x, t2.y; + +SELECT 'any inner'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x,y) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x,y) ORDER BY x; + +SELECT 'all inner'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND toUInt32(intDiv(t1.y,10)) = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND toUInt64(t1.x) = intDiv(t2.y,10) ORDER BY x, t2.y; + +DROP TABLE t0; +DROP TABLE t1; +DROP TABLE t2; diff --git a/dbms/tests/queries/0_stateless/01010_pmj_skip_blocks.reference b/dbms/tests/queries/0_stateless/01010_pmj_skip_blocks.reference new file mode 100644 index 00000000000..c5ef57bb882 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01010_pmj_skip_blocks.reference @@ -0,0 +1,188 @@ +any left +0 0 0 +1 10 0 
+2 20 2 +3 30 0 +4 40 4 +- +0 0 0 +1 10 0 +2 20 0 +3 30 0 +4 40 0 +- +0 0 0 +1 10 0 +2 20 2 +3 30 0 +4 40 4 +- +0 0 0 +1 10 0 +2 20 0 +3 30 0 +4 40 0 +all left +0 0 0 0 +1 10 0 0 +2 20 2 21 +2 20 2 22 +3 30 0 0 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +1 10 0 0 +2 20 0 0 +3 30 0 0 +4 40 0 0 +- +0 0 0 0 +1 10 0 0 +2 20 0 0 +3 30 0 0 +4 40 0 0 +- +0 0 0 0 +1 10 0 0 +2 20 2 21 +2 20 2 22 +3 30 0 0 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +1 10 0 0 +2 20 2 21 +2 20 2 22 +3 30 0 0 +4 40 4 41 +4 40 4 42 +any inner +0 0 0 +2 20 2 +4 40 4 +- +0 0 0 +- +0 0 0 +2 20 2 +4 40 4 +- +0 0 0 +all inner +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +- +0 0 0 0 +- +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 +any left +0 0 0 +1 10 \N +2 20 2 +3 30 \N +4 40 4 +- +0 0 0 +1 10 \N +2 20 \N +3 30 \N +4 40 \N +- +0 0 0 +1 10 \N +2 20 2 +3 30 \N +4 40 4 +- +0 0 0 +1 10 \N +2 20 \N +3 30 \N +4 40 \N +all left +0 0 0 0 +1 10 \N \N +2 20 2 21 +2 20 2 22 +3 30 \N \N +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +1 10 \N \N +2 20 \N \N +3 30 \N \N +4 40 \N \N +- +0 0 0 0 +1 10 \N \N +2 20 \N \N +3 30 \N \N +4 40 \N \N +- +0 0 0 0 +1 10 \N \N +2 20 2 21 +2 20 2 22 +3 30 \N \N +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +1 10 \N \N +2 20 2 21 +2 20 2 22 +3 30 \N \N +4 40 4 41 +4 40 4 42 +any inner +0 0 0 +2 20 2 +4 40 4 +- +0 0 0 +- +0 0 0 +2 20 2 +4 40 4 +- +0 0 0 +all inner +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +- +0 0 0 0 +- +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 +- +0 0 0 0 +2 20 2 21 +2 20 2 22 +4 40 4 41 +4 40 4 42 diff --git a/dbms/tests/queries/0_stateless/01010_pmj_skip_blocks.sql b/dbms/tests/queries/0_stateless/01010_pmj_skip_blocks.sql new file mode 100644 index 00000000000..3f2cbdd0cc4 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01010_pmj_skip_blocks.sql @@ -0,0 +1,107 @@ +DROP TABLE IF EXISTS t0; +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; + +CREATE TABLE t0 (x UInt32, y UInt64) engine = MergeTree ORDER BY (x,y); +CREATE TABLE t1 (x UInt32, y UInt64) engine = MergeTree ORDER BY (x,y); +CREATE TABLE t2 (x UInt32, y UInt64) engine = MergeTree ORDER BY (x,y); + +SET partial_merge_join = 1; +SET partial_merge_join_optimisations = 1; +SET partial_merge_join_rows_in_right_blocks = 2; +SET any_join_distinct_right_table_keys = 1; + +INSERT INTO t1 (x, y) VALUES (0, 0); +INSERT INTO t1 (x, y) VALUES (1, 10) (2, 20); +INSERT INTO t1 (x, y) VALUES (4, 40) (3, 30); + +INSERT INTO t2 (x, y) VALUES (4, 41) (2, 21) (2, 22); +INSERT INTO t2 (x, y) VALUES (0, 0) (5, 50) (4, 42); + +SET join_use_nulls = 0; + +SELECT 'any left'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x,y) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x,y) ORDER BY x; + +SELECT 'all left'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND toUInt32(intDiv(t1.y,10)) = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND toUInt64(t1.x) = intDiv(t2.y,10) ORDER BY x, t2.y; + +SELECT 'any inner'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x) ORDER BY x; 
+SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x,y) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x,y) ORDER BY x; + +SELECT 'all inner'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND toUInt32(intDiv(t1.y,10)) = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND toUInt64(t1.x) = intDiv(t2.y,10) ORDER BY x, t2.y; + +SET join_use_nulls = 1; + +SELECT 'any left'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x,y) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY LEFT JOIN t2 USING (x,y) ORDER BY x; + +SELECT 'all left'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND toUInt32(intDiv(t1.y,10)) = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.x = t2.x AND toUInt64(t1.x) = intDiv(t2.y,10) ORDER BY x, t2.y; + +SELECT 'any inner'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x,y) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x) ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.x FROM t1 ANY INNER JOIN t2 USING (x,y) ORDER BY x; + +SELECT 'all inner'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND t1.y = t2.y ORDER BY x; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND toUInt32(intDiv(t1.y,10)) = t2.x ORDER BY x, t2.y; +SELECT '-'; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.x = t2.x AND toUInt64(t1.x) = intDiv(t2.y,10) ORDER BY x, t2.y; + +DROP TABLE t0; +DROP TABLE t1; +DROP TABLE t2; From 6e32553e0184e151b23dd109a75acfa28630ca4d Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Tue, 24 Sep 2019 22:00:05 +0300 Subject: [PATCH 245/309] Add a comment about MemorySanitizer options. --- cmake/sanitize.cmake | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index 04f2b80d346..5063e12fc1f 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -14,6 +14,15 @@ if (SANITIZE) endif () elseif (SANITIZE STREQUAL "memory") + # MemorySanitizer flags are set according to the official documentation: + # https://clang.llvm.org/docs/MemorySanitizer.html#usage + # + # For now, it compiles with `cmake -DSANITIZE=memory -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_CXX_FLAGS_ADD="-O1" -DCMAKE_C_FLAGS_ADD="-O1"` + # Compiling with -DCMAKE_BUILD_TYPE=Debug leads to ld.lld failures because + # of large files (was not tested with ld.gold). 
This is why we compile with + # RelWithDebInfo, and downgrade optimizations to -O1 but not to -Og, to + # keep the binary size down. + # TODO: try compiling with -Og and with ld.gold. set (MSAN_FLAGS "-fsanitize=memory -fsanitize-memory-track-origins -fno-optimize-sibling-calls") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${MSAN_FLAGS}") From b949cc232b1d70fe0a47bc36a1f807deeee70cdb Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Tue, 24 Sep 2019 22:03:10 +0300 Subject: [PATCH 246/309] Mark lookupResultGetKey/Mapped functions ALWAYS_INLINE. They only do pointer arithmetics, so it makes sense to always inline them. --- dbms/src/Common/HashTable/ClearableHashMap.h | 4 ++-- dbms/src/Common/HashTable/ClearableHashSet.h | 4 ++-- dbms/src/Common/HashTable/FixedHashMap.h | 6 ++++-- dbms/src/Common/HashTable/HashMap.h | 12 ++++++++---- dbms/src/Common/HashTable/HashSet.h | 6 ++++-- dbms/src/Common/HashTable/HashTable.h | 10 ++++++---- 6 files changed, 26 insertions(+), 16 deletions(-) diff --git a/dbms/src/Common/HashTable/ClearableHashMap.h b/dbms/src/Common/HashTable/ClearableHashMap.h index d1703394e14..e9f010cffe5 100644 --- a/dbms/src/Common/HashTable/ClearableHashMap.h +++ b/dbms/src/Common/HashTable/ClearableHashMap.h @@ -15,10 +15,10 @@ struct ClearableHashMapCell : public ClearableHashTableCell -auto lookupResultGetKey(ClearableHashMapCell * cell) { return &cell->getFirst(); } +ALWAYS_INLINE inline auto lookupResultGetKey(ClearableHashMapCell * cell) { return &cell->getFirst(); } template -auto lookupResultGetMapped(ClearableHashMapCell * cell) { return &cell->getSecond(); } +ALWAYS_INLINE inline auto lookupResultGetMapped(ClearableHashMapCell * cell) { return &cell->getSecond(); } template < diff --git a/dbms/src/Common/HashTable/ClearableHashSet.h b/dbms/src/Common/HashTable/ClearableHashSet.h index 4f079eddc78..240c32632a9 100644 --- a/dbms/src/Common/HashTable/ClearableHashSet.h +++ b/dbms/src/Common/HashTable/ClearableHashSet.h @@ -49,10 +49,10 @@ struct ClearableHashTableCell : public BaseCell }; template -auto lookupResultGetKey(ClearableHashTableCell * cell) { return &cell->key; } +ALWAYS_INLINE inline auto lookupResultGetKey(ClearableHashTableCell * cell) { return &cell->key; } template -void * lookupResultGetMapped(ClearableHashTableCell *) { return nullptr; } +ALWAYS_INLINE inline void * lookupResultGetMapped(ClearableHashTableCell *) { return nullptr; } template < diff --git a/dbms/src/Common/HashTable/FixedHashMap.h b/dbms/src/Common/HashTable/FixedHashMap.h index e0f1a2494e0..986b4af67c0 100644 --- a/dbms/src/Common/HashTable/FixedHashMap.h +++ b/dbms/src/Common/HashTable/FixedHashMap.h @@ -48,10 +48,12 @@ struct FixedHashMapCell }; template -void * lookupResultGetKey(FixedHashMapCell *) { return nullptr; } +ALWAYS_INLINE inline void * lookupResultGetKey(FixedHashMapCell *) +{ return nullptr; } template -auto lookupResultGetMapped(FixedHashMapCell * cell) { return &cell->getSecond(); } +ALWAYS_INLINE inline auto lookupResultGetMapped(FixedHashMapCell * cell) +{ return &cell->getSecond(); } template class FixedHashMap : public FixedHashTable, Allocator> diff --git a/dbms/src/Common/HashTable/HashMap.h b/dbms/src/Common/HashTable/HashMap.h index 1d9ca29b77a..f273d5bcdc7 100644 --- a/dbms/src/Common/HashTable/HashMap.h +++ b/dbms/src/Common/HashTable/HashMap.h @@ -111,10 +111,12 @@ struct HashMapCell }; template -auto lookupResultGetKey(HashMapCell * cell) { return &cell->getFirst(); } +ALWAYS_INLINE inline auto lookupResultGetKey(HashMapCell * cell) +{ 
return &cell->getFirst(); } template -auto lookupResultGetMapped(HashMapCell * cell) { return &cell->getSecond(); } +ALWAYS_INLINE inline auto lookupResultGetMapped(HashMapCell * cell) +{ return &cell->getSecond(); } template @@ -135,10 +137,12 @@ struct HashMapCellWithSavedHash : public HashMapCell }; template -auto lookupResultGetKey(HashMapCellWithSavedHash * cell) { return &cell->getFirst(); } +ALWAYS_INLINE inline auto lookupResultGetKey(HashMapCellWithSavedHash * cell) +{ return &cell->getFirst(); } template -auto lookupResultGetMapped(HashMapCellWithSavedHash * cell) { return &cell->getSecond(); } +ALWAYS_INLINE inline auto lookupResultGetMapped(HashMapCellWithSavedHash * cell) +{ return &cell->getSecond(); } template < diff --git a/dbms/src/Common/HashTable/HashSet.h b/dbms/src/Common/HashTable/HashSet.h index 9c25f7f906b..4b3aa5204ea 100644 --- a/dbms/src/Common/HashTable/HashSet.h +++ b/dbms/src/Common/HashTable/HashSet.h @@ -85,10 +85,12 @@ struct HashSetCellWithSavedHash : public HashTableCell }; template -auto lookupResultGetKey(HashSetCellWithSavedHash * cell) { return &cell->key; } +ALWAYS_INLINE inline auto lookupResultGetKey(HashSetCellWithSavedHash * cell) +{ return &cell->key; } template -void * lookupResultGetMapped(HashSetCellWithSavedHash *) { return nullptr; } +ALWAYS_INLINE inline void * lookupResultGetMapped(HashSetCellWithSavedHash *) +{ return nullptr; } template < diff --git a/dbms/src/Common/HashTable/HashTable.h b/dbms/src/Common/HashTable/HashTable.h index 03822996361..f13d6f6e3dd 100644 --- a/dbms/src/Common/HashTable/HashTable.h +++ b/dbms/src/Common/HashTable/HashTable.h @@ -124,7 +124,7 @@ void set(T & x) { x = 0; } * The default implementation of GetMapped that is used for the above case (2). */ template -inline auto lookupResultGetMapped(PointerLike && ptr) { return &*ptr; } +ALWAYS_INLINE inline auto lookupResultGetMapped(PointerLike && ptr) { return &*ptr; } /** * Generic const wrapper for lookupResultGetMapped, that calls a non-const @@ -132,7 +132,7 @@ inline auto lookupResultGetMapped(PointerLike && ptr) { return &*ptr; } * arithmetics. */ template -auto lookupResultGetMapped(const T * obj) +ALWAYS_INLINE inline auto lookupResultGetMapped(const T * obj) { auto mapped_ptr = lookupResultGetMapped(const_cast(obj)); const auto const_mapped_ptr = mapped_ptr; @@ -208,10 +208,12 @@ struct HashTableCell }; template -auto lookupResultGetKey(HashTableCell * cell) { return &cell->key; } +ALWAYS_INLINE inline auto lookupResultGetKey(HashTableCell * cell) +{ return &cell->key; } template -void * lookupResultGetMapped(HashTableCell *) { return nullptr; } +ALWAYS_INLINE inline void * lookupResultGetMapped(HashTableCell *) +{ return nullptr; } /** * A helper function for HashTable::insert() to set the "mapped" value. 
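A minimal sketch of what the ALWAYS_INLINE change above boils down to, assuming ALWAYS_INLINE expands to the compiler's always-inline attribute (as in the project's common defines) and using a simplified ToyMapCell in place of the real HashMapCell: the lookup-result accessors are plain pointer arithmetic, so the function call would cost more than the body it hides.

#include <cstdint>
#include <iostream>
#include <utility>

#if defined(__GNUC__) || defined(__clang__)
#    define ALWAYS_INLINE __attribute__((__always_inline__))
#else
#    define ALWAYS_INLINE
#endif

/// Simplified stand-in for HashMapCell<Key, Mapped>.
template <typename Key, typename Mapped>
struct ToyMapCell
{
    std::pair<Key, Mapped> value;

    const Key & getFirst() const { return value.first; }
    Mapped & getSecond() { return value.second; }
};

/// Pure pointer arithmetic: marking such one-liners always-inline removes a
/// function call on every hash table lookup.
template <typename Key, typename Mapped>
ALWAYS_INLINE inline auto lookupResultGetKey(ToyMapCell<Key, Mapped> * cell) { return &cell->getFirst(); }

template <typename Key, typename Mapped>
ALWAYS_INLINE inline auto lookupResultGetMapped(ToyMapCell<Key, Mapped> * cell) { return &cell->getSecond(); }

int main()
{
    ToyMapCell<uint64_t, uint64_t> cell{{42, 100}};
    *lookupResultGetMapped(&cell) += 1;
    std::cout << *lookupResultGetKey(&cell) << " -> " << *lookupResultGetMapped(&cell) << "\n";
    return 0;
}

The attribute forces inlining even in unoptimized builds, which is why it is spelled out explicitly instead of relying on the optimizer.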
From d642304b1d38317b494aaa432403704293e7dad5 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 24 Sep 2019 22:56:45 +0300 Subject: [PATCH 247/309] add CustomSeparated format --- dbms/src/Core/Settings.h | 10 +++++ .../Formats/ParsedTemplateFormatString.cpp | 40 ++++++++++++++++++- dbms/src/Formats/ParsedTemplateFormatString.h | 7 +++- .../Impl/TemplateBlockOutputFormat.cpp | 16 ++++++++ .../Formats/Impl/TemplateRowInputFormat.cpp | 18 +++++++++ .../Formats/Impl/TemplateRowInputFormat.h | 2 +- .../01014_format_custom_separated.reference | 10 +++++ .../01014_format_custom_separated.sh | 33 +++++++++++++++ 8 files changed, 132 insertions(+), 4 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/01014_format_custom_separated.reference create mode 100755 dbms/tests/queries/0_stateless/01014_format_custom_separated.sh diff --git a/dbms/src/Core/Settings.h b/dbms/src/Core/Settings.h index b2646749ee5..b1422c62da9 100644 --- a/dbms/src/Core/Settings.h +++ b/dbms/src/Core/Settings.h @@ -216,10 +216,20 @@ struct Settings : public SettingsCollection M(SettingInt64, distributed_ddl_task_timeout, 180, "Timeout for DDL query responses from all hosts in cluster. If a ddl request has not been performed on all hosts, a response will contain a timeout error and a request will be executed in an async mode. Negative value means infinite.") \ M(SettingMilliseconds, stream_flush_interval_ms, 7500, "Timeout for flushing data from streaming storages.") \ M(SettingMilliseconds, stream_poll_timeout_ms, 500, "Timeout for polling data from/to streaming storages.") \ + \ M(SettingString, format_schema, "", "Schema identifier (used by schema-based formats)") \ M(SettingString, format_template_resultset, "", "Path to file which contains format string for result set (for Template format)") \ M(SettingString, format_template_row, "", "Path to file which contains format string for rows (for Template format)") \ M(SettingString, format_template_rows_between_delimiter, "\n", "Delimiter between rows (for Template format)") \ + \ + M(SettingString, format_custom_escaping_rule, "", "Field escaping rule (for CustomSeparated format)") \ + M(SettingString, format_custom_field_delimiter, "\t", "Delimiter between fields (for CustomSeparated format)") \ + M(SettingString, format_custom_row_before_delimiter, "", "Delimiter before field of the first column (for CustomSeparated format)") \ + M(SettingString, format_custom_row_after_delimiter, "", "Delimiter after field of the last column (for CustomSeparated format)") \ + M(SettingString, format_custom_row_between_delimiter, "\n", "Delimiter between rows (for CustomSeparated format)") \ + M(SettingString, format_custom_result_before_delimiter, "", "Prefix before result set (for CustomSeparated format)") \ + M(SettingString, format_custom_result_after_delimiter, "", "Suffix after result set (for CustomSeparated format)") \ + \ M(SettingBool, insert_allow_materialized_columns, 0, "If setting is enabled, Allow materialized columns in INSERT.") \ M(SettingSeconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.") \ M(SettingSeconds, http_send_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP send timeout") \ diff --git a/dbms/src/Formats/ParsedTemplateFormatString.cpp b/dbms/src/Formats/ParsedTemplateFormatString.cpp index 4e9612717f2..e02b972345e 100644 --- a/dbms/src/Formats/ParsedTemplateFormatString.cpp +++ b/dbms/src/Formats/ParsedTemplateFormatString.cpp @@ -4,6 +4,8 @@ #include #include #include +#include +#include 
namespace DB { @@ -119,7 +121,7 @@ void ParsedTemplateFormatString::parse(const String & format_string, const Colum } -ParsedTemplateFormatString::ColumnFormat ParsedTemplateFormatString::stringToFormat(const String & col_format) const +ParsedTemplateFormatString::ColumnFormat ParsedTemplateFormatString::stringToFormat(const String & col_format) { if (col_format.empty()) return ColumnFormat::None; @@ -138,7 +140,7 @@ ParsedTemplateFormatString::ColumnFormat ParsedTemplateFormatString::stringToFor else if (col_format == "Raw") return ColumnFormat::Raw; else - throwInvalidFormat("Unknown field format " + col_format, columnsCount()); + throw Exception("Unknown field format \"" + col_format + "\"", ErrorCodes::BAD_ARGUMENTS); } size_t ParsedTemplateFormatString::columnsCount() const @@ -233,4 +235,38 @@ void ParsedTemplateFormatString::throwInvalidFormat(const String & message, size ErrorCodes::INVALID_TEMPLATE_FORMAT); } +ParsedTemplateFormatString ParsedTemplateFormatString::setupCustomSeparatedResultsetFormat(const Context & context) +{ + const Settings & settings = context.getSettingsRef(); + + /// Set resultset format to "result_before_delimiter ${data} result_after_delimiter" + ParsedTemplateFormatString resultset_format; + resultset_format.delimiters.emplace_back(settings.format_custom_result_before_delimiter); + resultset_format.delimiters.emplace_back(settings.format_custom_result_after_delimiter); + resultset_format.formats.emplace_back(ParsedTemplateFormatString::ColumnFormat::None); + resultset_format.format_idx_to_column_idx.emplace_back(0); + resultset_format.column_names.emplace_back("data"); + return resultset_format; +} + +ParsedTemplateFormatString ParsedTemplateFormatString::setupCustomSeparatedRowFormat(const Context & context, const Block & sample) +{ + const Settings & settings = context.getSettingsRef(); + + /// Set row format to + /// "row_before_delimiter ${Col0:escaping} field_delimiter ${Col1:escaping} field_delimiter ... ${ColN:escaping} row_after_delimiter" + ParsedTemplateFormatString::ColumnFormat escaping = ParsedTemplateFormatString::stringToFormat(settings.format_custom_escaping_rule); + ParsedTemplateFormatString row_format; + row_format.delimiters.emplace_back(settings.format_custom_row_before_delimiter); + for (size_t i = 0; i < sample.columns(); ++i) + { + row_format.formats.emplace_back(escaping); + row_format.format_idx_to_column_idx.emplace_back(i); + row_format.column_names.emplace_back(sample.getByPosition(i).name); + bool last_column = i == sample.columns() - 1; + row_format.delimiters.emplace_back(last_column ? 
settings.format_custom_row_after_delimiter : settings.format_custom_field_delimiter); + } + return row_format; +} + } diff --git a/dbms/src/Formats/ParsedTemplateFormatString.h b/dbms/src/Formats/ParsedTemplateFormatString.h index a1f6831c3fe..cb751d1412a 100644 --- a/dbms/src/Formats/ParsedTemplateFormatString.h +++ b/dbms/src/Formats/ParsedTemplateFormatString.h @@ -8,6 +8,8 @@ namespace DB { +class Block; + struct ParsedTemplateFormatString { enum class ColumnFormat @@ -39,13 +41,16 @@ struct ParsedTemplateFormatString void parse(const String & format_string, const ColumnIdxGetter & idx_by_name); - ColumnFormat stringToFormat(const String & format) const; + static ColumnFormat stringToFormat(const String & format); static String formatToString(ColumnFormat format); static const char * readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s); size_t columnsCount() const; String dump() const; [[noreturn]] void throwInvalidFormat(const String & message, size_t column) const; + + static ParsedTemplateFormatString setupCustomSeparatedResultsetFormat(const Context & context); + static ParsedTemplateFormatString setupCustomSeparatedRowFormat(const Context & context, const Block & sample); }; } diff --git a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index 12078e33172..aa7bfbeaff1 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace DB @@ -262,5 +263,20 @@ void registerOutputFormatProcessorTemplate(FormatFactory & factory) return std::make_shared(sample, buf, settings, resultset_format, row_format); }); + + factory.registerOutputFormatProcessor("CustomSeparated", []( + WriteBuffer & buf, + const Block & sample, + const Context & context, + FormatFactory::WriteCallback, + const FormatSettings & settings) + { + ParsedTemplateFormatString resultset_format = ParsedTemplateFormatString::setupCustomSeparatedResultsetFormat(context); + ParsedTemplateFormatString row_format = ParsedTemplateFormatString::setupCustomSeparatedRowFormat(context, sample); + FormatSettings format_settings = settings; + format_settings.template_settings.row_between_delimiter = context.getSettingsRef().format_custom_row_between_delimiter; + + return std::make_shared(sample, buf, format_settings, resultset_format, row_format); + }); } } diff --git a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index 9f4b2147452..f0b2238ec7f 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -518,6 +518,24 @@ void registerInputFormatProcessorTemplate(FormatFactory & factory) return std::make_shared(sample, buf, params, settings, ignore_spaces, resultset_format, row_format); }); } + + for (bool ignore_spaces : {false, true}) + { + factory.registerInputFormatProcessor(ignore_spaces ? 
"CustomSeparatedIgnoreSpaces" : "CustomSeparated", [=]( + ReadBuffer & buf, + const Block & sample, + const Context & context, + IRowInputFormat::Params params, + const FormatSettings & settings) + { + ParsedTemplateFormatString resultset_format = ParsedTemplateFormatString::setupCustomSeparatedResultsetFormat(context); + ParsedTemplateFormatString row_format = ParsedTemplateFormatString::setupCustomSeparatedRowFormat(context, sample); + FormatSettings format_settings = settings; + format_settings.template_settings.row_between_delimiter = context.getSettingsRef().format_custom_row_between_delimiter; + + return std::make_shared(sample, buf, params, format_settings, ignore_spaces, resultset_format, row_format); + }); + } } } diff --git a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h index 3ad80f48207..2142d492988 100644 --- a/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h +++ b/dbms/src/Processors/Formats/Impl/TemplateRowInputFormat.h @@ -50,7 +50,7 @@ private: PeekableReadBuffer buf; DataTypes data_types; - FormatSettings settings; + const FormatSettings settings; const bool ignore_spaces; ParsedTemplateFormatString format; ParsedTemplateFormatString row_format; diff --git a/dbms/tests/queries/0_stateless/01014_format_custom_separated.reference b/dbms/tests/queries/0_stateless/01014_format_custom_separated.reference new file mode 100644 index 00000000000..d46a6fdf5b1 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01014_format_custom_separated.reference @@ -0,0 +1,10 @@ +========== result ========== +||0 | "2019-09-24" | "hello" || +||1 | "2019-09-25" | "world" || +||2 | "2019-09-26" | "custom" || +||3 | "2019-09-27" | "separated" || +============================ +0,"2019-09-24","hello" +1,"2019-09-25","world" +2,"2019-09-26","custom" +3,"2019-09-27","separated" diff --git a/dbms/tests/queries/0_stateless/01014_format_custom_separated.sh b/dbms/tests/queries/0_stateless/01014_format_custom_separated.sh new file mode 100755 index 00000000000..672e3686bc2 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01014_format_custom_separated.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +. 
$CURDIR/../shell_config.sh + +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS custom_separated" +$CLICKHOUSE_CLIENT --query="CREATE TABLE custom_separated (n UInt64, d Date, s String) ENGINE = Memory()" +$CLICKHOUSE_CLIENT --query="INSERT INTO custom_separated VALUES (0, '2019-09-24', 'hello'), (1, '2019-09-25', 'world'), (2, '2019-09-26', 'custom'), (3, '2019-09-27', 'separated')" + +$CLICKHOUSE_CLIENT --query="SELECT * FROM custom_separated ORDER BY n FORMAT CustomSeparated SETTINGS \ +format_custom_escaping_rule = 'CSV', \ +format_custom_field_delimiter = '\t|\t', \ +format_custom_row_before_delimiter = '||', \ +format_custom_row_after_delimiter = '\t||', \ +format_custom_result_before_delimiter = '========== result ==========\n', \ +format_custom_result_after_delimiter = '\n============================\n'" + +$CLICKHOUSE_CLIENT --query="TRUNCATE TABLE custom_separated" + +echo '0, "2019-09-24", "hello" +1, 2019-09-25, "world" +2, "2019-09-26", custom +3, 2019-09-27, separated +end' | $CLICKHOUSE_CLIENT --query="INSERT INTO custom_separated FORMAT CustomSeparated SETTINGS \ +format_custom_escaping_rule = 'CSV', \ +format_custom_field_delimiter = ', ', \ +format_custom_row_after_delimiter = '\n', \ +format_custom_row_between_delimiter = '', \ +format_custom_result_after_delimiter = 'end\n'" + +$CLICKHOUSE_CLIENT --query="SELECT * FROM custom_separated ORDER BY n FORMAT CSV" + +$CLICKHOUSE_CLIENT --query="DROP TABLE custom_separated" From dfb762d93ca3cfbae00e5eee36ed899d1ad9e852 Mon Sep 17 00:00:00 2001 From: Yuriy Date: Wed, 25 Sep 2019 01:28:59 +0300 Subject: [PATCH 248/309] Revert "Revert "Updated MariaDB"" This reverts commit f8b7cc86931ab33362dcbe452db79bff408adf47. --- contrib/CMakeLists.txt | 16 +- contrib/mariadb-connector-c | 2 +- .../mariadb-connector-c-cmake/CMakeLists.txt | 74 --- .../common/include/mysql/mysql.h | 1 - .../common/include/mysql/mysqld_error.h | 1 - .../linux_x86_64/include/config.h | 269 ---------- .../linux_x86_64/include/ma_config.h | 269 ---------- .../linux_x86_64/include/mariadb_version.h | 36 -- .../libmariadb/ma_client_plugin.c | 502 ------------------ libs/libmysqlxx/CMakeLists.txt | 3 +- libs/libmysqlxx/cmake/find_mysqlclient.cmake | 4 +- libs/libmysqlxx/src/Connection.cpp | 4 +- libs/libmysqlxx/src/Exception.cpp | 4 +- libs/libmysqlxx/src/Pool.cpp | 6 +- libs/libmysqlxx/src/Query.cpp | 4 +- libs/libmysqlxx/src/ResultBase.cpp | 4 +- libs/libmysqlxx/src/Row.cpp | 4 +- libs/libmysqlxx/src/StoreQueryResult.cpp | 4 +- libs/libmysqlxx/src/UseQueryResult.cpp | 4 +- 19 files changed, 32 insertions(+), 1179 deletions(-) delete mode 100644 contrib/mariadb-connector-c-cmake/CMakeLists.txt delete mode 100644 contrib/mariadb-connector-c-cmake/common/include/mysql/mysql.h delete mode 100644 contrib/mariadb-connector-c-cmake/common/include/mysql/mysqld_error.h delete mode 100644 contrib/mariadb-connector-c-cmake/linux_x86_64/include/config.h delete mode 100644 contrib/mariadb-connector-c-cmake/linux_x86_64/include/ma_config.h delete mode 100644 contrib/mariadb-connector-c-cmake/linux_x86_64/include/mariadb_version.h delete mode 100644 contrib/mariadb-connector-c-cmake/linux_x86_64/libmariadb/ma_client_plugin.c diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 0833614594d..5e6f90b6a59 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -65,7 +65,7 @@ if (USE_INTERNAL_ZLIB_LIBRARY) endif () add_subdirectory (${INTERNAL_ZLIB_NAME}) - # todo: make pull to Dead2/zlib-ng and remove: + # TODO: make pull to Dead2/zlib-ng and remove: # We 
should use same defines when including zlib.h as used when zlib compiled target_compile_definitions (zlib PUBLIC ZLIB_COMPAT WITH_GZFILEOP) target_compile_definitions (zlibstatic PUBLIC ZLIB_COMPAT WITH_GZFILEOP) @@ -125,11 +125,15 @@ if (USE_INTERNAL_SSL_LIBRARY) endif () if (ENABLE_MYSQL AND USE_INTERNAL_MYSQL_LIBRARY) - add_subdirectory (mariadb-connector-c-cmake) - target_include_directories(mysqlclient BEFORE PRIVATE ${ZLIB_INCLUDE_DIR}) - if(OPENSSL_INCLUDE_DIR) - target_include_directories(mysqlclient BEFORE PRIVATE ${OPENSSL_INCLUDE_DIR}) - endif() + set(CLIENT_PLUGIN_CACHING_SHA2_PASSWORD STATIC) + set(CLIENT_PLUGIN_SHA256_PASSWORD STATIC) + set(CLIENT_PLUGIN_REMOTE_IO OFF) + set(CLIENT_PLUGIN_DIALOG OFF) + set(CLIENT_PLUGIN_CLIENT_ED25519 OFF) + set(CLIENT_PLUGIN_MYSQL_CLEAR_PASSWORD OFF) + set(SKIP_TESTS 1) + set(LIBM glibc-compatibility) + add_subdirectory (mariadb-connector-c) endif () if (USE_INTERNAL_RDKAFKA_LIBRARY) diff --git a/contrib/mariadb-connector-c b/contrib/mariadb-connector-c index c6503d3acc8..18016300b00 160000 --- a/contrib/mariadb-connector-c +++ b/contrib/mariadb-connector-c @@ -1 +1 @@ -Subproject commit c6503d3acc85ca1a7f5e7e38b605d7c9410aac1e +Subproject commit 18016300b00825a3fcbc6fb2aa37ac3e51416f71 diff --git a/contrib/mariadb-connector-c-cmake/CMakeLists.txt b/contrib/mariadb-connector-c-cmake/CMakeLists.txt deleted file mode 100644 index 2e80b0c325f..00000000000 --- a/contrib/mariadb-connector-c-cmake/CMakeLists.txt +++ /dev/null @@ -1,74 +0,0 @@ -set(MARIADB_CLIENT_SOURCE_DIR ${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c) -set(MARIADB_CLIENT_BINARY_DIR ${ClickHouse_BINARY_DIR}/contrib/mariadb-connector-c) - -set(SRCS -#${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/bmove_upp.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/get_password.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_alloc.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_array.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_charset.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_compress.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_context.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_default.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_dtoa.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_errmsg.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_hash.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_init.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_io.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_list.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_ll2str.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_loaddata.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_net.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_password.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_pvio.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/mariadb_async.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/mariadb_charset.c -#${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/mariadb_dyncol.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/mariadb_lib.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/mariadb_stmt.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_sha1.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_stmt_codec.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_string.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_time.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/ma_tls.c -${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/secure/openssl_crypt.c -#${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/secure/gnutls.c -#${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/secure/ma_schannel.c -#${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/secure/schannel.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/auth_gssapi_client.c 
-#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/dialog.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/gssapi_client.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/gssapi_errmsg.c -${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/mariadb_cleartext.c -${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/my_auth.c -${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/old_password.c -${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/sha256_pw.c -${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/caching_sha2_pw.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/sspi_client.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/auth/sspi_errmsg.c -${MARIADB_CLIENT_SOURCE_DIR}/plugins/connection/aurora.c -${MARIADB_CLIENT_SOURCE_DIR}/plugins/connection/replication.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/io/remote_io.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/pvio/pvio_npipe.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/pvio/pvio_shmem.c -${MARIADB_CLIENT_SOURCE_DIR}/plugins/pvio/pvio_socket.c -#${MARIADB_CLIENT_SOURCE_DIR}/plugins/trace/trace_example.c -${CMAKE_CURRENT_SOURCE_DIR}/linux_x86_64/libmariadb/ma_client_plugin.c -) - -if(OPENSSL_LIBRARIES) - list(APPEND SRCS ${MARIADB_CLIENT_SOURCE_DIR}/libmariadb/secure/openssl.c) -endif() - -add_library(mysqlclient ${SRCS}) - -if(OPENSSL_LIBRARIES) - target_link_libraries(mysqlclient PRIVATE ${OPENSSL_LIBRARIES}) - target_compile_definitions(mysqlclient PRIVATE -D HAVE_OPENSSL -D HAVE_TLS) -endif() - -target_include_directories(mysqlclient PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/linux_x86_64/include) -target_include_directories(mysqlclient PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/common/include) -target_include_directories(mysqlclient PUBLIC ${MARIADB_CLIENT_SOURCE_DIR}/include) - -target_compile_definitions(mysqlclient PRIVATE -D THREAD) diff --git a/contrib/mariadb-connector-c-cmake/common/include/mysql/mysql.h b/contrib/mariadb-connector-c-cmake/common/include/mysql/mysql.h deleted file mode 100644 index 741c7ba03c9..00000000000 --- a/contrib/mariadb-connector-c-cmake/common/include/mysql/mysql.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/contrib/mariadb-connector-c-cmake/common/include/mysql/mysqld_error.h b/contrib/mariadb-connector-c-cmake/common/include/mysql/mysqld_error.h deleted file mode 100644 index 95d26eef163..00000000000 --- a/contrib/mariadb-connector-c-cmake/common/include/mysql/mysqld_error.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/contrib/mariadb-connector-c-cmake/linux_x86_64/include/config.h b/contrib/mariadb-connector-c-cmake/linux_x86_64/include/config.h deleted file mode 100644 index 90c42c97df6..00000000000 --- a/contrib/mariadb-connector-c-cmake/linux_x86_64/include/config.h +++ /dev/null @@ -1,269 +0,0 @@ - -/* - * Include file constants (processed in LibmysqlIncludeFiles.txt 1 - */ -#define HAVE_ALLOCA_H 1 -/* #undef HAVE_BIGENDIAN */ -#define HAVE_SETLOCALE 1 -#define HAVE_NL_LANGINFO 1 -#define HAVE_ARPA_INET_H 1 -#define HAVE_CRYPT_H 1 -#define HAVE_DIRENT_H 1 -#define HAVE_DLFCN_H 1 -#define HAVE_EXECINFO_H 1 -#define HAVE_FCNTL_H 1 -#define HAVE_FENV_H 1 -#define HAVE_FLOAT_H 1 -/* #undef HAVE_FPU_CONTROL_H */ -#define HAVE_GRP_H 1 -/* #undef HAVE_IEEEFP_H */ -#define HAVE_LIMITS_H 1 -#define HAVE_MALLOC_H 1 -#define HAVE_MEMORY_H 1 -#define HAVE_NETINET_IN_H 1 -#define HAVE_PATHS_H 1 -#define HAVE_PWD_H 1 -#define HAVE_SCHED_H 1 -/* #undef HAVE_SELECT_H */ -#define HAVE_STDDEF_H 1 -#define HAVE_STDINT_H 1 -#define HAVE_STDLIB_H 1 -#define HAVE_STRING_H 1 -#define HAVE_STRINGS_H 1 -/* #undef HAVE_SYNCH_H */ -/* #undef HAVE_SYS_FPU_H */ -#define HAVE_SYS_IOCTL_H 1 -#define 
HAVE_SYS_IPC_H 1 -#define HAVE_SYS_MMAN_H 1 -#define HAVE_SYS_PRCTL_H 1 -#define HAVE_SYS_SELECT_H 1 -#define HAVE_SYS_SHM_H 1 -#define HAVE_SYS_SOCKET_H 1 -#define HAVE_SYS_STAT_H 1 -/* #undef HAVE_SYS_STREAM_H */ -#define HAVE_SYS_TIMEB_H 1 -#define HAVE_SYS_TYPES_H 1 -#define HAVE_SYS_UN_H 1 -/* #undef HAVE_SYSENT_H */ -#define HAVE_TERMIO_H 1 -#define HAVE_TERMIOS_H 1 -#define HAVE_UNISTD_H 1 -#define HAVE_UTIME_H 1 -#define HAVE_UCONTEXT_H 1 - -/* - * function definitions - processed in LibmysqlFunctions.txt - */ -#define HAVE_ACCESS 1 -/* #undef HAVE_AIOWAIT */ -#define HAVE_ALARM 1 -/* #undef HAVE_ALLOCA */ -#define HAVE_BCMP 1 -/* #undef HAVE_BFILL */ -/* #undef HAVE_BMOVE */ -#define HAVE_BZERO 1 -#define HAVE_CLOCK_GETTIME 1 -/* #undef HAVE_COMPRESS */ -/* #undef HAVE_CRYPT */ -#define HAVE_DLERROR 1 -#define HAVE_DLOPEN 1 -#define HAVE_FCHMOD 1 -#define HAVE_FCNTL 1 -/* #undef HAVE_FCONVERT */ -#define HAVE_FDATASYNC 1 -#define HAVE_FESETROUND 1 -#define HAVE_FINITE 1 -#define HAVE_FSEEKO 1 -#define HAVE_FSYNC 1 -#define HAVE_GETADDRINFO 1 -#define HAVE_GETCWD 1 -#define HAVE_GETHOSTBYADDR_R 1 -#define HAVE_GETHOSTBYNAME_R 1 -/* #undef HAVE_GETHRTIME */ -#define HAVE_GETNAMEINFO 1 -#define HAVE_GETPAGESIZE 1 -#define HAVE_GETPASS 1 -/* #undef HAVE_GETPASSPHRASE */ -#define HAVE_GETPWNAM 1 -#define HAVE_GETPWUID 1 -#define HAVE_GETRLIMIT 1 -#define HAVE_GETRUSAGE 1 -#define HAVE_GETWD 1 -#define HAVE_GMTIME_R 1 -#define HAVE_INITGROUPS 1 -#define HAVE_LDIV 1 -#define HAVE_LOCALTIME_R 1 -#define HAVE_LOG2 1 -#define HAVE_LONGJMP 1 -#define HAVE_LSTAT 1 -#define HAVE_MADVISE 1 -#define HAVE_MALLINFO 1 -#define HAVE_MEMALIGN 1 -#define HAVE_MEMCPY 1 -#define HAVE_MEMMOVE 1 -#define HAVE_MKSTEMP 1 -#define HAVE_MLOCK 1 -#define HAVE_MLOCKALL 1 -#define HAVE_MMAP 1 -#define HAVE_MMAP64 1 -#define HAVE_PERROR 1 -#define HAVE_POLL 1 -#define HAVE_PREAD 1 -/* #undef HAVE_PTHREAD_ATTR_CREATE */ -#define HAVE_PTHREAD_ATTR_GETSTACKSIZE 1 -/* #undef HAVE_PTHREAD_ATTR_SETPRIO */ -#define HAVE_PTHREAD_ATTR_SETSCHEDPARAM 1 -#define HAVE_PTHREAD_ATTR_SETSCOPE 1 -#define HAVE_PTHREAD_ATTR_SETSTACKSIZE 1 -/* #undef HAVE_PTHREAD_CONDATTR_CREATE */ -/* #undef HAVE_PTHREAD_INIT */ -#define HAVE_PTHREAD_KEY_DELETE 1 -#define HAVE_PTHREAD_KILL 1 -#define HAVE_PTHREAD_RWLOCK_RDLOCK 1 -/* #undef HAVE_PTHREAD_SETPRIO_NP */ -#define HAVE_PTHREAD_SETSCHEDPARAM 1 -#define HAVE_PTHREAD_SIGMASK 1 -/* #undef HAVE_PTHREAD_THREADMASK */ -/* #undef HAVE_PTHREAD_YIELD_NP */ -#define HAVE_READDIR_R 1 -#define HAVE_READLINK 1 -#define HAVE_REALPATH 1 -#define HAVE_RENAME 1 -#define HAVE_SCHED_YIELD 1 -#define HAVE_SELECT 1 -/* #undef HAVE_SETFD */ -/* #undef HAVE_SETFILEPOINTER */ -#define HAVE_SIGNAL 1 -#define HAVE_SIGACTION 1 -/* #undef HAVE_SIGTHREADMASK */ -#define HAVE_SIGWAIT 1 -#define HAVE_SLEEP 1 -#define HAVE_SNPRINTF 1 -/* #undef HAVE_SQLITE */ -#define HAVE_STPCPY 1 -#define HAVE_STRERROR 1 -/* #undef HAVE_STRLCPY */ -#define HAVE_STRNLEN 1 -#define HAVE_STRPBRK 1 -#define HAVE_STRSEP 1 -#define HAVE_STRSTR 1 -#define HAVE_STRTOK_R 1 -#define HAVE_STRTOL 1 -#define HAVE_STRTOLL 1 -#define HAVE_STRTOUL 1 -#define HAVE_STRTOULL 1 -/* #undef HAVE_TELL */ -/* #undef HAVE_THR_SETCONCURRENCY */ -/* #undef HAVE_THR_YIELD */ -#define HAVE_VASPRINTF 1 -#define HAVE_VSNPRINTF 1 - -/* - * types and sizes - */ -/* Types we may use */ -#define SIZEOF_CHAR 1 -#if defined(SIZEOF_CHAR) -# define HAVE_CHAR 1 -#endif - -#define SIZEOF_CHARP 8 -#if defined(SIZEOF_CHARP) -# define HAVE_CHARP 1 -#endif - -#define 
SIZEOF_SHORT 2 -#if defined(SIZEOF_SHORT) -# define HAVE_SHORT 1 -#endif - -#define SIZEOF_INT 4 -#if defined(SIZEOF_INT) -# define HAVE_INT 1 -#endif - -#define SIZEOF_LONG 8 -#if defined(SIZEOF_LONG) -# define HAVE_LONG 1 -#endif - -#define SIZEOF_LONG_LONG 8 -#if defined(SIZEOF_LONG_LONG) -# define HAVE_LONG_LONG 1 -#endif - - -#define SIZEOF_SIGSET_T 128 -#if defined(SIZEOF_SIGSET_T) -# define HAVE_SIGSET_T 1 -#endif - -#define SIZEOF_SIZE_T 8 -#if defined(SIZEOF_SIZE_T) -# define HAVE_SIZE_T 1 -#endif - -/* #undef SIZEOF_UCHAR */ -#if defined(SIZEOF_UCHAR) -# define HAVE_UCHAR 1 -#endif - -#define SIZEOF_UINT 4 -#if defined(SIZEOF_UINT) -# define HAVE_UINT 1 -#endif - -#define SIZEOF_ULONG 8 -#if defined(SIZEOF_ULONG) -# define HAVE_ULONG 1 -#endif - -/* #undef SIZEOF_INT8 */ -#if defined(SIZEOF_INT8) -# define HAVE_INT8 1 -#endif -/* #undef SIZEOF_UINT8 */ -#if defined(SIZEOF_UINT8) -# define HAVE_UINT8 1 -#endif - -/* #undef SIZEOF_INT16 */ -#if defined(SIZEOF_INT16) -# define HAVE_INT16 1 -#endif -/* #undef SIZEOF_UINT16 */ -#if defined(SIZEOF_UINT16) -# define HAVE_UINT16 1 -#endif - -/* #undef SIZEOF_INT32 */ -#if defined(SIZEOF_INT32) -# define HAVE_INT32 1 -#endif -/* #undef SIZEOF_UINT32 */ -#if defined(SIZEOF_UINT32) -# define HAVE_UINT32 1 -#endif -/* #undef SIZEOF_U_INT32_T */ -#if defined(SIZEOF_U_INT32_T) -# define HAVE_U_INT32_T 1 -#endif - -/* #undef SIZEOF_INT64 */ -#if defined(SIZEOF_INT64) -# define HAVE_INT64 1 -#endif -/* #undef SIZEOF_UINT64 */ -#if defined(SIZEOF_UINT64) -# define HAVE_UINT64 1 -#endif - -/* #undef SIZEOF_SOCKLEN_T */ -#if defined(SIZEOF_SOCKLEN_T) -# define HAVE_SOCKLEN_T 1 -#endif - -#define SOCKET_SIZE_TYPE socklen_t - -#define MARIADB_DEFAULT_CHARSET "latin1" - diff --git a/contrib/mariadb-connector-c-cmake/linux_x86_64/include/ma_config.h b/contrib/mariadb-connector-c-cmake/linux_x86_64/include/ma_config.h deleted file mode 100644 index 90c42c97df6..00000000000 --- a/contrib/mariadb-connector-c-cmake/linux_x86_64/include/ma_config.h +++ /dev/null @@ -1,269 +0,0 @@ - -/* - * Include file constants (processed in LibmysqlIncludeFiles.txt 1 - */ -#define HAVE_ALLOCA_H 1 -/* #undef HAVE_BIGENDIAN */ -#define HAVE_SETLOCALE 1 -#define HAVE_NL_LANGINFO 1 -#define HAVE_ARPA_INET_H 1 -#define HAVE_CRYPT_H 1 -#define HAVE_DIRENT_H 1 -#define HAVE_DLFCN_H 1 -#define HAVE_EXECINFO_H 1 -#define HAVE_FCNTL_H 1 -#define HAVE_FENV_H 1 -#define HAVE_FLOAT_H 1 -/* #undef HAVE_FPU_CONTROL_H */ -#define HAVE_GRP_H 1 -/* #undef HAVE_IEEEFP_H */ -#define HAVE_LIMITS_H 1 -#define HAVE_MALLOC_H 1 -#define HAVE_MEMORY_H 1 -#define HAVE_NETINET_IN_H 1 -#define HAVE_PATHS_H 1 -#define HAVE_PWD_H 1 -#define HAVE_SCHED_H 1 -/* #undef HAVE_SELECT_H */ -#define HAVE_STDDEF_H 1 -#define HAVE_STDINT_H 1 -#define HAVE_STDLIB_H 1 -#define HAVE_STRING_H 1 -#define HAVE_STRINGS_H 1 -/* #undef HAVE_SYNCH_H */ -/* #undef HAVE_SYS_FPU_H */ -#define HAVE_SYS_IOCTL_H 1 -#define HAVE_SYS_IPC_H 1 -#define HAVE_SYS_MMAN_H 1 -#define HAVE_SYS_PRCTL_H 1 -#define HAVE_SYS_SELECT_H 1 -#define HAVE_SYS_SHM_H 1 -#define HAVE_SYS_SOCKET_H 1 -#define HAVE_SYS_STAT_H 1 -/* #undef HAVE_SYS_STREAM_H */ -#define HAVE_SYS_TIMEB_H 1 -#define HAVE_SYS_TYPES_H 1 -#define HAVE_SYS_UN_H 1 -/* #undef HAVE_SYSENT_H */ -#define HAVE_TERMIO_H 1 -#define HAVE_TERMIOS_H 1 -#define HAVE_UNISTD_H 1 -#define HAVE_UTIME_H 1 -#define HAVE_UCONTEXT_H 1 - -/* - * function definitions - processed in LibmysqlFunctions.txt - */ -#define HAVE_ACCESS 1 -/* #undef HAVE_AIOWAIT */ -#define HAVE_ALARM 1 -/* #undef 
HAVE_ALLOCA */ -#define HAVE_BCMP 1 -/* #undef HAVE_BFILL */ -/* #undef HAVE_BMOVE */ -#define HAVE_BZERO 1 -#define HAVE_CLOCK_GETTIME 1 -/* #undef HAVE_COMPRESS */ -/* #undef HAVE_CRYPT */ -#define HAVE_DLERROR 1 -#define HAVE_DLOPEN 1 -#define HAVE_FCHMOD 1 -#define HAVE_FCNTL 1 -/* #undef HAVE_FCONVERT */ -#define HAVE_FDATASYNC 1 -#define HAVE_FESETROUND 1 -#define HAVE_FINITE 1 -#define HAVE_FSEEKO 1 -#define HAVE_FSYNC 1 -#define HAVE_GETADDRINFO 1 -#define HAVE_GETCWD 1 -#define HAVE_GETHOSTBYADDR_R 1 -#define HAVE_GETHOSTBYNAME_R 1 -/* #undef HAVE_GETHRTIME */ -#define HAVE_GETNAMEINFO 1 -#define HAVE_GETPAGESIZE 1 -#define HAVE_GETPASS 1 -/* #undef HAVE_GETPASSPHRASE */ -#define HAVE_GETPWNAM 1 -#define HAVE_GETPWUID 1 -#define HAVE_GETRLIMIT 1 -#define HAVE_GETRUSAGE 1 -#define HAVE_GETWD 1 -#define HAVE_GMTIME_R 1 -#define HAVE_INITGROUPS 1 -#define HAVE_LDIV 1 -#define HAVE_LOCALTIME_R 1 -#define HAVE_LOG2 1 -#define HAVE_LONGJMP 1 -#define HAVE_LSTAT 1 -#define HAVE_MADVISE 1 -#define HAVE_MALLINFO 1 -#define HAVE_MEMALIGN 1 -#define HAVE_MEMCPY 1 -#define HAVE_MEMMOVE 1 -#define HAVE_MKSTEMP 1 -#define HAVE_MLOCK 1 -#define HAVE_MLOCKALL 1 -#define HAVE_MMAP 1 -#define HAVE_MMAP64 1 -#define HAVE_PERROR 1 -#define HAVE_POLL 1 -#define HAVE_PREAD 1 -/* #undef HAVE_PTHREAD_ATTR_CREATE */ -#define HAVE_PTHREAD_ATTR_GETSTACKSIZE 1 -/* #undef HAVE_PTHREAD_ATTR_SETPRIO */ -#define HAVE_PTHREAD_ATTR_SETSCHEDPARAM 1 -#define HAVE_PTHREAD_ATTR_SETSCOPE 1 -#define HAVE_PTHREAD_ATTR_SETSTACKSIZE 1 -/* #undef HAVE_PTHREAD_CONDATTR_CREATE */ -/* #undef HAVE_PTHREAD_INIT */ -#define HAVE_PTHREAD_KEY_DELETE 1 -#define HAVE_PTHREAD_KILL 1 -#define HAVE_PTHREAD_RWLOCK_RDLOCK 1 -/* #undef HAVE_PTHREAD_SETPRIO_NP */ -#define HAVE_PTHREAD_SETSCHEDPARAM 1 -#define HAVE_PTHREAD_SIGMASK 1 -/* #undef HAVE_PTHREAD_THREADMASK */ -/* #undef HAVE_PTHREAD_YIELD_NP */ -#define HAVE_READDIR_R 1 -#define HAVE_READLINK 1 -#define HAVE_REALPATH 1 -#define HAVE_RENAME 1 -#define HAVE_SCHED_YIELD 1 -#define HAVE_SELECT 1 -/* #undef HAVE_SETFD */ -/* #undef HAVE_SETFILEPOINTER */ -#define HAVE_SIGNAL 1 -#define HAVE_SIGACTION 1 -/* #undef HAVE_SIGTHREADMASK */ -#define HAVE_SIGWAIT 1 -#define HAVE_SLEEP 1 -#define HAVE_SNPRINTF 1 -/* #undef HAVE_SQLITE */ -#define HAVE_STPCPY 1 -#define HAVE_STRERROR 1 -/* #undef HAVE_STRLCPY */ -#define HAVE_STRNLEN 1 -#define HAVE_STRPBRK 1 -#define HAVE_STRSEP 1 -#define HAVE_STRSTR 1 -#define HAVE_STRTOK_R 1 -#define HAVE_STRTOL 1 -#define HAVE_STRTOLL 1 -#define HAVE_STRTOUL 1 -#define HAVE_STRTOULL 1 -/* #undef HAVE_TELL */ -/* #undef HAVE_THR_SETCONCURRENCY */ -/* #undef HAVE_THR_YIELD */ -#define HAVE_VASPRINTF 1 -#define HAVE_VSNPRINTF 1 - -/* - * types and sizes - */ -/* Types we may use */ -#define SIZEOF_CHAR 1 -#if defined(SIZEOF_CHAR) -# define HAVE_CHAR 1 -#endif - -#define SIZEOF_CHARP 8 -#if defined(SIZEOF_CHARP) -# define HAVE_CHARP 1 -#endif - -#define SIZEOF_SHORT 2 -#if defined(SIZEOF_SHORT) -# define HAVE_SHORT 1 -#endif - -#define SIZEOF_INT 4 -#if defined(SIZEOF_INT) -# define HAVE_INT 1 -#endif - -#define SIZEOF_LONG 8 -#if defined(SIZEOF_LONG) -# define HAVE_LONG 1 -#endif - -#define SIZEOF_LONG_LONG 8 -#if defined(SIZEOF_LONG_LONG) -# define HAVE_LONG_LONG 1 -#endif - - -#define SIZEOF_SIGSET_T 128 -#if defined(SIZEOF_SIGSET_T) -# define HAVE_SIGSET_T 1 -#endif - -#define SIZEOF_SIZE_T 8 -#if defined(SIZEOF_SIZE_T) -# define HAVE_SIZE_T 1 -#endif - -/* #undef SIZEOF_UCHAR */ -#if defined(SIZEOF_UCHAR) -# define HAVE_UCHAR 1 -#endif - -#define 
SIZEOF_UINT 4 -#if defined(SIZEOF_UINT) -# define HAVE_UINT 1 -#endif - -#define SIZEOF_ULONG 8 -#if defined(SIZEOF_ULONG) -# define HAVE_ULONG 1 -#endif - -/* #undef SIZEOF_INT8 */ -#if defined(SIZEOF_INT8) -# define HAVE_INT8 1 -#endif -/* #undef SIZEOF_UINT8 */ -#if defined(SIZEOF_UINT8) -# define HAVE_UINT8 1 -#endif - -/* #undef SIZEOF_INT16 */ -#if defined(SIZEOF_INT16) -# define HAVE_INT16 1 -#endif -/* #undef SIZEOF_UINT16 */ -#if defined(SIZEOF_UINT16) -# define HAVE_UINT16 1 -#endif - -/* #undef SIZEOF_INT32 */ -#if defined(SIZEOF_INT32) -# define HAVE_INT32 1 -#endif -/* #undef SIZEOF_UINT32 */ -#if defined(SIZEOF_UINT32) -# define HAVE_UINT32 1 -#endif -/* #undef SIZEOF_U_INT32_T */ -#if defined(SIZEOF_U_INT32_T) -# define HAVE_U_INT32_T 1 -#endif - -/* #undef SIZEOF_INT64 */ -#if defined(SIZEOF_INT64) -# define HAVE_INT64 1 -#endif -/* #undef SIZEOF_UINT64 */ -#if defined(SIZEOF_UINT64) -# define HAVE_UINT64 1 -#endif - -/* #undef SIZEOF_SOCKLEN_T */ -#if defined(SIZEOF_SOCKLEN_T) -# define HAVE_SOCKLEN_T 1 -#endif - -#define SOCKET_SIZE_TYPE socklen_t - -#define MARIADB_DEFAULT_CHARSET "latin1" - diff --git a/contrib/mariadb-connector-c-cmake/linux_x86_64/include/mariadb_version.h b/contrib/mariadb-connector-c-cmake/linux_x86_64/include/mariadb_version.h deleted file mode 100644 index 821a7f8add2..00000000000 --- a/contrib/mariadb-connector-c-cmake/linux_x86_64/include/mariadb_version.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright Abandoned 1996, 1999, 2001 MySQL AB - This file is public domain and comes with NO WARRANTY of any kind */ - -/* Version numbers for protocol & mysqld */ - -#ifndef _mariadb_version_h_ -#define _mariadb_version_h_ - -#ifdef _CUSTOMCONFIG_ -#include -#else -#define PROTOCOL_VERSION 10 -#define MARIADB_CLIENT_VERSION_STR "10.3.6" -#define MARIADB_BASE_VERSION "mariadb-10.3" -#define MARIADB_VERSION_ID 100306 -#define MYSQL_VERSION_ID 100306 -#define MARIADB_PORT 3306 -#define MARIADB_UNIX_ADDR "/var/run/mysqld/mysqld.sock" -#define MYSQL_CONFIG_NAME "my" - -#define MARIADB_PACKAGE_VERSION "3.0.6" -#define MARIADB_PACKAGE_VERSION_ID 30006 -#define MARIADB_SYSTEM_TYPE "Linux" -#define MARIADB_MACHINE_TYPE "x86_64" -#define MARIADB_PLUGINDIR "lib/mariadb/plugin" - -/* mysqld compile time options */ -#ifndef MYSQL_CHARSET -#define MYSQL_CHARSET "" -#endif -#endif - -/* Source information */ -#define CC_SOURCE_REVISION "a0fd36cc5a5313414a5a2ebe9322577a29b4782a" - -#endif /* _mariadb_version_h_ */ diff --git a/contrib/mariadb-connector-c-cmake/linux_x86_64/libmariadb/ma_client_plugin.c b/contrib/mariadb-connector-c-cmake/linux_x86_64/libmariadb/ma_client_plugin.c deleted file mode 100644 index 434a4b3f4c3..00000000000 --- a/contrib/mariadb-connector-c-cmake/linux_x86_64/libmariadb/ma_client_plugin.c +++ /dev/null @@ -1,502 +0,0 @@ -/* Copyright (C) 2010 - 2012 Sergei Golubchik and Monty Program Ab - 2015-2016 MariaDB Corporation AB - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. 
- - You should have received a copy of the GNU Library General Public - License along with this library; if not see - or write to the Free Software Foundation, Inc., - 51 Franklin St., Fifth Floor, Boston, MA 02110, USA */ - -/** - @file - - Support code for the client side (libmariadb) plugins - - Client plugins are somewhat different from server plugins, they are simpler. - - They do not need to be installed or in any way explicitly loaded on the - client, they are loaded automatically on demand. - One client plugin per shared object, soname *must* match the plugin name. - - There is no reference counting and no unloading either. -*/ - -#if _MSC_VER -/* Silence warnings about variable 'unused' being used. */ -#define FORCE_INIT_OF_VARS 1 -#endif - -#include -#include -#include -#include -#include - -#include "errmsg.h" -#include - -struct st_client_plugin_int { - struct st_client_plugin_int *next; - void *dlhandle; - struct st_mysql_client_plugin *plugin; -}; - -static my_bool initialized= 0; -static MA_MEM_ROOT mem_root; - -static uint valid_plugins[][2]= { - {MYSQL_CLIENT_AUTHENTICATION_PLUGIN, MYSQL_CLIENT_AUTHENTICATION_PLUGIN_INTERFACE_VERSION}, - {MARIADB_CLIENT_PVIO_PLUGIN, MARIADB_CLIENT_PVIO_PLUGIN_INTERFACE_VERSION}, - {MARIADB_CLIENT_TRACE_PLUGIN, MARIADB_CLIENT_TRACE_PLUGIN_INTERFACE_VERSION}, - {MARIADB_CLIENT_CONNECTION_PLUGIN, MARIADB_CLIENT_CONNECTION_PLUGIN_INTERFACE_VERSION}, - {0, 0} -}; - -/* - Loaded plugins are stored in a linked list. - The list is append-only, the elements are added to the head (like in a stack). - The elements are added under a mutex, but the list can be read and traversed - without any mutex because once an element is added to the list, it stays - there. The main purpose of a mutex is to prevent two threads from - loading the same plugin twice in parallel. 
-*/ - - -struct st_client_plugin_int *plugin_list[MYSQL_CLIENT_MAX_PLUGINS + MARIADB_CLIENT_MAX_PLUGINS]; -#ifdef THREAD -static pthread_mutex_t LOCK_load_client_plugin; -#endif - -extern struct st_mysql_client_plugin mysql_native_password_client_plugin; -extern struct st_mysql_client_plugin mysql_old_password_client_plugin; -extern struct st_mysql_client_plugin pvio_socket_client_plugin; -extern struct st_mysql_client_plugin sha256_password_client_plugin; -extern struct st_mysql_client_plugin caching_sha2_password_client_plugin; - - -struct st_mysql_client_plugin *mysql_client_builtins[]= -{ - (struct st_mysql_client_plugin *)&mysql_native_password_client_plugin, - (struct st_mysql_client_plugin *)&mysql_old_password_client_plugin, - (struct st_mysql_client_plugin *)&pvio_socket_client_plugin, - (struct st_mysql_client_plugin *)&sha256_password_client_plugin, - (struct st_mysql_client_plugin *)&caching_sha2_password_client_plugin, - 0 -}; - - -static int is_not_initialized(MYSQL *mysql, const char *name) -{ - if (initialized) - return 0; - - my_set_error(mysql, CR_AUTH_PLUGIN_CANNOT_LOAD, - SQLSTATE_UNKNOWN, ER(CR_AUTH_PLUGIN_CANNOT_LOAD), - name, "not initialized"); - return 1; -} - -static int get_plugin_nr(uint type) -{ - uint i= 0; - for(; valid_plugins[i][1]; i++) - if (valid_plugins[i][0] == type) - return i; - return -1; -} - -static const char *check_plugin_version(struct st_mysql_client_plugin *plugin, unsigned int version) -{ - if (plugin->interface_version < version || - (plugin->interface_version >> 8) > (version >> 8)) - return "Incompatible client plugin interface"; - return 0; -} - -/** - finds a plugin in the list - - @param name plugin name to search for - @param type plugin type - - @note this does NOT necessarily need a mutex, take care! 
- - @retval a pointer to a found plugin or 0 -*/ -static struct st_mysql_client_plugin *find_plugin(const char *name, int type) -{ - struct st_client_plugin_int *p; - int plugin_nr= get_plugin_nr(type); - - DBUG_ASSERT(initialized); - if (plugin_nr == -1) - return 0; - - if (!name) - return plugin_list[plugin_nr]->plugin; - - for (p= plugin_list[plugin_nr]; p; p= p->next) - { - if (strcmp(p->plugin->name, name) == 0) - return p->plugin; - } - return NULL; -} - - -/** - verifies the plugin and adds it to the list - - @param mysql MYSQL structure (for error reporting) - @param plugin plugin to install - @param dlhandle a handle to the shared object (returned by dlopen) - or 0 if the plugin was not dynamically loaded - @param argc number of arguments in the 'va_list args' - @param args arguments passed to the plugin initialization function - - @retval a pointer to an installed plugin or 0 -*/ - -static struct st_mysql_client_plugin * -add_plugin(MYSQL *mysql, struct st_mysql_client_plugin *plugin, void *dlhandle, - int argc, va_list args) -{ - const char *errmsg; - struct st_client_plugin_int plugin_int, *p; - char errbuf[1024]; - int plugin_nr; - - DBUG_ASSERT(initialized); - - plugin_int.plugin= plugin; - plugin_int.dlhandle= dlhandle; - - if ((plugin_nr= get_plugin_nr(plugin->type)) == -1) - { - errmsg= "Unknown client plugin type"; - goto err1; - } - if ((errmsg= check_plugin_version(plugin, valid_plugins[plugin_nr][1]))) - goto err1; - - /* Call the plugin initialization function, if any */ - if (plugin->init && plugin->init(errbuf, sizeof(errbuf), argc, args)) - { - errmsg= errbuf; - goto err1; - } - - p= (struct st_client_plugin_int *) - ma_memdup_root(&mem_root, (char *)&plugin_int, sizeof(plugin_int)); - - if (!p) - { - errmsg= "Out of memory"; - goto err2; - } - -#ifdef THREAD - safe_mutex_assert_owner(&LOCK_load_client_plugin); -#endif - - p->next= plugin_list[plugin_nr]; - plugin_list[plugin_nr]= p; - - return plugin; - -err2: - if (plugin->deinit) - plugin->deinit(); -err1: - my_set_error(mysql, CR_AUTH_PLUGIN_CANNOT_LOAD, SQLSTATE_UNKNOWN, - ER(CR_AUTH_PLUGIN_CANNOT_LOAD), plugin->name, errmsg); - if (dlhandle) - (void)dlclose(dlhandle); - return NULL; -} - - -/** - Loads plugins which are specified in the environment variable - LIBMYSQL_PLUGINS. - - Multiple plugins must be separated by semicolon. This function doesn't - return or log an error. - - The function is be called by mysql_client_plugin_init - - @todo - Support extended syntax, passing parameters to plugins, for example - LIBMYSQL_PLUGINS="plugin1(param1,param2);plugin2;..." - or - LIBMYSQL_PLUGINS="plugin1=int:param1,str:param2;plugin2;..." -*/ - -static void load_env_plugins(MYSQL *mysql) -{ - char *plugs, *free_env, *s= getenv("LIBMYSQL_PLUGINS"); - - if (ma_check_env_str(s)) - return; - - free_env= strdup(s); - plugs= s= free_env; - - do { - if ((s= strchr(plugs, ';'))) - *s= '\0'; - mysql_load_plugin(mysql, plugs, -1, 0); - plugs= s + 1; - } while (s); - - free(free_env); -} - -/********** extern functions to be used by libmariadb *********************/ - -/** - Initializes the client plugin layer. - - This function must be called before any other client plugin function. 
- - @retval 0 successful - @retval != 0 error occurred -*/ - -int mysql_client_plugin_init() -{ - MYSQL mysql; - struct st_mysql_client_plugin **builtin; - va_list unused; - LINT_INIT_STRUCT(unused); - - if (initialized) - return 0; - - memset(&mysql, 0, sizeof(mysql)); /* dummy mysql for set_mysql_extended_error */ - - pthread_mutex_init(&LOCK_load_client_plugin, MY_MUTEX_INIT_SLOW); - ma_init_alloc_root(&mem_root, 128, 128); - - memset(&plugin_list, 0, sizeof(plugin_list)); - - initialized= 1; - - pthread_mutex_lock(&LOCK_load_client_plugin); - for (builtin= mysql_client_builtins; *builtin; builtin++) - add_plugin(&mysql, *builtin, 0, 0, unused); - - pthread_mutex_unlock(&LOCK_load_client_plugin); - - load_env_plugins(&mysql); - - return 0; -} - - -/** - Deinitializes the client plugin layer. - - Unloades all client plugins and frees any associated resources. -*/ - -void mysql_client_plugin_deinit() -{ - int i; - struct st_client_plugin_int *p; - - if (!initialized) - return; - - for (i=0; i < MYSQL_CLIENT_MAX_PLUGINS; i++) - for (p= plugin_list[i]; p; p= p->next) - { - if (p->plugin->deinit) - p->plugin->deinit(); - if (p->dlhandle) - (void)dlclose(p->dlhandle); - } - - memset(&plugin_list, 0, sizeof(plugin_list)); - initialized= 0; - ma_free_root(&mem_root, MYF(0)); - pthread_mutex_destroy(&LOCK_load_client_plugin); -} - -/************* public facing functions, for client consumption *********/ - -/* see for a full description */ -struct st_mysql_client_plugin * STDCALL -mysql_client_register_plugin(MYSQL *mysql, - struct st_mysql_client_plugin *plugin) -{ - va_list unused; - LINT_INIT_STRUCT(unused); - - if (is_not_initialized(mysql, plugin->name)) - return NULL; - - pthread_mutex_lock(&LOCK_load_client_plugin); - - /* make sure the plugin wasn't loaded meanwhile */ - if (find_plugin(plugin->name, plugin->type)) - { - my_set_error(mysql, CR_AUTH_PLUGIN_CANNOT_LOAD, - SQLSTATE_UNKNOWN, ER(CR_AUTH_PLUGIN_CANNOT_LOAD), - plugin->name, "it is already loaded"); - plugin= NULL; - } - else - plugin= add_plugin(mysql, plugin, 0, 0, unused); - - pthread_mutex_unlock(&LOCK_load_client_plugin); - return plugin; -} - - -/* see for a full description */ -struct st_mysql_client_plugin * STDCALL -mysql_load_plugin_v(MYSQL *mysql, const char *name, int type, - int argc, va_list args) -{ - const char *errmsg; -#ifdef _WIN32 - char errbuf[1024]; -#endif - char dlpath[FN_REFLEN+1]; - void *sym, *dlhandle = NULL; - struct st_mysql_client_plugin *plugin; - char *env_plugin_dir= getenv("MARIADB_PLUGIN_DIR"); - - CLEAR_CLIENT_ERROR(mysql); - if (is_not_initialized(mysql, name)) - return NULL; - - pthread_mutex_lock(&LOCK_load_client_plugin); - - /* make sure the plugin wasn't loaded meanwhile */ - if (type >= 0 && find_plugin(name, type)) - { - errmsg= "it is already loaded"; - goto err; - } - - /* Compile dll path */ - snprintf(dlpath, sizeof(dlpath) - 1, "%s/%s%s", - mysql->options.extension && mysql->options.extension->plugin_dir ? - mysql->options.extension->plugin_dir : (env_plugin_dir) ? 
env_plugin_dir : - MARIADB_PLUGINDIR, name, SO_EXT); - - /* Open new dll handle */ - if (!(dlhandle= dlopen((const char *)dlpath, RTLD_NOW))) - { -#ifdef _WIN32 - char winmsg[255]; - size_t len; - winmsg[0] = 0; - FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, - NULL, - GetLastError(), - MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), - winmsg, 255, NULL); - len= strlen(winmsg); - while (len > 0 && (winmsg[len - 1] == '\n' || winmsg[len - 1] == '\r')) - len--; - if (len) - winmsg[len] = 0; - snprintf(errbuf, sizeof(errbuf), "%s Library path is '%s'", winmsg, dlpath); - errmsg= errbuf; -#else - errmsg= dlerror(); -#endif - goto err; - } - - - if (!(sym= dlsym(dlhandle, plugin_declarations_sym))) - { - errmsg= "not a plugin"; - (void)dlclose(dlhandle); - goto err; - } - - plugin= (struct st_mysql_client_plugin*)sym; - - if (type >=0 && type != plugin->type) - { - errmsg= "type mismatch"; - goto err; - } - - if (strcmp(name, plugin->name)) - { - errmsg= "name mismatch"; - goto err; - } - - if (type < 0 && find_plugin(name, plugin->type)) - { - errmsg= "it is already loaded"; - goto err; - } - - plugin= add_plugin(mysql, plugin, dlhandle, argc, args); - - pthread_mutex_unlock(&LOCK_load_client_plugin); - - return plugin; - -err: - if (dlhandle) - dlclose(dlhandle); - pthread_mutex_unlock(&LOCK_load_client_plugin); - my_set_error(mysql, CR_AUTH_PLUGIN_CANNOT_LOAD, SQLSTATE_UNKNOWN, - ER(CR_AUTH_PLUGIN_CANNOT_LOAD), name, errmsg); - return NULL; -} - - -/* see for a full description */ -struct st_mysql_client_plugin * STDCALL -mysql_load_plugin(MYSQL *mysql, const char *name, int type, int argc, ...) -{ - struct st_mysql_client_plugin *p; - va_list args; - va_start(args, argc); - p= mysql_load_plugin_v(mysql, name, type, argc, args); - va_end(args); - return p; -} - -/* see for a full description */ -struct st_mysql_client_plugin * STDCALL -mysql_client_find_plugin(MYSQL *mysql, const char *name, int type) -{ - struct st_mysql_client_plugin *p; - int plugin_nr= get_plugin_nr(type); - - if (is_not_initialized(mysql, name)) - return NULL; - - if (plugin_nr == -1) - { - my_set_error(mysql, CR_AUTH_PLUGIN_CANNOT_LOAD, SQLSTATE_UNKNOWN, - ER(CR_AUTH_PLUGIN_CANNOT_LOAD), name, "invalid type"); - } - - if ((p= find_plugin(name, type))) - return p; - - /* not found, load it */ - return mysql_load_plugin(mysql, name, type, 0); -} - diff --git a/libs/libmysqlxx/CMakeLists.txt b/libs/libmysqlxx/CMakeLists.txt index 263a031d7b0..2d2ad75628d 100644 --- a/libs/libmysqlxx/CMakeLists.txt +++ b/libs/libmysqlxx/CMakeLists.txt @@ -29,7 +29,8 @@ add_library (mysqlxx target_include_directories (mysqlxx PUBLIC include) if (USE_INTERNAL_MYSQL_LIBRARY) - + target_include_directories (mysqlxx PUBLIC ${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c/include) + target_include_directories (mysqlxx PUBLIC ${ClickHouse_BINARY_DIR}/contrib/mariadb-connector-c/include) else () set(PLATFORM_LIBRARIES ${CMAKE_DL_LIBS}) diff --git a/libs/libmysqlxx/cmake/find_mysqlclient.cmake b/libs/libmysqlxx/cmake/find_mysqlclient.cmake index 98b42a0a9b4..e07ebe2304d 100644 --- a/libs/libmysqlxx/cmake/find_mysqlclient.cmake +++ b/libs/libmysqlxx/cmake/find_mysqlclient.cmake @@ -6,14 +6,14 @@ if(ENABLE_MYSQL) option(USE_INTERNAL_MYSQL_LIBRARY "Set to FALSE to use system mysqlclient library instead of bundled" OFF) endif() - if(USE_INTERNAL_MYSQL_LIBRARY AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c/README.md") + if(USE_INTERNAL_MYSQL_LIBRARY AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/mariadb-connector-c/README") 
message(WARNING "submodule contrib/mariadb-connector-c is missing. to fix try run: \n git submodule update --init --recursive") set(USE_INTERNAL_MYSQL_LIBRARY 0) endif() if (USE_INTERNAL_MYSQL_LIBRARY) - set (MYSQLCLIENT_LIBRARIES mysqlclient) + set (MYSQLCLIENT_LIBRARIES mariadbclient) set (USE_MYSQL 1) set (MYSQLXX_LIBRARY mysqlxx) else () diff --git a/libs/libmysqlxx/src/Connection.cpp b/libs/libmysqlxx/src/Connection.cpp index 0e7d7bd5d3e..7ba14c9baba 100644 --- a/libs/libmysqlxx/src/Connection.cpp +++ b/libs/libmysqlxx/src/Connection.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/Exception.cpp b/libs/libmysqlxx/src/Exception.cpp index dadd37e29e7..b065d17ed51 100644 --- a/libs/libmysqlxx/src/Exception.cpp +++ b/libs/libmysqlxx/src/Exception.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/Pool.cpp b/libs/libmysqlxx/src/Pool.cpp index a17246e5d6d..410ac062039 100644 --- a/libs/libmysqlxx/src/Pool.cpp +++ b/libs/libmysqlxx/src/Pool.cpp @@ -1,6 +1,6 @@ -#if __has_include() -#include -#include +#if __has_include() +#include +#include #else #include #include diff --git a/libs/libmysqlxx/src/Query.cpp b/libs/libmysqlxx/src/Query.cpp index 6f275c918a5..dc5c3274641 100644 --- a/libs/libmysqlxx/src/Query.cpp +++ b/libs/libmysqlxx/src/Query.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/ResultBase.cpp b/libs/libmysqlxx/src/ResultBase.cpp index b03f92e38f2..eac1e22ca3d 100644 --- a/libs/libmysqlxx/src/ResultBase.cpp +++ b/libs/libmysqlxx/src/ResultBase.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/Row.cpp b/libs/libmysqlxx/src/Row.cpp index e4baa681d69..aecec46e519 100644 --- a/libs/libmysqlxx/src/Row.cpp +++ b/libs/libmysqlxx/src/Row.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/StoreQueryResult.cpp b/libs/libmysqlxx/src/StoreQueryResult.cpp index 05ad4299e17..a09986a3014 100644 --- a/libs/libmysqlxx/src/StoreQueryResult.cpp +++ b/libs/libmysqlxx/src/StoreQueryResult.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif diff --git a/libs/libmysqlxx/src/UseQueryResult.cpp b/libs/libmysqlxx/src/UseQueryResult.cpp index c5c52ffcb9c..19daca90b15 100644 --- a/libs/libmysqlxx/src/UseQueryResult.cpp +++ b/libs/libmysqlxx/src/UseQueryResult.cpp @@ -1,5 +1,5 @@ -#if __has_include() -#include +#if __has_include() +#include #else #include #endif From 403b156114f5ef9cfe312b3e2e481ff1abc9329d Mon Sep 17 00:00:00 2001 From: Yuriy Date: Wed, 25 Sep 2019 01:30:26 +0300 Subject: [PATCH 249/309] fixed build without glibc-compatibility --- contrib/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 5e6f90b6a59..38f2c96b0f9 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -132,7 +132,9 @@ if (ENABLE_MYSQL AND USE_INTERNAL_MYSQL_LIBRARY) set(CLIENT_PLUGIN_CLIENT_ED25519 OFF) set(CLIENT_PLUGIN_MYSQL_CLEAR_PASSWORD OFF) set(SKIP_TESTS 1) - set(LIBM glibc-compatibility) + if (GLIBC_COMPATIBILITY) + set(LIBM glibc-compatibility) + endif() add_subdirectory (mariadb-connector-c) endif () From 
c3c3f3829ea2b470beadfcfbe7959db4337650f7 Mon Sep 17 00:00:00 2001 From: BayoNet Date: Wed, 25 Sep 2019 02:07:52 +0300 Subject: [PATCH 250/309] DOCAPI-7719: Columns matcher docs (#6831) * Typo fix. * DOCAPI-7719: Docs * DOCAPI-7719: Text. * DOCAPI-7719: Text. * DOCAPI-7719: Update. * DOCAPI-7719: Update. --- docs/en/operations/table_engines/mysql.md | 4 +- docs/en/query_language/select.md | 62 +++++++++++++++++++++-- 2 files changed, 59 insertions(+), 7 deletions(-) diff --git a/docs/en/operations/table_engines/mysql.md b/docs/en/operations/table_engines/mysql.md index 9dac9ba6478..198f98d9838 100644 --- a/docs/en/operations/table_engines/mysql.md +++ b/docs/en/operations/table_engines/mysql.md @@ -66,7 +66,7 @@ mysql> select * from test; 1 row in set (0,00 sec) ``` -Table in ClickHouse, retrieving data from the MySQL table: +Table in ClickHouse, retrieving data from the MySQL table created above: ```sql CREATE TABLE mysql_table @@ -77,7 +77,7 @@ CREATE TABLE mysql_table ENGINE = MySQL('localhost:3306', 'test', 'test', 'bayonet', '123') ``` ```sql -SELECT * FROM mysql_table6 +SELECT * FROM mysql_table ``` ```text ┌─float_nullable─┬─int_id─┐ diff --git a/docs/en/query_language/select.md b/docs/en/query_language/select.md index fb1c529a75b..8ffc4b8efdc 100644 --- a/docs/en/query_language/select.md +++ b/docs/en/query_language/select.md @@ -962,12 +962,64 @@ Running a query may use more memory than 'max_bytes_before_external_sort'. For t External sorting works much less effectively than sorting in RAM. -### SELECT Clause +### SELECT Clause {#select-select} + +[Expressions](syntax.md#syntax-expressions) that specified in the `SELECT` clause are analyzed after the calculations for all the clauses listed above are completed. More specifically, expressions are analyzed that are above the aggregate functions, if there are any aggregate functions. The aggregate functions and everything below them are calculated during aggregation (`GROUP BY`). These expressions work as if they are applied to separate rows in the result. + +If you want to get all columns in the result, use the asterisk (`*`) symbol. For example, `SELECT * FROM ...`. + +To match some columns in the result by a [re2](https://en.wikipedia.org/wiki/RE2_(software)) regular expression, you can use the `COLUMNS` expression. + +```sql +COLUMNS('regexp') +``` + +For example, consider the table: + +```sql +CREATE TABLE default.col_names (aa Int8, ab Int8, bc Int8) ENGINE = TinyLog +``` + +The following query selects data from all the columns containing the `a` symbol in their name. + +```sql +SELECT COLUMNS('a') FROM col_names +``` +```text +┌─aa─┬─ab─┐ +│ 1 │ 1 │ +└────┴────┘ +``` + +You can use multiple `COLUMNS` expressions in a query, also you can apply functions to it. + +For example: + +```sql +SELECT COLUMNS('a'), COLUMNS('c'), toTypeName(COLUMNS('c')) FROM col_names +``` +```text +┌─aa─┬─ab─┬─bc─┬─toTypeName(bc)─┐ +│ 1 │ 1 │ 1 │ Int8 │ +└────┴────┴────┴────────────────┘ +``` + +Be careful when using functions because the `COLUMN` expression returns variable number of columns, and, if a function doesn't support this number of arguments, ClickHouse throws an exception. + +For example: + +```sql +SELECT COLUMNS('a') + COLUMNS('c') FROM col_names +``` +```text +Received exception from server (version 19.14.1): +Code: 42. DB::Exception: Received from localhost:9000. DB::Exception: Number of arguments for function plus doesn't match: passed 3, should be 2. 
+``` + +In this example, `COLUMNS('a')` returns two columns `aa`, `ab`, and `COLUMNS('c')` returns the `bc` column. The `+` operator can't apply to 3 arguments, so ClickHouse throws an exception with the message about it. + +Columns that matched by the `COLUMNS` expression can be in different types. If `COLUMNS` doesn't match any columns and it is the single expression in `SELECT`, ClickHouse throws an exception. -The expressions specified in the SELECT clause are analyzed after the calculations for all the clauses listed above are completed. -More specifically, expressions are analyzed that are above the aggregate functions, if there are any aggregate functions. -The aggregate functions and everything below them are calculated during aggregation (GROUP BY). -These expressions work as if they are applied to separate rows in the result. ### DISTINCT Clause {#select-distinct} From 30136fa4ad1b64ace38f51411ccdfe85b71f1110 Mon Sep 17 00:00:00 2001 From: Denis Zhuravlev Date: Tue, 24 Sep 2019 20:14:23 -0300 Subject: [PATCH 251/309] Doc change. Added description for interserver_http_credentials (#6927) * Update settings.md Added description for interserver_http_credentials * Update settings.md Added description for interserver_http_credentials * Update settings.md * Update settings.md * Update settings.md * Update settings.md --- .../en/operations/server_settings/settings.md | 19 ++++++++++++++++++ .../ru/operations/server_settings/settings.md | 20 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/docs/en/operations/server_settings/settings.md b/docs/en/operations/server_settings/settings.md index f884a7b2963..796e4621475 100644 --- a/docs/en/operations/server_settings/settings.md +++ b/docs/en/operations/server_settings/settings.md @@ -260,6 +260,25 @@ Useful for breaking away from a specific network interface. example.yandex.ru ``` +## interserver_http_credentials {#server-settings-interserver_http_credentials} + +The username and password used to authenticate during [replication](../table_engines/replication.md) with the Replicated* engines. These credentials are used only for communication between replicas and are unrelated to credentials for ClickHouse clients. The server is checking these credentials for connecting replicas and use the same credentials when connecting to other replicas. So, these credentials should be set the same for all replicas in a cluster. +By default, the authentication is not used. + +This section contains the following parameters: + +- `user` — username. +- `password` — password. + +**Example** + +```xml + + admin + 222 + +``` + ## keep_alive_timeout diff --git a/docs/ru/operations/server_settings/settings.md b/docs/ru/operations/server_settings/settings.md index 39523db7d36..fa90d61c876 100644 --- a/docs/ru/operations/server_settings/settings.md +++ b/docs/ru/operations/server_settings/settings.md @@ -260,6 +260,26 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat ``` +## interserver_http_credentials {#server-settings-interserver_http_credentials} + +Имя пользователя и пароль, использующиеся для аутентификации при [репликации](../table_engines/replication.md) движками Replicated*. Это имя пользователя и пароль используются только для взаимодействия между репликами кластера и никак не связаны с аутентификацией клиентов ClickHouse. Сервер проверяет совпадение имени и пароля для соединяющихся с ним реплик, а также использует это же имя и пароль для соединения с другими репликами. 
Соответственно, эти имя и пароль должны быть прописаны одинаковыми для всех реплик кластера. +По умолчанию аутентификация не используется. + +Раздел содержит следующие параметры: + +- `user` — имя пользователя. +- `password` — пароль. + +**Пример конфигурации** + +```xml + + admin + 222 + +``` + + ## keep_alive_timeout Время в секундах, в течение которого ClickHouse ожидает входящих запросов прежде, чем закрыть соединение. From f5ba279a6da8f1253a4412707ef051f700ddf35c Mon Sep 17 00:00:00 2001 From: BayoNet Date: Wed, 25 Sep 2019 02:19:14 +0300 Subject: [PATCH 252/309] DOCAPI-8163: Update for EN docs for sequenceMatch and sequenceCount (#6952) * Typo fix. * Links fix. * Fixed links in docs. * More fixes. * DOCAPI-8163: Docs for sequenceMatch and sequenceCount. --- .../agg_functions/parametric_functions.md | 164 ++++++++++++++---- 1 file changed, 133 insertions(+), 31 deletions(-) diff --git a/docs/en/query_language/agg_functions/parametric_functions.md b/docs/en/query_language/agg_functions/parametric_functions.md index 47196e2a4eb..13cbc2b05d8 100644 --- a/docs/en/query_language/agg_functions/parametric_functions.md +++ b/docs/en/query_language/agg_functions/parametric_functions.md @@ -71,51 +71,153 @@ FROM In this case, you should remember that you don't know the histogram bin borders. -## sequenceMatch(pattern)(time, cond1, cond2, ...) +## sequenceMatch(pattern)(timestamp, cond1, cond2, ...) {#function-sequencematch} -Pattern matching for event chains. - -`pattern` is a string containing a pattern to match. The pattern is similar to a regular expression. - -`time` is the time of the event, type support: `Date`,`DateTime`, and other unsigned integer types. - -`cond1`, `cond2` ... is from one to 32 arguments of type UInt8 that indicate whether a certain condition was met for the event. - -The function collects a sequence of events in RAM. Then it checks whether this sequence matches the pattern. -It returns UInt8: 0 if the pattern isn't matched, or 1 if it matches. - -Example: `sequenceMatch ('(?1).*(?2)')(EventTime, URL LIKE '%company%', URL LIKE '%cart%')` - -- whether there was a chain of events in which a pageview with 'company' in the address occurred earlier than a pageview with 'cart' in the address. - -This is a singular example. You could write it using other aggregate functions: +Checks whether the sequence contains the event chain that matches the pattern. ```sql -minIf(EventTime, URL LIKE '%company%') < maxIf(EventTime, URL LIKE '%cart%'). +sequenceMatch(pattern)(timestamp, cond1, cond2, ...) ``` -However, there is no such solution for more complex situations. +!!! warning "Warning" + Events that occur at the same second may lay in the sequence in an undefined order affecting the result. -Pattern syntax: -`(?1)` refers to the condition (any number can be used in place of 1). +**Parameters** -`.*` is any number of any events. +- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax). -`(?t>=1800)` is a time condition. +- `timestamp` — Column that considered to contain time data. Typical data types are `Date`, and `DateTime`. You can use also any of the supported [UInt](../../data_types/int_uint.md) data types. -Any quantity of any type of events is allowed over the specified time. +- `cond1`, `cond2` — Conditions that describe the chain of events. Data type: `UInt8`. You can pass up to 32 condition arguments. The function takes into account only the events described in these conditions. 
If the sequence contains data that are not described with conditions the function skips them. -Instead of `>=`, the following operators can be used:`<`, `>`, `<=`. -Any number may be specified in place of 1800. +**Returned values** -Events that occur during the same second can be put in the chain in any order. This may affect the result of the function. +- 1, if the pattern is matched. +- 0, if the pattern isn't matched. -## sequenceCount(pattern)(time, cond1, cond2, ...) -Works the same way as the sequenceMatch function, but instead of returning whether there is an event chain, it returns UInt64 with the number of event chains found. -Chains are searched for without overlapping. In other words, the next chain can start only after the end of the previous one. +Type: `UInt8`. + + + +**Pattern syntax** + +- `(?N)` — Matches the condition argument at the position `N`. Conditions are numbered in the `[1, 32]` range. For example, `(?1)` matches the argument passed to the `cond1` parameter. + +- `.*` — Matches any number of any events. You don't need the conditional arguments to match this element of the pattern. + +- `(?t operator value)` — Sets the time in seconds that should separate two events. For example, pattern `(?1)(?t>1800)(?2)` matches events that distanced from each other for more than 1800 seconds. An arbitrary number of any events can lay between these events. You can use the `>=`, `>`, `<`, `<=` operators. + +**Examples** + +Consider data in the `t` table: + +```text +┌─time─┬─number─┐ +│ 1 │ 1 │ +│ 2 │ 3 │ +│ 3 │ 2 │ +└──────┴────────┘ +``` + +Perform the query: + +```sql +SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2) FROM t +``` +```text +┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2))─┐ +│ 1 │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +The function has found the event chain where number 2 follows number 1. It skipped number 3 between them, because the number is not described as an event. If we want to take this number into account when searching for the event chain, showed in the example, we should make a condition for it. + +```sql +SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 3) FROM t +``` +```text +┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2), equals(number, 3))─┐ +│ 0 │ +└──────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +In this case the function couldn't find the event chain matching the pattern, because there is the event for number 3 occured between 1 and 2. If in the same case we checked the condition for number 4, the sequence would match the pattern. + +```sql +SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM t +``` +```text +┌─sequenceMatch('(?1)(?2)')(time, equals(number, 1), equals(number, 2), equals(number, 4))─┐ +│ 1 │ +└──────────────────────────────────────────────────────────────────────────────────────────┘ +``` + + +**See Also** + +- [sequenceCount](#function-sequencecount) + + +## sequenceCount(pattern)(time, cond1, cond2, ...) {#function-sequencecount} + +Counts the number of event chains that matched the pattern. The function searches event chains that not overlap. It starts to search for the next chain after the current chain is matched. + +!!! warning "Warning" + Events that occur at the same second may lay in the sequence in an undefined order affecting the result. + +```sql +sequenceCount(pattern)(timestamp, cond1, cond2, ...) 
+``` + +**Parameters** + +- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax). + +- `timestamp` — Column that considered to contain time data. Typical data types are `Date`, and `DateTime`. You can also use any of the supported [UInt](../../data_types/int_uint.md) data types. + +- `cond1`, `cond2` — Conditions that describe the chain of events. Data type: `UInt8`. You can pass up to 32 condition arguments. The function takes into account only the events described in these conditions. If the sequence contains data that are not described with conditions the function skips them. + + +**Returned values** + +- Number of non-overlapping event chains that are matched + +Type: `UInt64`. + + +**Example** + +Consider data in the `t` table: + +```text +┌─time─┬─number─┐ +│ 1 │ 1 │ +│ 2 │ 3 │ +│ 3 │ 2 │ +│ 4 │ 1 │ +│ 5 │ 3 │ +│ 6 │ 2 │ +└──────┴────────┘ +``` + +Count how many times the number 2 occurs after the number 1 with any amount of other numbers between them: + +```sql +SELECT sequenceCount('(?1).*(?2)')(time, number = 1, number = 2) FROM t +``` +```text +┌─sequenceCount('(?1).*(?2)')(time, equals(number, 1), equals(number, 2))─┐ +│ 2 │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +**See Also** + +- [sequenceMatch](#function-sequencematch) + ## windowFunnel(window)(timestamp, cond1, cond2, cond3, ...) @@ -128,7 +230,7 @@ windowFunnel(window)(timestamp, cond1, cond2, cond3, ...) **Parameters:** - `window` — Length of the sliding window in seconds. -- `timestamp` — Name of the column containing the timestamp. Data type support: `Date`,`DateTime`, and other unsigned integer types(Note that though timestamp support `UInt64` type, there is a limitation it's value can't overflow maximum of Int64, which is 2^63 - 1). +- `timestamp` — Name of the column containing the timestamp. Data type support: `Date`,`DateTime`, and other unsigned integer types (note that though timestamp support `UInt64` type, there is a limitation it's value can't overflow maximum of Int64, which is 2^63 - 1). - `cond1`, `cond2`... — Conditions or data describing the chain of events. Data type: `UInt8`. Values can be 0 or 1. 
**Algorithm** From 2435e9a12c19b45bbd2675a27e6fbb56eec863cf Mon Sep 17 00:00:00 2001 From: Guillaume Tassery Date: Wed, 25 Sep 2019 06:33:54 +0200 Subject: [PATCH 253/309] Add simple HTTP right credentials --- dbms/src/Dictionaries/HTTPDictionarySource.cpp | 17 +++++++++++++---- dbms/src/Dictionaries/HTTPDictionarySource.h | 2 ++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/dbms/src/Dictionaries/HTTPDictionarySource.cpp b/dbms/src/Dictionaries/HTTPDictionarySource.cpp index fffbcc402b0..b6421480f93 100644 --- a/dbms/src/Dictionaries/HTTPDictionarySource.cpp +++ b/dbms/src/Dictionaries/HTTPDictionarySource.cpp @@ -34,6 +34,13 @@ HTTPDictionarySource::HTTPDictionarySource( , context(context_) , timeouts(ConnectionTimeouts::getHTTPTimeouts(context)) { + const auto & credentials_prefix = config_prefix + ".credentials"; + + if (config.has(credentials_prefix)) + { + this->credentials.setUsername(config.getString(credentials_prefix + ".user", "")); + this->credentials.setPassword(config.getString(credentials_prefix + ".password", "")); + } } HTTPDictionarySource::HTTPDictionarySource(const HTTPDictionarySource & other) @@ -47,6 +54,8 @@ HTTPDictionarySource::HTTPDictionarySource(const HTTPDictionarySource & other) , context(other.context) , timeouts(ConnectionTimeouts::getHTTPTimeouts(context)) { + this->credentials.setUsername(other.credentials.getUsername()); + this->credentials.setPassword(other.credentials.getPassword()); } void HTTPDictionarySource::getUpdateFieldAndDate(Poco::URI & uri) @@ -74,7 +83,7 @@ BlockInputStreamPtr HTTPDictionarySource::loadAll() LOG_TRACE(log, "loadAll " + toString()); Poco::URI uri(url); auto in_ptr = std::make_unique( - uri, Poco::Net::HTTPRequest::HTTP_GET, ReadWriteBufferFromHTTP::OutStreamCallback(), timeouts); + uri, Poco::Net::HTTPRequest::HTTP_GET, ReadWriteBufferFromHTTP::OutStreamCallback(), timeouts, 0, this->credentials); auto input_stream = context.getInputFormat(format, *in_ptr, sample_block, max_block_size); return std::make_shared>(input_stream, std::move(in_ptr)); } @@ -85,7 +94,7 @@ BlockInputStreamPtr HTTPDictionarySource::loadUpdatedAll() getUpdateFieldAndDate(uri); LOG_TRACE(log, "loadUpdatedAll " + uri.toString()); auto in_ptr = std::make_unique( - uri, Poco::Net::HTTPRequest::HTTP_GET, ReadWriteBufferFromHTTP::OutStreamCallback(), timeouts); + uri, Poco::Net::HTTPRequest::HTTP_GET, ReadWriteBufferFromHTTP::OutStreamCallback(), timeouts, 0, this->credentials); auto input_stream = context.getInputFormat(format, *in_ptr, sample_block, max_block_size); return std::make_shared>(input_stream, std::move(in_ptr)); } @@ -102,7 +111,7 @@ BlockInputStreamPtr HTTPDictionarySource::loadIds(const std::vector & id }; Poco::URI uri(url); - auto in_ptr = std::make_unique(uri, Poco::Net::HTTPRequest::HTTP_POST, out_stream_callback, timeouts); + auto in_ptr = std::make_unique(uri, Poco::Net::HTTPRequest::HTTP_POST, out_stream_callback, timeouts, 0, this->credentials); auto input_stream = context.getInputFormat(format, *in_ptr, sample_block, max_block_size); return std::make_shared>(input_stream, std::move(in_ptr)); } @@ -119,7 +128,7 @@ BlockInputStreamPtr HTTPDictionarySource::loadKeys(const Columns & key_columns, }; Poco::URI uri(url); - auto in_ptr = std::make_unique(uri, Poco::Net::HTTPRequest::HTTP_POST, out_stream_callback, timeouts); + auto in_ptr = std::make_unique(uri, Poco::Net::HTTPRequest::HTTP_POST, out_stream_callback, timeouts, 0, this->credentials); auto input_stream = context.getInputFormat(format, *in_ptr, sample_block, 
max_block_size); return std::make_shared>(input_stream, std::move(in_ptr)); } diff --git a/dbms/src/Dictionaries/HTTPDictionarySource.h b/dbms/src/Dictionaries/HTTPDictionarySource.h index 705095193d1..c4c85b310f0 100644 --- a/dbms/src/Dictionaries/HTTPDictionarySource.h +++ b/dbms/src/Dictionaries/HTTPDictionarySource.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include "DictionaryStructure.h" @@ -56,6 +57,7 @@ private: std::chrono::time_point update_time; const DictionaryStructure dict_struct; const std::string url; + Poco::Net::HTTPBasicCredentials credentials; std::string update_field; const std::string format; Block sample_block; From 77f2dfee6a788cffb6d0d93f7fad5b5ab964411d Mon Sep 17 00:00:00 2001 From: Guillaume Tassery Date: Wed, 25 Sep 2019 10:08:46 +0200 Subject: [PATCH 254/309] Add the positibility to add an header on an HTTP source --- .../src/Dictionaries/HTTPDictionarySource.cpp | 32 ++++++++++++++++--- dbms/src/Dictionaries/HTTPDictionarySource.h | 2 ++ dbms/src/IO/ReadWriteBufferFromHTTP.h | 21 +++++++++--- 3 files changed, 47 insertions(+), 8 deletions(-) diff --git a/dbms/src/Dictionaries/HTTPDictionarySource.cpp b/dbms/src/Dictionaries/HTTPDictionarySource.cpp index b6421480f93..7b15188a13d 100644 --- a/dbms/src/Dictionaries/HTTPDictionarySource.cpp +++ b/dbms/src/Dictionaries/HTTPDictionarySource.cpp @@ -41,6 +41,23 @@ HTTPDictionarySource::HTTPDictionarySource( this->credentials.setUsername(config.getString(credentials_prefix + ".user", "")); this->credentials.setPassword(config.getString(credentials_prefix + ".password", "")); } + + const auto & http_headers_prefix = config_prefix + ".http-headers"; + + if (config.has(http_headers_prefix)) + { + Poco::Util::AbstractConfiguration::Keys config_keys; + config.keys(http_headers_prefix, config_keys); + + this->header_entries.reserve(config_keys.size()); + for (const auto & key : config_keys) + { + const auto header_key = config.getString(http_headers_prefix + "." + key + ".key", ""); + const auto header_value = config.getString(http_headers_prefix + "." 
+ key + ".value", ""); + this->header_entries.emplace_back(std::make_tuple(header_key, header_value)); + } + } + } HTTPDictionarySource::HTTPDictionarySource(const HTTPDictionarySource & other) @@ -48,6 +65,7 @@ HTTPDictionarySource::HTTPDictionarySource(const HTTPDictionarySource & other) , update_time{other.update_time} , dict_struct{other.dict_struct} , url{other.url} + , header_entries{other.header_entries} , update_field{other.update_field} , format{other.format} , sample_block{other.sample_block} @@ -83,7 +101,8 @@ BlockInputStreamPtr HTTPDictionarySource::loadAll() LOG_TRACE(log, "loadAll " + toString()); Poco::URI uri(url); auto in_ptr = std::make_unique( - uri, Poco::Net::HTTPRequest::HTTP_GET, ReadWriteBufferFromHTTP::OutStreamCallback(), timeouts, 0, this->credentials); + uri, Poco::Net::HTTPRequest::HTTP_GET, ReadWriteBufferFromHTTP::OutStreamCallback(), timeouts, + 0, this->credentials, DBMS_DEFAULT_BUFFER_SIZE, this->header_entries); auto input_stream = context.getInputFormat(format, *in_ptr, sample_block, max_block_size); return std::make_shared>(input_stream, std::move(in_ptr)); } @@ -94,7 +113,8 @@ BlockInputStreamPtr HTTPDictionarySource::loadUpdatedAll() getUpdateFieldAndDate(uri); LOG_TRACE(log, "loadUpdatedAll " + uri.toString()); auto in_ptr = std::make_unique( - uri, Poco::Net::HTTPRequest::HTTP_GET, ReadWriteBufferFromHTTP::OutStreamCallback(), timeouts, 0, this->credentials); + uri, Poco::Net::HTTPRequest::HTTP_GET, ReadWriteBufferFromHTTP::OutStreamCallback(), timeouts, + 0, this->credentials, DBMS_DEFAULT_BUFFER_SIZE, this->header_entries); auto input_stream = context.getInputFormat(format, *in_ptr, sample_block, max_block_size); return std::make_shared>(input_stream, std::move(in_ptr)); } @@ -111,7 +131,9 @@ BlockInputStreamPtr HTTPDictionarySource::loadIds(const std::vector & id }; Poco::URI uri(url); - auto in_ptr = std::make_unique(uri, Poco::Net::HTTPRequest::HTTP_POST, out_stream_callback, timeouts, 0, this->credentials); + auto in_ptr = std::make_unique( + uri, Poco::Net::HTTPRequest::HTTP_POST, out_stream_callback, timeouts, + 0, this->credentials, DBMS_DEFAULT_BUFFER_SIZE, this->header_entries); auto input_stream = context.getInputFormat(format, *in_ptr, sample_block, max_block_size); return std::make_shared>(input_stream, std::move(in_ptr)); } @@ -128,7 +150,9 @@ BlockInputStreamPtr HTTPDictionarySource::loadKeys(const Columns & key_columns, }; Poco::URI uri(url); - auto in_ptr = std::make_unique(uri, Poco::Net::HTTPRequest::HTTP_POST, out_stream_callback, timeouts, 0, this->credentials); + auto in_ptr = std::make_unique( + uri, Poco::Net::HTTPRequest::HTTP_POST, out_stream_callback, timeouts, + 0, this->credentials, DBMS_DEFAULT_BUFFER_SIZE, this->header_entries); auto input_stream = context.getInputFormat(format, *in_ptr, sample_block, max_block_size); return std::make_shared>(input_stream, std::move(in_ptr)); } diff --git a/dbms/src/Dictionaries/HTTPDictionarySource.h b/dbms/src/Dictionaries/HTTPDictionarySource.h index c4c85b310f0..75261cf148f 100644 --- a/dbms/src/Dictionaries/HTTPDictionarySource.h +++ b/dbms/src/Dictionaries/HTTPDictionarySource.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -58,6 +59,7 @@ private: const DictionaryStructure dict_struct; const std::string url; Poco::Net::HTTPBasicCredentials credentials; + ReadWriteBufferFromHTTP::HttpHeaderEntries header_entries; std::string update_field; const std::string format; Block sample_block; diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.h 
b/dbms/src/IO/ReadWriteBufferFromHTTP.h index d2140e14792..273994687c2 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.h +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.h @@ -86,6 +86,10 @@ namespace detail template class ReadWriteBufferFromHTTPBase : public ReadBuffer { + public: + using HttpHeaderEntry = std::tuple; + using HttpHeaderEntries = std::vector; + protected: Poco::URI uri; std::string method; @@ -96,6 +100,7 @@ namespace detail std::function out_stream_callback; const Poco::Net::HTTPBasicCredentials & credentials; std::vector cookies; + HttpHeaderEntries http_header_entries; std::istream * call(const Poco::URI uri_, Poco::Net::HTTPResponse & response) { @@ -109,10 +114,15 @@ namespace detail if (out_stream_callback) request.setChunkedTransferEncoding(true); + for (auto & http_header_entry: http_header_entries) + { + request.set(std::get<0>(http_header_entry), std::get<1>(http_header_entry)); + } + if (!credentials.getUsername().empty()) credentials.authenticate(request); - LOG_TRACE((&Logger::get("ReadWriteBufferFromHTTP")), "Sending request to " << uri.toString()); + LOG_TRACE((&Logger::get("ReadWriteBufferFromHTTP")), "Sending request to " << uri.toString()); auto sess = session->getSession(); @@ -146,13 +156,15 @@ namespace detail const std::string & method_ = {}, OutStreamCallback out_stream_callback_ = {}, const Poco::Net::HTTPBasicCredentials & credentials_ = {}, - size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE) + size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, + HttpHeaderEntries http_header_entries_ = {}) : ReadBuffer(nullptr, 0) , uri {uri_} , method {!method_.empty() ? method_ : out_stream_callback_ ? Poco::Net::HTTPRequest::HTTP_POST : Poco::Net::HTTPRequest::HTTP_GET} , session {session_} , out_stream_callback {out_stream_callback_} , credentials {credentials_} + , http_header_entries {http_header_entries_} { Poco::Net::HTTPResponse response; @@ -230,8 +242,9 @@ public: const ConnectionTimeouts & timeouts = {}, const DB::SettingUInt64 max_redirects = 0, const Poco::Net::HTTPBasicCredentials & credentials_ = {}, - size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE) - : Parent(std::make_shared(uri_, timeouts, max_redirects), uri_, method_, out_stream_callback_, credentials_, buffer_size_) + size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, + const HttpHeaderEntries & http_header_entries_ = {}) + : Parent(std::make_shared(uri_, timeouts, max_redirects), uri_, method_, out_stream_callback_, credentials_, buffer_size_, http_header_entries_) { } }; From 8b4e789847ae8db60acf2beae51aa2b4825bdb27 Mon Sep 17 00:00:00 2001 From: Guillaume Tassery Date: Wed, 25 Sep 2019 11:42:08 +0200 Subject: [PATCH 255/309] Rename key to name --- dbms/src/Dictionaries/HTTPDictionarySource.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Dictionaries/HTTPDictionarySource.cpp b/dbms/src/Dictionaries/HTTPDictionarySource.cpp index 7b15188a13d..9ed8296ea2a 100644 --- a/dbms/src/Dictionaries/HTTPDictionarySource.cpp +++ b/dbms/src/Dictionaries/HTTPDictionarySource.cpp @@ -52,7 +52,7 @@ HTTPDictionarySource::HTTPDictionarySource( this->header_entries.reserve(config_keys.size()); for (const auto & key : config_keys) { - const auto header_key = config.getString(http_headers_prefix + "." + key + ".key", ""); + const auto header_key = config.getString(http_headers_prefix + "." + key + ".name", ""); const auto header_value = config.getString(http_headers_prefix + "." 
+ key + ".value", ""); this->header_entries.emplace_back(std::make_tuple(header_key, header_value)); } From 0267c65e279f2757f3663de4f86c759256b2569e Mon Sep 17 00:00:00 2001 From: Guillaume Tassery Date: Wed, 25 Sep 2019 11:42:17 +0200 Subject: [PATCH 256/309] Add documentation --- .../dicts/external_dicts_dict_sources.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/en/query_language/dicts/external_dicts_dict_sources.md b/docs/en/query_language/dicts/external_dicts_dict_sources.md index 493b75a9cbb..61b5badce7a 100644 --- a/docs/en/query_language/dicts/external_dicts_dict_sources.md +++ b/docs/en/query_language/dicts/external_dicts_dict_sources.md @@ -84,6 +84,16 @@ Example of settings: http://[::1]/os.tsv TabSeparated + + user + password + + + + API-KEY + key + + ``` @@ -94,7 +104,13 @@ Setting fields: - `url` – The source URL. - `format` – The file format. All the formats described in "[Formats](../../interfaces/formats.md#formats)" are supported. - +- `credentials` – Basic HTTP authentification. + - `user` – Username required for the authentification. + - `password` – Password required for the authentification. +- `http-headers` – All custom HTTP headers entries used for the HTTP request. + - `http-header` – Single HTTP header entry. + - `key` – Identifiant name used for the header send on the request. + - `value` – Value set for a specific identifiant name. ## ODBC {#dicts-external_dicts_dict_sources-odbc} From 367a0dcdb48f6e6d00f112a7e6507842a7ce69f2 Mon Sep 17 00:00:00 2001 From: Guillaume Tassery Date: Wed, 25 Sep 2019 11:45:36 +0200 Subject: [PATCH 257/309] cosmetic --- dbms/src/Dictionaries/HTTPDictionarySource.cpp | 1 - dbms/src/IO/ReadWriteBufferFromHTTP.h | 2 +- docs/en/query_language/dicts/external_dicts_dict_sources.md | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Dictionaries/HTTPDictionarySource.cpp b/dbms/src/Dictionaries/HTTPDictionarySource.cpp index 9ed8296ea2a..ce4980506fe 100644 --- a/dbms/src/Dictionaries/HTTPDictionarySource.cpp +++ b/dbms/src/Dictionaries/HTTPDictionarySource.cpp @@ -57,7 +57,6 @@ HTTPDictionarySource::HTTPDictionarySource( this->header_entries.emplace_back(std::make_tuple(header_key, header_value)); } } - } HTTPDictionarySource::HTTPDictionarySource(const HTTPDictionarySource & other) diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.h b/dbms/src/IO/ReadWriteBufferFromHTTP.h index 273994687c2..9716a15ad6e 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.h +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.h @@ -122,7 +122,7 @@ namespace detail if (!credentials.getUsername().empty()) credentials.authenticate(request); - LOG_TRACE((&Logger::get("ReadWriteBufferFromHTTP")), "Sending request to " << uri.toString()); + LOG_TRACE((&Logger::get("ReadWriteBufferFromHTTP")), "Sending request to " << uri.toString()); auto sess = session->getSession(); diff --git a/docs/en/query_language/dicts/external_dicts_dict_sources.md b/docs/en/query_language/dicts/external_dicts_dict_sources.md index 61b5badce7a..37f362c0cf4 100644 --- a/docs/en/query_language/dicts/external_dicts_dict_sources.md +++ b/docs/en/query_language/dicts/external_dicts_dict_sources.md @@ -112,6 +112,7 @@ Setting fields: - `key` – Identifiant name used for the header send on the request. - `value` – Value set for a specific identifiant name. + ## ODBC {#dicts-external_dicts_dict_sources-odbc} You can use this method to connect any database that has an ODBC driver. 
From 97a2ae176ba1401e55d445dbce4bed30e494b193 Mon Sep 17 00:00:00 2001 From: millb Date: Wed, 25 Sep 2019 13:37:43 +0300 Subject: [PATCH 258/309] Add CountOfMerges metric --- dbms/src/Common/ProfileEvents.cpp | 1 + dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/dbms/src/Common/ProfileEvents.cpp b/dbms/src/Common/ProfileEvents.cpp index 947e3890078..89e15f73f54 100644 --- a/dbms/src/Common/ProfileEvents.cpp +++ b/dbms/src/Common/ProfileEvents.cpp @@ -114,6 +114,7 @@ M(SelectedRanges, "Number of (non-adjacent) ranges in all data parts selected to read from a MergeTree table.") \ M(SelectedMarks, "Number of marks (index granules) selected to read from a MergeTree table.") \ \ + M(CountOfMerges, "") \ M(MergedRows, "Rows read for background merges. This is the number of rows before merge.") \ M(MergedUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) that was read for background merges. This is the number before merge.") \ M(MergesTimeMilliseconds, "Total time spent for background merges.")\ diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index 83604f99dd1..0825e6fb2a0 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -39,6 +39,7 @@ namespace ProfileEvents extern const Event MergedRows; extern const Event MergedUncompressedBytes; extern const Event MergesTimeMilliseconds; + extern const Event CountOfMerges; } namespace CurrentMetrics @@ -507,8 +508,10 @@ public: void operator() (const Progress & value) { ProfileEvents::increment(ProfileEvents::MergedUncompressedBytes, value.read_bytes); - if (stage.is_first) + if (stage.is_first) { ProfileEvents::increment(ProfileEvents::MergedRows, value.read_rows); + ProfileEvents::increment(ProfileEvents::CountOfMerges); + } updateWatch(); merge_entry->bytes_read_uncompressed += value.read_bytes; From 4e893eb9493e019698386ce31d021b2221cb4b2d Mon Sep 17 00:00:00 2001 From: millb Date: Wed, 25 Sep 2019 13:46:24 +0300 Subject: [PATCH 259/309] Fixed codestyle --- dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index 0825e6fb2a0..abdcd2d0f98 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -508,7 +508,8 @@ public: void operator() (const Progress & value) { ProfileEvents::increment(ProfileEvents::MergedUncompressedBytes, value.read_bytes); - if (stage.is_first) { + if (stage.is_first) + { ProfileEvents::increment(ProfileEvents::MergedRows, value.read_rows); ProfileEvents::increment(ProfileEvents::CountOfMerges); } From f9660bbe4754b44647838be1d1af11a730013d22 Mon Sep 17 00:00:00 2001 From: Guillaume Tassery Date: Wed, 25 Sep 2019 13:37:13 +0200 Subject: [PATCH 260/309] Update external_dicts_dict_sources.md --- docs/en/query_language/dicts/external_dicts_dict_sources.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/query_language/dicts/external_dicts_dict_sources.md b/docs/en/query_language/dicts/external_dicts_dict_sources.md index 37f362c0cf4..16ea0471199 100644 --- a/docs/en/query_language/dicts/external_dicts_dict_sources.md +++ 
b/docs/en/query_language/dicts/external_dicts_dict_sources.md @@ -109,7 +109,7 @@ Setting fields: - `password` – Password required for the authentification. - `http-headers` – All custom HTTP headers entries used for the HTTP request. - `http-header` – Single HTTP header entry. - - `key` – Identifiant name used for the header send on the request. + - `name` – Identifiant name used for the header send on the request. - `value` – Value set for a specific identifiant name. From 4a60e8187b27df7d036b20f393db5a234c18fa19 Mon Sep 17 00:00:00 2001 From: BayoNet Date: Wed, 25 Sep 2019 19:43:10 +0300 Subject: [PATCH 261/309] Fixing links in docs (#7098) * Typo fix. * Links fix. * Fixed links in docs. * More fixes. * Link fixes. --- docs/en/operations/table_engines/hdfs.md | 2 +- docs/ru/development/build_cross.md | 1 + docs/ru/operations/table_engines/hdfs.md | 2 +- docs/ru/query_language/agg_functions/parametric_functions.md | 2 +- docs/ru/query_language/functions/other_functions.md | 2 +- docs/ru/query_language/select.md | 2 +- docs/toc_en.yml | 1 + docs/toc_ru.yml | 1 + 8 files changed, 8 insertions(+), 5 deletions(-) create mode 120000 docs/ru/development/build_cross.md diff --git a/docs/en/operations/table_engines/hdfs.md b/docs/en/operations/table_engines/hdfs.md index 1f6ecc50a79..9e2947341bf 100644 --- a/docs/en/operations/table_engines/hdfs.md +++ b/docs/en/operations/table_engines/hdfs.md @@ -58,7 +58,7 @@ Multiple path components can have globs. For being processed file should exists - `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. - `{N..M}` — Substitutes any number in range from N to M including both borders. - Constructions with `{}` are similar to the [remote table function](../../query_language/table_functions/remote.md)). + Constructions with `{}` are similar to the [remote](../../query_language/table_functions/remote.md) table function. **Example** diff --git a/docs/ru/development/build_cross.md b/docs/ru/development/build_cross.md new file mode 120000 index 00000000000..f595f252de3 --- /dev/null +++ b/docs/ru/development/build_cross.md @@ -0,0 +1 @@ +../../en/development/build_cross.md \ No newline at end of file diff --git a/docs/ru/operations/table_engines/hdfs.md b/docs/ru/operations/table_engines/hdfs.md index 303f0a07d19..b384eb3bf60 100644 --- a/docs/ru/operations/table_engines/hdfs.md +++ b/docs/ru/operations/table_engines/hdfs.md @@ -55,7 +55,7 @@ SELECT * FROM hdfs_engine_table LIMIT 2 - `{some_string,another_string,yet_another_one}` — Заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. - `{N..M}` — Заменяет любое число в интервале от `N` до `M` включительно. -Конструкция с `{}` аналогична табличной функции [remote](remote.md). +Конструкция с `{}` аналогична табличной функции [remote](../../query_language/table_functions/remote.md). 
**Пример** diff --git a/docs/ru/query_language/agg_functions/parametric_functions.md b/docs/ru/query_language/agg_functions/parametric_functions.md index 5adf20dfce5..b0ece3ced11 100644 --- a/docs/ru/query_language/agg_functions/parametric_functions.md +++ b/docs/ru/query_language/agg_functions/parametric_functions.md @@ -45,7 +45,7 @@ FROM ( └─────────────────────────────────────────────────────────────────────────┘ ``` -С помощью функции [bar](../other_functions.md#function-bar) можно визуализировать гистограмму, например: +С помощью функции [bar](../functions/other_functions.md#function-bar) можно визуализировать гистограмму, например: ```sql WITH histogram(5)(rand() % 100) AS hist diff --git a/docs/ru/query_language/functions/other_functions.md b/docs/ru/query_language/functions/other_functions.md index 987840cac99..3cc56bb1217 100644 --- a/docs/ru/query_language/functions/other_functions.md +++ b/docs/ru/query_language/functions/other_functions.md @@ -117,7 +117,7 @@ SELECT visibleWidth(NULL) Функция кидает исключение, если таблица не существует. Для элементов вложенной структуры данных функция проверяет существование столбца. Для самой же вложенной структуры данных функция возвращает 0. -## bar +## bar {#function-bar} Позволяет построить unicode-art диаграмму. diff --git a/docs/ru/query_language/select.md b/docs/ru/query_language/select.md index d206ba42c0b..61854066f32 100644 --- a/docs/ru/query_language/select.md +++ b/docs/ru/query_language/select.md @@ -92,7 +92,7 @@ FROM └───────────┴───────────┘ ``` -### Секция FROM +### Секция FROM {#select-from} Если секция FROM отсутствует, то данные будут читаться из таблицы `system.one`. Таблица `system.one` содержит ровно одну строку (то есть, эта таблица выполняет такую же роль, как таблица DUAL, которую можно найти в других СУБД). 
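[Editorial note, not part of the patch] The glob syntax documented in the `hdfs.md` hunks above is easiest to see in a concrete query. The sketch below is illustrative only: the table name, namenode address and file layout are invented, and it assumes the HDFS engine expands `{N..M}` globs the same way as in the documented examples.

```sql
-- Hypothetical layout: file_1.tsv .. file_3.tsv under /clickhouse/data on the namenode.
-- {1..3} substitutes any number in the range, like the remote() table function's {} expansion.
CREATE TABLE hdfs_engine_glob_example (name String, value UInt32)
    ENGINE = HDFS('hdfs://hdfs1:9000/clickhouse/data/file_{1..3}.tsv', 'TSV');

-- Reads all matching files at once.
SELECT * FROM hdfs_engine_glob_example LIMIT 10;
```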
diff --git a/docs/toc_en.yml b/docs/toc_en.yml index dccd51f3cb1..b3a46303e49 100644 --- a/docs/toc_en.yml +++ b/docs/toc_en.yml @@ -210,6 +210,7 @@ nav: - 'Overview of ClickHouse Architecture': 'development/architecture.md' - 'How to Build ClickHouse on Linux': 'development/build.md' - 'How to Build ClickHouse on Mac OS X': 'development/build_osx.md' + - 'How to Build ClickHouse on Linux for Mac OS X': 'development/build_cross.md' - 'How to Write C++ code': 'development/style.md' - 'How to Run ClickHouse Tests': 'development/tests.md' - 'Third-Party Libraries Used': 'development/contrib.md' diff --git a/docs/toc_ru.yml b/docs/toc_ru.yml index b21bcc838dc..98c7b27a746 100644 --- a/docs/toc_ru.yml +++ b/docs/toc_ru.yml @@ -211,6 +211,7 @@ nav: - 'Обзор архитектуры ClickHouse': 'development/architecture.md' - 'Как собрать ClickHouse на Linux': 'development/build.md' - 'Как собрать ClickHouse на Mac OS X': 'development/build_osx.md' + - 'Как собрать ClickHouse на Linux для Mac OS X': 'development/build_cross.md' - 'Как писать код на C++': 'development/style.md' - 'Как запустить тесты': 'development/tests.md' - 'Сторонние библиотеки': 'development/contrib.md' From 8bce0fa9cbd7a8c39b3afe13cdeb048c2c405b81 Mon Sep 17 00:00:00 2001 From: Ivan <5627721+abyss7@users.noreply.github.com> Date: Wed, 25 Sep 2019 21:51:09 +0300 Subject: [PATCH 262/309] Disable some contribs for cross-compilation (#7101) --- cmake/target.cmake | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cmake/target.cmake b/cmake/target.cmake index 1be6abe8152..51b268b7a04 100644 --- a/cmake/target.cmake +++ b/cmake/target.cmake @@ -64,6 +64,15 @@ if (CMAKE_CROSSCOMPILING) set (HAS_POST_2038_EXITCODE "0" CACHE STRING "Result from TRY_RUN" FORCE) set (HAS_POST_2038_EXITCODE__TRYRUN_OUTPUT "" CACHE STRING "Output from TRY_RUN" FORCE) + + # FIXME: broken dependencies + set (USE_SNAPPY OFF) + set (ENABLE_SSL OFF) + set (ENABLE_PROTOBUF OFF) + set (ENABLE_PARQUET OFF) + set (ENABLE_READLINE OFF) + set (ENABLE_ICU OFF) + set (ENABLE_FASTOPS OFF) endif () # Don't know why but CXX_STANDARD doesn't work for cross-compilation From 647568a6f6db7f5434af697ae723fa4be2fd0ab7 Mon Sep 17 00:00:00 2001 From: Guillaume Tassery Date: Thu, 26 Sep 2019 05:34:22 +0200 Subject: [PATCH 263/309] cosmetic --- .../src/Dictionaries/HTTPDictionarySource.cpp | 30 +++++++++---------- dbms/src/Dictionaries/HTTPDictionarySource.h | 2 +- dbms/src/IO/ReadWriteBufferFromHTTP.h | 10 +++---- .../dicts/external_dicts_dict_sources.md | 4 +-- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/dbms/src/Dictionaries/HTTPDictionarySource.cpp b/dbms/src/Dictionaries/HTTPDictionarySource.cpp index ce4980506fe..184470ae7c5 100644 --- a/dbms/src/Dictionaries/HTTPDictionarySource.cpp +++ b/dbms/src/Dictionaries/HTTPDictionarySource.cpp @@ -38,23 +38,23 @@ HTTPDictionarySource::HTTPDictionarySource( if (config.has(credentials_prefix)) { - this->credentials.setUsername(config.getString(credentials_prefix + ".user", "")); - this->credentials.setPassword(config.getString(credentials_prefix + ".password", "")); + credentials.setUsername(config.getString(credentials_prefix + ".user", "")); + credentials.setPassword(config.getString(credentials_prefix + ".password", "")); } - const auto & http_headers_prefix = config_prefix + ".http-headers"; + const auto & headers_prefix = config_prefix + ".headers"; - if (config.has(http_headers_prefix)) + if (config.has(headers_prefix)) { Poco::Util::AbstractConfiguration::Keys config_keys; - config.keys(http_headers_prefix, 
config_keys); + config.keys(headers_prefix, config_keys); - this->header_entries.reserve(config_keys.size()); + header_entries.reserve(config_keys.size()); for (const auto & key : config_keys) { - const auto header_key = config.getString(http_headers_prefix + "." + key + ".name", ""); - const auto header_value = config.getString(http_headers_prefix + "." + key + ".value", ""); - this->header_entries.emplace_back(std::make_tuple(header_key, header_value)); + const auto header_key = config.getString(headers_prefix + "." + key + ".name", ""); + const auto header_value = config.getString(headers_prefix + "." + key + ".value", ""); + header_entries.emplace_back(std::make_tuple(header_key, header_value)); } } } @@ -71,8 +71,8 @@ HTTPDictionarySource::HTTPDictionarySource(const HTTPDictionarySource & other) , context(other.context) , timeouts(ConnectionTimeouts::getHTTPTimeouts(context)) { - this->credentials.setUsername(other.credentials.getUsername()); - this->credentials.setPassword(other.credentials.getPassword()); + credentials.setUsername(other.credentials.getUsername()); + credentials.setPassword(other.credentials.getPassword()); } void HTTPDictionarySource::getUpdateFieldAndDate(Poco::URI & uri) @@ -101,7 +101,7 @@ BlockInputStreamPtr HTTPDictionarySource::loadAll() Poco::URI uri(url); auto in_ptr = std::make_unique( uri, Poco::Net::HTTPRequest::HTTP_GET, ReadWriteBufferFromHTTP::OutStreamCallback(), timeouts, - 0, this->credentials, DBMS_DEFAULT_BUFFER_SIZE, this->header_entries); + 0, credentials, DBMS_DEFAULT_BUFFER_SIZE, header_entries); auto input_stream = context.getInputFormat(format, *in_ptr, sample_block, max_block_size); return std::make_shared>(input_stream, std::move(in_ptr)); } @@ -113,7 +113,7 @@ BlockInputStreamPtr HTTPDictionarySource::loadUpdatedAll() LOG_TRACE(log, "loadUpdatedAll " + uri.toString()); auto in_ptr = std::make_unique( uri, Poco::Net::HTTPRequest::HTTP_GET, ReadWriteBufferFromHTTP::OutStreamCallback(), timeouts, - 0, this->credentials, DBMS_DEFAULT_BUFFER_SIZE, this->header_entries); + 0, credentials, DBMS_DEFAULT_BUFFER_SIZE, header_entries); auto input_stream = context.getInputFormat(format, *in_ptr, sample_block, max_block_size); return std::make_shared>(input_stream, std::move(in_ptr)); } @@ -132,7 +132,7 @@ BlockInputStreamPtr HTTPDictionarySource::loadIds(const std::vector & id Poco::URI uri(url); auto in_ptr = std::make_unique( uri, Poco::Net::HTTPRequest::HTTP_POST, out_stream_callback, timeouts, - 0, this->credentials, DBMS_DEFAULT_BUFFER_SIZE, this->header_entries); + 0, credentials, DBMS_DEFAULT_BUFFER_SIZE, header_entries); auto input_stream = context.getInputFormat(format, *in_ptr, sample_block, max_block_size); return std::make_shared>(input_stream, std::move(in_ptr)); } @@ -151,7 +151,7 @@ BlockInputStreamPtr HTTPDictionarySource::loadKeys(const Columns & key_columns, Poco::URI uri(url); auto in_ptr = std::make_unique( uri, Poco::Net::HTTPRequest::HTTP_POST, out_stream_callback, timeouts, - 0, this->credentials, DBMS_DEFAULT_BUFFER_SIZE, this->header_entries); + 0, credentials, DBMS_DEFAULT_BUFFER_SIZE, header_entries); auto input_stream = context.getInputFormat(format, *in_ptr, sample_block, max_block_size); return std::make_shared>(input_stream, std::move(in_ptr)); } diff --git a/dbms/src/Dictionaries/HTTPDictionarySource.h b/dbms/src/Dictionaries/HTTPDictionarySource.h index 75261cf148f..d0266b0870c 100644 --- a/dbms/src/Dictionaries/HTTPDictionarySource.h +++ b/dbms/src/Dictionaries/HTTPDictionarySource.h @@ -59,7 +59,7 @@ private: const 
DictionaryStructure dict_struct; const std::string url; Poco::Net::HTTPBasicCredentials credentials; - ReadWriteBufferFromHTTP::HttpHeaderEntries header_entries; + ReadWriteBufferFromHTTP::HTTPHeaderEntries header_entries; std::string update_field; const std::string format; Block sample_block; diff --git a/dbms/src/IO/ReadWriteBufferFromHTTP.h b/dbms/src/IO/ReadWriteBufferFromHTTP.h index 9716a15ad6e..6b408568800 100644 --- a/dbms/src/IO/ReadWriteBufferFromHTTP.h +++ b/dbms/src/IO/ReadWriteBufferFromHTTP.h @@ -87,8 +87,8 @@ namespace detail class ReadWriteBufferFromHTTPBase : public ReadBuffer { public: - using HttpHeaderEntry = std::tuple; - using HttpHeaderEntries = std::vector; + using HTTPHeaderEntry = std::tuple; + using HTTPHeaderEntries = std::vector; protected: Poco::URI uri; @@ -100,7 +100,7 @@ namespace detail std::function out_stream_callback; const Poco::Net::HTTPBasicCredentials & credentials; std::vector cookies; - HttpHeaderEntries http_header_entries; + HTTPHeaderEntries http_header_entries; std::istream * call(const Poco::URI uri_, Poco::Net::HTTPResponse & response) { @@ -157,7 +157,7 @@ namespace detail OutStreamCallback out_stream_callback_ = {}, const Poco::Net::HTTPBasicCredentials & credentials_ = {}, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, - HttpHeaderEntries http_header_entries_ = {}) + HTTPHeaderEntries http_header_entries_ = {}) : ReadBuffer(nullptr, 0) , uri {uri_} , method {!method_.empty() ? method_ : out_stream_callback_ ? Poco::Net::HTTPRequest::HTTP_POST : Poco::Net::HTTPRequest::HTTP_GET} @@ -243,7 +243,7 @@ public: const DB::SettingUInt64 max_redirects = 0, const Poco::Net::HTTPBasicCredentials & credentials_ = {}, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, - const HttpHeaderEntries & http_header_entries_ = {}) + const HTTPHeaderEntries & http_header_entries_ = {}) : Parent(std::make_shared(uri_, timeouts, max_redirects), uri_, method_, out_stream_callback_, credentials_, buffer_size_, http_header_entries_) { } diff --git a/docs/en/query_language/dicts/external_dicts_dict_sources.md b/docs/en/query_language/dicts/external_dicts_dict_sources.md index 37f362c0cf4..b894602ee46 100644 --- a/docs/en/query_language/dicts/external_dicts_dict_sources.md +++ b/docs/en/query_language/dicts/external_dicts_dict_sources.md @@ -104,10 +104,10 @@ Setting fields: - `url` – The source URL. - `format` – The file format. All the formats described in "[Formats](../../interfaces/formats.md#formats)" are supported. -- `credentials` – Basic HTTP authentification. +- `credentials` – Basic HTTP authentification. Optional parameter. - `user` – Username required for the authentification. - `password` – Password required for the authentification. -- `http-headers` – All custom HTTP headers entries used for the HTTP request. +- `headers` – All custom HTTP headers entries used for the HTTP request. Optional parameter. - `http-header` – Single HTTP header entry. - `key` – Identifiant name used for the header send on the request. - `value` – Value set for a specific identifiant name. 
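[Editorial note, not part of the patch] The setting fields listed above combine into a source definition roughly like the following sketch. Only the element names (`credentials`, `user`, `password`, `headers`, `header`, `name`, `value`) come from this change; the URL and header values are placeholders.

```xml
<source>
    <http>
        <!-- Placeholder endpoint serving the dictionary data -->
        <url>http://localhost:9999/os.tsv</url>
        <format>TabSeparated</format>
        <!-- Optional basic HTTP authentication -->
        <credentials>
            <user>user</user>
            <password>password</password>
        </credentials>
        <!-- Optional custom headers sent with every request -->
        <headers>
            <header>
                <name>API-KEY</name>
                <value>key</value>
            </header>
        </headers>
    </http>
</source>
```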
From 53b10d5b6088aff6edb42a8ed0d2bff8e253a9b1 Mon Sep 17 00:00:00 2001
From: Guillaume Tassery
Date: Thu, 26 Sep 2019 05:41:00 +0200
Subject: [PATCH 264/309] docs

---
 .../dicts/external_dicts_dict_sources.md           | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/en/query_language/dicts/external_dicts_dict_sources.md b/docs/en/query_language/dicts/external_dicts_dict_sources.md
index 0a74ce00bdc..945d202da07 100644
--- a/docs/en/query_language/dicts/external_dicts_dict_sources.md
+++ b/docs/en/query_language/dicts/external_dicts_dict_sources.md
@@ -88,12 +88,12 @@ Example of settings:
     <credentials>
         <user>user</user>
         <password>password</password>
     </credentials>
-    <http-headers>
-        <http-header>
+    <headers>
+        <header>
             <name>API-KEY</name>
             <value>key</value>
-        </http-header>
-    </http-headers>
+        </header>
+    </headers>
 </http>
    ``` @@ -108,7 +108,7 @@ Setting fields: - `user` – Username required for the authentification. - `password` – Password required for the authentification. - `headers` – All custom HTTP headers entries used for the HTTP request. Optional parameter. - - `http-header` – Single HTTP header entry. + - `header` – Single HTTP header entry. - `name` – Identifiant name used for the header send on the request. - `value` – Value set for a specific identifiant name. From 40c9d44a66ac11069a844820fefb6211cd74e8af Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Thu, 26 Sep 2019 07:10:41 +0200 Subject: [PATCH 265/309] rpm: preserve existing configs on package upgrade --- utils/release/release_lib.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/release/release_lib.sh b/utils/release/release_lib.sh index 823ba7f0cc9..5aa48a60926 100644 --- a/utils/release/release_lib.sh +++ b/utils/release/release_lib.sh @@ -210,6 +210,7 @@ function make_rpm { | grep -vF '%dir "/etc/cron.d/"' \ | grep -vF '%dir "/etc/systemd/system/"' \ | grep -vF '%dir "/etc/systemd/"' \ + | sed -e 's|%config |%config(noreplace) |' \ > ${PACKAGE}-$VERSION_FULL-2.spec } From 1fdc2b5675f8bd8386421349b927cce9aebd246c Mon Sep 17 00:00:00 2001 From: BayoNet Date: Thu, 26 Sep 2019 11:45:08 +0300 Subject: [PATCH 266/309] DOCAPI-6422: EN docs for adaptive index granularity and some settings (#7012) * Typo fix. * Links fix. * Fixed links in docs. * More fixes. * DOCAPI-6422: Adaptive granularity * DOCAPI-6422: fix. * Update settings.md * Update settings.md * DOCAPI-6422: Clarifications and fixes. * DOCAPI-6422: Fix. * DOCAPI-6422: Link fix. --- docs/en/operations/settings/settings.md | 68 +++++++++++++++---- docs/en/operations/table_engines/mergetree.md | 15 ++-- docs/toc_zh.yml | 1 + docs/zh/development/build_cross.md | 1 + 4 files changed, 64 insertions(+), 21 deletions(-) create mode 120000 docs/zh/development/build_cross.md diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 5591f82d037..fe5dfa41527 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -377,52 +377,90 @@ By default: 1,000,000. It only works when reading from MergeTree engines. ClickHouse uses multiple threads when reading from [MergeTree*](../table_engines/mergetree.md) tables. This setting turns on/off the uniform distribution of reading tasks over the working threads. The algorithm of the uniform distribution aims to make execution time for all the threads approximately equal in a `SELECT` query. -**Possible values** +Possible values: - 0 — Do not use uniform read distribution. - 1 — Use uniform read distribution. -**Default value**: 1. +Default value: 1. ## merge_tree_min_rows_for_concurrent_read {#setting-merge_tree_min_rows_for_concurrent_read} If the number of rows to be read from a file of a [MergeTree*](../table_engines/mergetree.md) table exceeds `merge_tree_min_rows_for_concurrent_read` then ClickHouse tries to perform a concurrent reading from this file on several threads. -**Possible values** +Possible values: -Any positive integer. +- Any positive integer. + +Default value: 163840. 
+ +## merge_tree_min_bytes_for_concurrent_read {#setting-merge_tree_min_bytes_for_concurrent_read} + +If a number of bytes to read from one file of a [MergeTree*](../table_engines/mergetree.md)-engine table exceeds `merge_tree_min_bytes_for_concurrent_read` then ClickHouse tries to perform a concurrent reading from this file on several threads. + +Possible values: + +- Any positive integer. + +Default value: 240 ✕ 1024 ✕ 1024. -**Default value**: 163840. ## merge_tree_min_rows_for_seek {#setting-merge_tree_min_rows_for_seek} If the distance between two data blocks to be read in one file is less than `merge_tree_min_rows_for_seek` rows, then ClickHouse does not seek through the file, but reads the data sequentially. -**Possible values** +Possible values: -Any positive integer. +- Any positive integer. + +Default value: 0. + +## merge_tree_min_bytes_for_seek {#setting-merge_tree_min_bytes_for_seek} + +If the distance between two data blocks to be read in one file is less than `merge_tree_min_bytes_for_seek` rows, then ClickHouse does not seek through the file, but reads the data sequentially. + +Possible values: + +- Any positive integer. + +Default value: 0. -**Default value**: 0. ## merge_tree_coarse_index_granularity {#setting-merge_tree_coarse_index_granularity} When searching data, ClickHouse checks the data marks in the index file. If ClickHouse finds that required keys are in some range, it divides this range into `merge_tree_coarse_index_granularity` subranges and searches the required keys there recursively. -**Possible values** +Possible values: -Any positive even integer. +- Any positive even integer. -**Default value**: 8. +Default value: 8. ## merge_tree_max_rows_to_use_cache {#setting-merge_tree_max_rows_to_use_cache} -If ClickHouse should read more than `merge_tree_max_rows_to_use_cache` rows in one query, it does not use the cash of uncompressed blocks. The [uncompressed_cache_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. +If ClickHouse should read more than `merge_tree_max_rows_to_use_cache` rows in one query, it does not use the cache of uncompressed blocks. The [uncompressed_cache_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. -**Possible values** +The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from trashing by queries reading a large amount of data. -Any positive integer. +Possible values: + +- Any positive integer. + +Default value: 128 ✕ 8192. + + +## merge_tree_max_bytes_to_use_cache {#setting-merge_tree_max_bytes_to_use_cache} + +If ClickHouse should read more than `merge_tree_max_bytes_to_use_cache` bytes in one query, it does not use the cache of uncompressed blocks. The [uncompressed_cache_size](../server_settings/settings.md#server-settings-uncompressed_cache_size) server setting defines the size of the cache of uncompressed blocks. + +The cache of uncompressed blocks stores data extracted for queries. ClickHouse uses this cache to speed up responses to repeated small queries. This setting protects the cache from trashing by queries reading a large amount of data. + +Possible values: + +- Any positive integer. + +Default value: 1920 ✕ 1024 ✕ 1024. -**Default value**: 1048576. 
## min_bytes_to_use_direct_io {#settings-min_bytes_to_use_direct_io} diff --git a/docs/en/operations/table_engines/mergetree.md b/docs/en/operations/table_engines/mergetree.md index 82ea3b23cc7..a8d4d62f2d0 100644 --- a/docs/en/operations/table_engines/mergetree.md +++ b/docs/en/operations/table_engines/mergetree.md @@ -78,11 +78,14 @@ For a description of parameters, see the [CREATE query description](../../query_ For more details, see [TTL for columns and tables](#table_engine-mergetree-ttl) - `SETTINGS` — Additional parameters that control the behavior of the `MergeTree`: - - `index_granularity` — The granularity of an index. The number of data rows between the "marks" of an index. By default, 8192. For the list of available parameters, see [MergeTreeSettings.h](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Storages/MergeTree/MergeTreeSettings.h). + - `index_granularity` — Maximum number of data rows between the marks of an index. Default value: 8192. See [Data Storage](#mergetree-data-storage). + - `index_granularity_bytes` — Maximum size of data granule in bytes. Default value: 10Mb. To restrict the size of granule only by number of rows set 0 (not recommended). See [Data Storage](#mergetree-data-storage). + - `enable_mixed_granularity_parts` — Enables or disables transition to controlling the granule size with the `index_granularity_bytes` setting. Before the version 19.11 there was the only `index_granularity` setting for the granule size restriction. The `index_granularity_bytes` setting improves ClickHouse performance when selecting data from the tables with big rows (tens and hundreds of megabytes). So if you have tables with big rows, you can turn the setting on for the tables to get better efficiency of your `SELECT` queries. - `use_minimalistic_part_header_in_zookeeper` — Storage method of the data parts headers in ZooKeeper. If `use_minimalistic_part_header_in_zookeeper=1`, then ZooKeeper stores less data. For more information, see the [setting description](../server_settings/settings.md#server-settings-use_minimalistic_part_header_in_zookeeper) in "Server configuration parameters". - `min_merge_bytes_to_use_direct_io` — The minimum data volume for merge operation that is required for using direct I/O access to the storage disk. When merging data parts, ClickHouse calculates the total storage volume of all the data to be merged. If the volume exceeds `min_merge_bytes_to_use_direct_io` bytes, ClickHouse reads and writes the data to the storage disk using the direct I/O interface (`O_DIRECT` option). If `min_merge_bytes_to_use_direct_io = 0`, then direct I/O is disabled. Default value: `10 * 1024 * 1024 * 1024` bytes. - `merge_with_ttl_timeout` — Minimum delay in seconds before repeating a merge with TTL. Default value: 86400 (1 day). + - `write_final_mark` — Enables or disables writing the final index mark at the end of data part. Default value: 1. Don't turn it off. **Example of Sections Setting** @@ -126,7 +129,7 @@ MergeTree(EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID) The `MergeTree` engine is configured in the same way as in the example above for the main engine configuration method. -## Data Storage +## Data Storage {#mergetree-data-storage} A table consists of data parts sorted by primary key. @@ -134,9 +137,9 @@ When data is inserted in a table, separate data parts are created and each of th Data belonging to different partitions are separated into different parts. 
In the background, ClickHouse merges data parts for more efficient storage. Parts belonging to different partitions are not merged. The merge mechanism does not guarantee that all rows with the same primary key will be in the same data part. -For each data part, ClickHouse creates an index file that contains the primary key value for each index row ("mark"). Index row numbers are defined as `n * index_granularity`. The maximum value `n` is equal to the integer part of dividing the total number of rows by the `index_granularity`. For each column, the "marks" are also written for the same index rows as the primary key. These "marks" allow you to find the data directly in the columns. +Each data part is logically divided by granules. A granule is the smallest indivisible data set that ClickHouse reads when selecting data. ClickHouse doesn't split rows or values, so each granule always contains an integer number of rows. The first row of a granule is marked with the value of the primary key for this row. For each data part, ClickHouse creates an index file that stores the marks. For each column, whether it is in the primary key or not, ClickHouse also stores the same marks. These marks allow finding the data directly in the columns. -You can use a single large table and continually add data to it in small chunks – this is what the `MergeTree` engine is intended for. +The size of a granule is restricted by the `index_granularity` and `index_granularity_bytes` settings of the table engine. The number of rows in granule lays in the `[1, index_granularity]` range, depending on the size of rows. The size of a granule can exceed `index_granularity_bytes` if the size of the single row is greater than the value of the setting. In this case, the size of the granule equals the size of the row. ## Primary Keys and Indexes in Queries {#primary-keys-and-indexes-in-queries} @@ -159,9 +162,9 @@ If the data query specifies: The examples above show that it is always more effective to use an index than a full scan. -A sparse index allows extra data to be read. When reading a single range of the primary key, up to `index_granularity * 2` extra rows in each data block can be read. In most cases, ClickHouse performance does not degrade when `index_granularity = 8192`. +A sparse index allows extra data to be read. When reading a single range of the primary key, up to `index_granularity * 2` extra rows in each data block can be read. -Sparse indexes allow you to work with a very large number of table rows, because such indexes are always stored in the computer's RAM. +Sparse indexes allow you to work with a very large number of table rows, because such indexes fit the computer's RAM in the very most cases. ClickHouse does not require a unique primary key. You can insert multiple rows with the same primary key. 
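[Editorial note, not part of the patch] As a concrete illustration of the granule settings described above (the table and column names are invented for the example), adaptive granularity is controlled per table through the `SETTINGS` clause:

```sql
-- index_granularity caps the number of rows per granule;
-- index_granularity_bytes caps its on-disk size (10 MB here, the documented default).
CREATE TABLE visits_sketch
(
    EventDate Date,
    UserID UInt64,
    URL String
)
ENGINE = MergeTree()
ORDER BY (EventDate, UserID)
SETTINGS index_granularity = 8192, index_granularity_bytes = 10485760;
```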
diff --git a/docs/toc_zh.yml b/docs/toc_zh.yml index d0d73cea636..f90ace045d6 100644 --- a/docs/toc_zh.yml +++ b/docs/toc_zh.yml @@ -209,6 +209,7 @@ nav: - 'ClickHouse架构概述': 'development/architecture.md' - '如何在Linux中编译ClickHouse': 'development/build.md' - '如何在Mac OS X中编译ClickHouse': 'development/build_osx.md' + - 'How to Build ClickHouse on Linux for Mac OS X': 'development/build_cross.md' - '如何编写C++代码': 'development/style.md' - '如何运行ClickHouse测试': 'development/tests.md' - '使用的第三方库': 'development/contrib.md' diff --git a/docs/zh/development/build_cross.md b/docs/zh/development/build_cross.md new file mode 120000 index 00000000000..f595f252de3 --- /dev/null +++ b/docs/zh/development/build_cross.md @@ -0,0 +1 @@ +../../en/development/build_cross.md \ No newline at end of file From df8cb4b6195021c49c3b7d07ede05e78aef3e0ac Mon Sep 17 00:00:00 2001 From: BayoNet Date: Thu, 26 Sep 2019 11:51:48 +0300 Subject: [PATCH 267/309] DOCAPI-7413: EN revew, RU translation. T64 codec docs (#7021) * Update create.md (#32) * DOCAPI-7413: Fixes. * DOCAPI-7413: Fixes. * DOCAPI-7413: RU translation. --- docs/en/query_language/create.md | 26 ++++----- docs/ru/query_language/create.md | 94 ++++++++++++++++++-------------- 2 files changed, 65 insertions(+), 55 deletions(-) diff --git a/docs/en/query_language/create.md b/docs/en/query_language/create.md index 2a8c546233c..0b400adca30 100644 --- a/docs/en/query_language/create.md +++ b/docs/en/query_language/create.md @@ -107,8 +107,6 @@ It is not possible to set default values for elements in nested data structures. ### Constraints {#constraints} -WARNING: This feature is experimental. Correct work is not guaranteed on non-MergeTree family engines. - Along with columns descriptions constraints could be defined: ```sql @@ -125,15 +123,15 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Adding large amount of constraints can negatively affect performance of big `INSERT` queries. -### TTL expression +### TTL Expression Defines storage time for values. Can be specified only for MergeTree-family tables. For the detailed description, see [TTL for columns and tables](../operations/table_engines/mergetree.md#table_engine-mergetree-ttl). -## Column Compression Codecs +### Column Compression Codecs -By default, ClickHouse applies to columns the compression method, defined in [server settings](../operations/server_settings/settings.md#compression). Also, you can define compression method for each individual column in the `CREATE TABLE` query. +By default, ClickHouse applies the compression method, defined in [server settings](../operations/server_settings/settings.md#compression), to columns. You can also define the compression method for each individual column in the `CREATE TABLE` query. -``` +```sql CREATE TABLE codec_example ( dt Date CODEC(ZSTD), @@ -146,12 +144,12 @@ ENGINE = ... ``` -If a codec is specified, the default codec doesn't apply. Codecs can be combined in a pipeline, for example, `CODEC(Delta, ZSTD)`. To select the best codecs combination for you project, pass benchmarks, similar to described in the Altinity [New Encodings to Improve ClickHouse Efficiency](https://www.altinity.com/blog/2019/7/new-encodings-to-improve-clickhouse) article. +If a codec is specified, the default codec doesn't apply. Codecs can be combined in a pipeline, for example, `CODEC(Delta, ZSTD)`. 
To select the best codec combination for you project, pass benchmarks similar to described in the Altinity [New Encodings to Improve ClickHouse Efficiency](https://www.altinity.com/blog/2019/7/new-encodings-to-improve-clickhouse) article. -!!!warning - You cannot decompress ClickHouse database files with external utilities, for example, `lz4`. Use the special utility, [clickhouse-compressor](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/programs/compressor). +!!!warning "Warning" + You can't decompress ClickHouse database files with external utilities like `lz4`. Instead, use the special [clickhouse-compressor](https://github.com/yandex/ClickHouse/tree/master/dbms/programs/compressor) utility. -Compression is supported for the table engines: +Compression is supported for the following table engines: - [MergeTree](../operations/table_engines/mergetree.md) family - [Log](../operations/table_engines/log_family.md) family @@ -160,9 +158,9 @@ Compression is supported for the table engines: ClickHouse supports common purpose codecs and specialized codecs. -### Specialized codecs {#create-query-specialized-codecs} +#### Specialized Codecs {#create-query-specialized-codecs} -These codecs are designed to make compression more effective using specifities of the data. Some of this codecs don't compress data by itself, but they prepare data to be compressed better by common purpose codecs. +These codecs are designed to make compression more effective by using specific features of data. Some of these codecs don't compress data themself. Instead, they prepare the data for a common purpose codec, which compresses it better than without this preparation. Specialized codecs: @@ -182,7 +180,7 @@ CREATE TABLE codec_example ENGINE = MergeTree() ``` -### Common purpose codecs {#create-query-common-purpose-codecs} +#### Common purpose codecs {#create-query-common-purpose-codecs} Codecs: @@ -191,7 +189,7 @@ Codecs: - `LZ4HC[(level)]` — LZ4 HC (high compression) algorithm with configurable level. Default level: 9. Setting `level <= 0` applies the default level. Possible levels: [1, 12]. Recommended level range: [4, 9]. - `ZSTD[(level)]` — [ZSTD compression algorithm](https://en.wikipedia.org/wiki/Zstandard) with configurable `level`. Possible levels: [1, 22]. Default value: 1. -High compression levels useful for asymmetric scenarios, like compress once, decompress a lot of times. Greater levels stands for better compression and higher CPU usage. +High compression levels are useful for asymmetric scenarios, like compress once, decompress repeatedly. Higher levels mean better compression and higher CPU usage. ## Temporary Tables diff --git a/docs/ru/query_language/create.md b/docs/ru/query_language/create.md index c10cfff7685..81994580022 100644 --- a/docs/ru/query_language/create.md +++ b/docs/ru/query_language/create.md @@ -105,9 +105,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = engine AS SELECT ... ### Ограничения (constraints) {#constraints} -WARNING: Находится в экспериментальном режиме, поддержано в MergeTree (работоспособность на других типах движков таблиц не гарантируется). - -Наряду с объявлением столбцов можно объявить ограчения на значения в столбцах таблицы: +Наряду с объявлением столбцов можно объявить ограничения на значения в столбцах таблицы: ```sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] @@ -127,56 +125,70 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Определяет время хранения значений. 
Может быть указано только для таблиц семейства MergeTree. Подробнее смотрите в [TTL для столбцов и таблиц](../operations/table_engines/mergetree.md#table_engine-mergetree-ttl). -## Кодеки сжатия столбцов +### Кодеки сжатия столбцов -Помимо сжатия данных по умолчанию, определяемого [конфигурационными параметрами сервера](../operations/server_settings/settings.md#compression), можно задать сжатие для каждого отдельного столбца. - -Поддерживаемые алгоритмы сжатия: - -- `NONE` — без сжатия. -- `LZ4` — [алгоритм сжатия данных](https://github.com/lz4/lz4) без потерь, используемый по умолчанию. Применяет быстрое сжатие LZ4. -- `LZ4HC[(level)]` — алгоритм сильного сжатия LZ4 HC с настраиваемым уровнем. Уровень по умолчанию — 9. Настройка `level <= 0` устанавливает уровень по умолчанию. Возможные уровни: [1, 12]. Рекомендуемый диапазон уровней: [4, 9]. -- `ZSTD[(level)]` — [Алгоритм сжатия ZSTD](https://en.wikipedia.org/wiki/Zstandard) с настаиваемым уровнем `level`. Возможные уровни: [1, 22]. Значение по умолчанию — 1. -- `Delta(delta_bytes)` — способ сжатия, при котором исходные значения заменяются разностью двух соседних значений. Для хранение разностей используется до `delta_bytes` байтов, т.е. `delta_bytes` — это максимальный размер исходных значений. Возможные значения `delta_bytes` — 1, 2, 4, 8. Значение `delta_bytes` по умолчанию равно `sizeof(type)`, если вычисленный размер равен 1, 2, 4 или 8. Во всех остальных случаях — 1. -- `DoubleDelta` — Сжимает значения вплоть до размера в 1 бит благодаря сохранению разностей. Оптимальные уровни сжатия достигаются для монотонных последовательностей с постоянным шагом, например, временные ряды. Может использоваться с любым типом данных фиксированного размера. Реализует алгоритм, используемый в Gorilla TSDB, расширяя его для поддержки 64-битных типов. Использует 1 дополнительный бит для 32-байтовых значений: 5-битные префиксы вместо 4-битных префиксов. Подробнее смотрите в разделе "Compressing Time Stamps" в [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). -- `Gorilla` — Сжимает значения вплоть до размера в 1 bit. Эффективен при хранении рядов медленно изменяющихся чисел с плавающей запятой, потому, что лучшее сжатие достигается, когда соседние значения бинарно равны. Реализует алгоритм, используемый в Gorilla TSDB, расширяя его для поддержки 64-битных типов. Подробнее смотрите в разделе "Compressing Values" в [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). - -Высокие уровни сжатия полезны для асимметричных сценариев, например, для таких, в которых требуется однократное сжатие и многократная распаковка. Более высокие уровни обеспечивают лучшее сжатие, но более высокое потребление вычислительных ресурсов. - -!!! warning "Предупреждение" - Базу данных ClickHouse не получится распаковать с помощью внешних утилит типа `lz4`. Используйте специальную программу [clickhouse-compressor](https://github.com/ClickHouse/ClickHouse/tree/master/dbms/programs/compressor). - -Пример использования: +По умолчанию, ClickHouse применяет к столбцу метод сжатия, определённый в [конфигурации сервера](../operations/server_settings/settings.md#compression). Кроме этого, можно задать метод сжатия для каждого отдельного столбца в запросе `CREATE TABLE`. 
```sql CREATE TABLE codec_example ( - dt Date CODEC(ZSTD), /* используется уровень сжатия по умолчанию */ + dt Date CODEC(ZSTD), ts DateTime CODEC(LZ4HC), float_value Float32 CODEC(NONE), double_value Float64 CODEC(LZ4HC(9)) -) -ENGINE = MergeTree -PARTITION BY tuple() -ORDER BY dt -``` - -Кодеки можно комбинировать. Если для колонки указана своя последовательность кодеков, то общий табличный кодек не применяется (должен быть указан в последовательности принудительно, если нужен). В примере ниже - оптимизация для хранения timeseries метрик. -Как правило, значения одной и той же метрики `path` не сильно различаются между собой, и выгоднее использовать дельта-компрессию вместо записи всего числа: - -```sql -CREATE TABLE timeseries_example -( - dt Date, - ts DateTime, - path String, value Float32 CODEC(Delta, ZSTD) ) -ENGINE = MergeTree -PARTITION BY dt -ORDER BY (path, ts) +ENGINE = +... ``` +Если задать кодек для столбца, то кодек по умолчанию не применяется. Кодеки можно последовательно комбинировать, например, `CODEC(Delta, ZSTD)`. Чтобы выбрать наиболее подходящую для вашего проекта комбинацию кодеков, необходимо провести сравнительные тесты, подобные тем, что описаны в статье Altinity [New Encodings to Improve ClickHouse Efficiency](https://www.altinity.com/blog/2019/7/new-encodings-to-improve-clickhouse). + +!!!warning "Предупреждение" + Нельзя распаковать базу данных ClickHouse с помощью сторонних утилит наподобие `lz4`. Необходимо использовать специальную утилиту [clickhouse-compressor](https://github.com/yandex/ClickHouse/tree/master/dbms/programs/compressor). + +Сжатие поддерживается для следующих движков таблиц: + +- [MergeTree family](../operations/table_engines/mergetree.md) +- [Log family](../operations/table_engines/log_family.md) +- [Set](../operations/table_engines/set.md) +- [Join](../operations/table_engines/join.md) + +ClickHouse поддерживает кодеки общего назначения и специализированные кодеки. + +#### Специализированные кодеки {#create-query-specialized-codecs} + +Эти кодеки разработаны для того, чтобы, используя особенности данных сделать сжатие более эффективным. Некоторые из этих кодеков не сжимают данные самостоятельно. Они готовят данные для кодеков общего назначения, которые сжимают подготовленные данные эффективнее, чем неподготовленные. + +Специализированные кодеки: + +- `Delta(delta_bytes)` — Метод, в котором исходные значения заменяются разностью двух соседних значений, за исключением первого значения, которое остаётся неизменным. Для хранения разниц используется до `delta_bytes`, т.е. `delta_bytes` — это максимальный размер исходных данных. Возможные значения `delta_bytes`: 1, 2, 4, 8. Значение по умолчанию для `delta_bytes` равно `sizeof(type)`, если результат 1, 2, 4, or 8. Во всех других случаях — 1. +- `DoubleDelta` — Вычисляется разницу от разниц и сохраняет её в компакном бинарном виде. Оптимальная степень сжатия достигается для монотонных последовательностей с постоянным шагом, наподобие временных рядов. Можно использовать с любым типом данных фиксированного размера. Реализует алгоритм, используемый в TSDB Gorilla, поддерживает 64-битные типы данных. Использует 1 дополнительный бит для 32-байтовых значений: 5-битные префиксы вместо 4-битных префиксов. Подробнее читайте в разделе "Compressing Time Stamps" документа [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). +- `Gorilla` — Вычисляет XOR между текущим и предыдущим значением и записывает результат в компактной бинарной форме. 
Еффективно сохраняет ряды медленно изменяющихся чисел с плавающей запятой, поскольку наилучший коэффициен сжатия достигается, если соседние значения одинаковые. Реализует алгоритм, используемый в TSDB Gorilla, адаптируя его для работы с 64-битными значениями. Подробнее читайте в разделе "Compressing Values" документа [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). +- `T64` — Метод сжатия который обрезает неиспользуемые старшие биты целочисленных значений (включая `Enum`, `Date` и `DateTime`). На каждом шаге алгоритма, кодек помещает блок из 64 значений в матрицу 64✕64, транспонирует её, обрезает неиспользуемые биты, а то, что осталось возвращает в виде последовательности. Неиспользуемые биты, это биты, которые не изменяются от минимального к максимальному на всём диапазоне значений куска данных. + +Кодеки `DoubleDelta` и `Gorilla` используются в TSDB Gorilla как компоненты алгоритма сжатия. Подход Gorilla эффективен в сценариях, когда данные представляют собой медленно изменяющиеся во времени величины. Метки времени эффективно сжимаются кодеком `DoubleDelta`, а значения кодеком `Gorilla`. Например, чтобы создать эффективно хранящуюся таблицу, используйте следующую конфигурацию: + +```sql +CREATE TABLE codec_example +( + timestamp DateTime CODEC(DoubleDelta), + slow_values Float32 CODEC(Gorilla) +) +ENGINE = MergeTree() +``` + +#### Кодеки общего назначения {#create-query-common-purpose-codecs} + +Кодеки: + +- `NONE` — без сжатия. +- `LZ4` — [алгоритм сжатия без потерь](https://github.com/lz4/lz4) используемый по умолчанию. Применяет быстрое сжатие LZ4. +- `LZ4HC[(level)]` — алгоритм LZ4 HC (high compression) с настраиваемым уровнем сжатия. Уровень по умолчанию — 9. Настройка `level <= 0` устанавливает уровень сжания по умолчанию. Возможные уровни сжатия: [1, 12]. Рекомендуемый диапазон уровней: [4, 9]. +- `ZSTD[(level)]` — [алгоритм сжатия ZSTD](https://en.wikipedia.org/wiki/Zstandard) с настраиваемым уровнем сжатия `level`. Возможные уровни сжатия: [1, 22]. Уровень сжатия по умолчанию: 1. + +Высокие уровни сжатия полезны для ассимметричных сценариев, подобных "один раз сжал, много раз распаковал". Высокие уровни сжатия подразумеваю лучшее сжатие, но большее использование CPU. 
+ ## Временные таблицы From e2c7b83747641cb39ca90d690931ab9aa6f2a748 Mon Sep 17 00:00:00 2001 From: "alex.lvxin" Date: Thu, 26 Sep 2019 17:14:14 +0800 Subject: [PATCH 268/309] Update mergetree.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix the error content。 --- docs/zh/operations/table_engines/mergetree.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh/operations/table_engines/mergetree.md b/docs/zh/operations/table_engines/mergetree.md index 2afb50af155..4c35f3cf6b9 100644 --- a/docs/zh/operations/table_engines/mergetree.md +++ b/docs/zh/operations/table_engines/mergetree.md @@ -10,7 +10,7 @@ Clickhouse 中最强大的表引擎当属 `MergeTree` (合并树)引擎及 这让你可以创建一个用于快速检索数据的小稀疏索引。 -- 允许使用分区,如果指定了 [主键](custom_partitioning_key.md) 的话。 +- 允许使用分区,如果指定了 [分区键](custom_partitioning_key.md) 的话。 在相同数据集和相同结果集的情况下 ClickHouse 中某些带分区的操作会比普通操作更快。查询中指定了分区键时 ClickHouse 会自动截取分区数据。这也有效增加了查询性能。 From 903f826640e3bc1a2ba834aaba571e22b2320086 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 26 Sep 2019 13:08:38 +0300 Subject: [PATCH 269/309] Remove runtime factory and remove one redundant object --- dbms/src/Dictionaries/CacheDictionary.inc.h | 2 +- dbms/src/Interpreters/Context.cpp | 6 ++-- dbms/src/Interpreters/Context.h | 4 +-- dbms/src/Interpreters/ExternalLoader.cpp | 2 +- dbms/src/Interpreters/ExternalLoader.h | 2 +- dbms/src/Interpreters/IExternalLoadable.cpp | 5 ++- dbms/src/Interpreters/IExternalLoadable.h | 13 ++------ .../Interpreters/IRuntimeComponentsFactory.h | 31 ------------------- .../Interpreters/RuntimeComponentsFactory.h | 11 +++---- 9 files changed, 19 insertions(+), 57 deletions(-) delete mode 100644 dbms/src/Interpreters/IRuntimeComponentsFactory.h diff --git a/dbms/src/Dictionaries/CacheDictionary.inc.h b/dbms/src/Dictionaries/CacheDictionary.inc.h index 51d515a63dd..c10cde8c4fd 100644 --- a/dbms/src/Dictionaries/CacheDictionary.inc.h +++ b/dbms/src/Dictionaries/CacheDictionary.inc.h @@ -331,7 +331,7 @@ void CacheDictionary::update( { ++error_count; last_exception = std::current_exception(); - backoff_end_time = now + std::chrono::seconds(ExternalLoadableBackoff{}.calculateDuration(rnd_engine, error_count)); + backoff_end_time = now + std::chrono::seconds(calculateDurationWithBackoff(rnd_engine, error_count)); tryLogException(last_exception, log, "Could not update cache dictionary '" + getName() + "', next update is scheduled at " + DateLUT::instance().timeToString(std::chrono::system_clock::to_time_t(backoff_end_time))); diff --git a/dbms/src/Interpreters/Context.cpp b/dbms/src/Interpreters/Context.cpp index 086d060c171..df04f0f56f5 100644 --- a/dbms/src/Interpreters/Context.cpp +++ b/dbms/src/Interpreters/Context.cpp @@ -97,7 +97,7 @@ struct ContextShared { Logger * log = &Logger::get("Context"); - std::unique_ptr runtime_components_factory; + std::unique_ptr runtime_components_factory; /// For access of most of shared objects. Recursive mutex. mutable std::recursive_mutex mutex; @@ -210,7 +210,7 @@ struct ContextShared Context::ConfigReloadCallback config_reload_callback; - ContextShared(std::unique_ptr runtime_components_factory_) + ContextShared(std::unique_ptr runtime_components_factory_) : runtime_components_factory(std::move(runtime_components_factory_)), macros(std::make_unique()) { /// TODO: make it singleton (?) 
@@ -318,7 +318,7 @@ Context::Context(const Context &) = default; Context & Context::operator=(const Context &) = default; -Context Context::createGlobal(std::unique_ptr runtime_components_factory) +Context Context::createGlobal(std::unique_ptr runtime_components_factory) { Context res; res.shared = std::make_shared(std::move(runtime_components_factory)); diff --git a/dbms/src/Interpreters/Context.h b/dbms/src/Interpreters/Context.h index 9c001916347..6b074acf056 100644 --- a/dbms/src/Interpreters/Context.h +++ b/dbms/src/Interpreters/Context.h @@ -43,7 +43,7 @@ namespace DB struct ContextShared; class Context; -class IRuntimeComponentsFactory; +class RuntimeComponentsFactory; class QuotaForIntervals; class EmbeddedDictionaries; class ExternalDictionaries; @@ -174,7 +174,7 @@ private: public: /// Create initial Context with ContextShared and etc. - static Context createGlobal(std::unique_ptr runtime_components_factory); + static Context createGlobal(std::unique_ptr runtime_components_factory); static Context createGlobal(); Context(const Context &); diff --git a/dbms/src/Interpreters/ExternalLoader.cpp b/dbms/src/Interpreters/ExternalLoader.cpp index 6e16fd37cba..fb6464fb217 100644 --- a/dbms/src/Interpreters/ExternalLoader.cpp +++ b/dbms/src/Interpreters/ExternalLoader.cpp @@ -985,7 +985,7 @@ public: return std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)}; } - return std::chrono::system_clock::now() + std::chrono::seconds(ExternalLoadableBackoff{}.calculateDuration(rnd_engine, error_count)); + return std::chrono::system_clock::now() + std::chrono::seconds(calculateDurationWithBackoff(rnd_engine, error_count)); } private: diff --git a/dbms/src/Interpreters/ExternalLoader.h b/dbms/src/Interpreters/ExternalLoader.h index ecfc43c2dd9..4976c28d8e6 100644 --- a/dbms/src/Interpreters/ExternalLoader.h +++ b/dbms/src/Interpreters/ExternalLoader.h @@ -29,7 +29,7 @@ struct ExternalLoaderConfigSettings }; -/** Manages user-defined objects. +/** Iterface for manage user-defined objects. * Monitors configuration file and automatically reloads objects in separate threads. * The monitoring thread wakes up every 'check_period_sec' seconds and checks * modification time of objects' configuration file. If said time is greater than diff --git a/dbms/src/Interpreters/IExternalLoadable.cpp b/dbms/src/Interpreters/IExternalLoadable.cpp index e8bf8cbaf3c..18439cf999f 100644 --- a/dbms/src/Interpreters/IExternalLoadable.cpp +++ b/dbms/src/Interpreters/IExternalLoadable.cpp @@ -17,8 +17,11 @@ ExternalLoadableLifetime::ExternalLoadableLifetime(const Poco::Util::AbstractCon } -UInt64 ExternalLoadableBackoff::calculateDuration(pcg64 & rnd_engine, size_t error_count) const +UInt64 calculateDurationWithBackoff(pcg64 & rnd_engine, size_t error_count) { + constexpr UInt64 backoff_initial_sec = 5; + constexpr UInt64 backoff_max_sec = 10 * 60; /// 10 minutes + if (error_count < 1) error_count = 1; std::uniform_int_distribution distribution(0, static_cast(std::exp2(error_count - 1))); diff --git a/dbms/src/Interpreters/IExternalLoadable.h b/dbms/src/Interpreters/IExternalLoadable.h index e842fdb8573..d4b93c56d2a 100644 --- a/dbms/src/Interpreters/IExternalLoadable.h +++ b/dbms/src/Interpreters/IExternalLoadable.h @@ -25,17 +25,8 @@ struct ExternalLoadableLifetime ExternalLoadableLifetime(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix); }; - -/// Delay before trying to load again after error. 
-struct ExternalLoadableBackoff -{ - UInt64 backoff_initial_sec = 5; - UInt64 backoff_max_sec = 10 * 60; /// 10 minutes - - /// Calculates time to try loading again after error. - UInt64 calculateDuration(pcg64 & rnd_engine, size_t error_count = 1) const; -}; - +/// Get delay before trying to load again after error. +UInt64 calculateDurationWithBackoff(pcg64 & rnd_engine, size_t error_count = 1); /// Basic interface for external loadable objects. Is used in ExternalLoader. class IExternalLoadable : public std::enable_shared_from_this, private boost::noncopyable diff --git a/dbms/src/Interpreters/IRuntimeComponentsFactory.h b/dbms/src/Interpreters/IRuntimeComponentsFactory.h deleted file mode 100644 index 1577b6b691d..00000000000 --- a/dbms/src/Interpreters/IRuntimeComponentsFactory.h +++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -#include -#include -#include - -#include - -namespace DB -{ - -/** Factory of query engine runtime components / services. - * Helps to host query engine in external applications - * by replacing or reconfiguring its components. - */ -class IRuntimeComponentsFactory -{ -public: - virtual ~IRuntimeComponentsFactory() = default; - - virtual std::unique_ptr createUsersManager() = 0; - - virtual std::unique_ptr createGeoDictionariesLoader() = 0; - - // Repositories with configurations of user-defined objects (dictionaries, models) - virtual std::unique_ptr createExternalDictionariesConfigRepository() = 0; - - virtual std::unique_ptr createExternalModelsConfigRepository() = 0; -}; - -} diff --git a/dbms/src/Interpreters/RuntimeComponentsFactory.h b/dbms/src/Interpreters/RuntimeComponentsFactory.h index 4c319911b39..e2b8310dd8c 100644 --- a/dbms/src/Interpreters/RuntimeComponentsFactory.h +++ b/dbms/src/Interpreters/RuntimeComponentsFactory.h @@ -2,7 +2,6 @@ #include #include -#include #include namespace DB @@ -11,25 +10,25 @@ namespace DB /** Default implementation of runtime components factory * used by native server application. 
*/ -class RuntimeComponentsFactory : public IRuntimeComponentsFactory +class RuntimeComponentsFactory { public: - std::unique_ptr createUsersManager() override + std::unique_ptr createUsersManager() { return std::make_unique(); } - std::unique_ptr createGeoDictionariesLoader() override + std::unique_ptr createGeoDictionariesLoader() { return std::make_unique(); } - std::unique_ptr createExternalDictionariesConfigRepository() override + std::unique_ptr createExternalDictionariesConfigRepository() { return std::make_unique(); } - std::unique_ptr createExternalModelsConfigRepository() override + std::unique_ptr createExternalModelsConfigRepository() { return std::make_unique(); } From 5668f55ab779b148fe78655ade27c89d499d8acc Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 26 Sep 2019 13:23:14 +0300 Subject: [PATCH 270/309] Rename ExternalModels to ExternalModelsLoader --- .../src/Functions/FunctionsExternalModels.cpp | 8 ++-- dbms/src/Functions/FunctionsExternalModels.h | 6 +-- dbms/src/Interpreters/Context.cpp | 18 ++++---- dbms/src/Interpreters/Context.h | 6 +-- dbms/src/Interpreters/ExternalModels.cpp | 46 ------------------- dbms/src/Interpreters/ExternalModels.h | 40 ---------------- .../Storages/System/StorageSystemModels.cpp | 6 +-- 7 files changed, 22 insertions(+), 108 deletions(-) delete mode 100644 dbms/src/Interpreters/ExternalModels.cpp delete mode 100644 dbms/src/Interpreters/ExternalModels.h diff --git a/dbms/src/Functions/FunctionsExternalModels.cpp b/dbms/src/Functions/FunctionsExternalModels.cpp index 6c96e09bad2..df9c438d4ca 100644 --- a/dbms/src/Functions/FunctionsExternalModels.cpp +++ b/dbms/src/Functions/FunctionsExternalModels.cpp @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include #include @@ -22,7 +22,7 @@ namespace DB FunctionPtr FunctionModelEvaluate::create(const Context & context) { - return std::make_shared(context.getExternalModels()); + return std::make_shared(context.getExternalModelsLoader()); } namespace ErrorCodes @@ -51,7 +51,7 @@ DataTypePtr FunctionModelEvaluate::getReturnTypeImpl(const ColumnsWithTypeAndNam for (size_t i = 1; i < arguments.size(); ++i) has_nullable = has_nullable || arguments[i].type->isNullable(); - auto model = models.getModel(name_col->getValue()); + auto model = models_loader.getModel(name_col->getValue()); auto type = model->getReturnType(); if (has_nullable) @@ -78,7 +78,7 @@ void FunctionModelEvaluate::executeImpl(Block & block, const ColumnNumbers & arg throw Exception("First argument of function " + getName() + " must be a constant string", ErrorCodes::ILLEGAL_COLUMN); - auto model = models.getModel(name_col->getValue()); + auto model = models_loader.getModel(name_col->getValue()); ColumnRawPtrs columns; Columns materialized_columns; diff --git a/dbms/src/Functions/FunctionsExternalModels.h b/dbms/src/Functions/FunctionsExternalModels.h index 210729db478..e46c97b9c63 100644 --- a/dbms/src/Functions/FunctionsExternalModels.h +++ b/dbms/src/Functions/FunctionsExternalModels.h @@ -4,7 +4,7 @@ namespace DB { -class ExternalModels; +class ExternalModelsLoader; /// Evaluate external model. /// First argument - model name, the others - model arguments. 
@@ -17,7 +17,7 @@ public: static FunctionPtr create(const Context & context); - explicit FunctionModelEvaluate(const ExternalModels & models_) : models(models_) {} + explicit FunctionModelEvaluate(const ExternalModelsLoader & models_loader_) : models_loader(models_loader_) {} String getName() const override { return name; } @@ -34,7 +34,7 @@ public: void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override; private: - const ExternalModels & models; + const ExternalModelsLoader & models_loader; }; } diff --git a/dbms/src/Interpreters/Context.cpp b/dbms/src/Interpreters/Context.cpp index df04f0f56f5..3835f1f6aba 100644 --- a/dbms/src/Interpreters/Context.cpp +++ b/dbms/src/Interpreters/Context.cpp @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include @@ -125,7 +125,7 @@ struct ContextShared Databases databases; /// List of databases and tables in them. mutable std::optional embedded_dictionaries; /// Metrica's dictionaries. Have lazy initialization. mutable std::optional external_dictionaries; - mutable std::optional external_models; + mutable std::optional external_models_loader; String default_profile_name; /// Default profile name used for default values. String system_profile_name; /// Profile used by system processes std::unique_ptr users_manager; /// Known users. @@ -283,7 +283,7 @@ struct ContextShared system_logs.reset(); embedded_dictionaries.reset(); external_dictionaries.reset(); - external_models.reset(); + external_models_loader.reset(); background_pool.reset(); schedule_pool.reset(); ddl_worker.reset(); @@ -1339,23 +1339,23 @@ ExternalDictionaries & Context::getExternalDictionaries() } -const ExternalModels & Context::getExternalModels() const +const ExternalModelsLoader & Context::getExternalModelsLoader() const { std::lock_guard lock(shared->external_models_mutex); - if (!shared->external_models) + if (!shared->external_models_loader) { if (!this->global_context) throw Exception("Logical error: there is no global context", ErrorCodes::LOGICAL_ERROR); auto config_repository = shared->runtime_components_factory->createExternalModelsConfigRepository(); - shared->external_models.emplace(std::move(config_repository), *this->global_context); + shared->external_models_loader.emplace(std::move(config_repository), *this->global_context); } - return *shared->external_models; + return *shared->external_models_loader; } -ExternalModels & Context::getExternalModels() +ExternalModelsLoader & Context::getExternalModelsLoader() { - return const_cast(const_cast(this)->getExternalModels()); + return const_cast(const_cast(this)->getExternalModelsLoader()); } diff --git a/dbms/src/Interpreters/Context.h b/dbms/src/Interpreters/Context.h index 6b074acf056..ca5da6c68fc 100644 --- a/dbms/src/Interpreters/Context.h +++ b/dbms/src/Interpreters/Context.h @@ -47,7 +47,7 @@ class RuntimeComponentsFactory; class QuotaForIntervals; class EmbeddedDictionaries; class ExternalDictionaries; -class ExternalModels; +class ExternalModelsLoader; class InterserverIOHandler; class BackgroundProcessingPool; class BackgroundSchedulePool; @@ -322,10 +322,10 @@ public: const EmbeddedDictionaries & getEmbeddedDictionaries() const; const ExternalDictionaries & getExternalDictionaries() const; - const ExternalModels & getExternalModels() const; + const ExternalModelsLoader & getExternalModelsLoader() const; EmbeddedDictionaries & getEmbeddedDictionaries(); ExternalDictionaries & getExternalDictionaries(); - ExternalModels & 
getExternalModels(); + ExternalModelsLoader & getExternalModelsLoader(); void tryCreateEmbeddedDictionaries() const; /// I/O formats. diff --git a/dbms/src/Interpreters/ExternalModels.cpp b/dbms/src/Interpreters/ExternalModels.cpp deleted file mode 100644 index f3c1310410b..00000000000 --- a/dbms/src/Interpreters/ExternalModels.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int INVALID_CONFIG_PARAMETER; -} - - -ExternalModels::ExternalModels( - std::unique_ptr config_repository, - Context & context_) - : ExternalLoader(context_.getConfigRef(), - "external model", - &Logger::get("ExternalModels")), - context(context_) -{ - addConfigRepository(std::move(config_repository), {"model", "name", "models_config"}); - enablePeriodicUpdates(true); -} - -std::shared_ptr ExternalModels::create( - const std::string & name, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix) const -{ - String type = config.getString(config_prefix + ".type"); - ExternalLoadableLifetime lifetime(config, config_prefix + ".lifetime"); - - /// TODO: add models factory. - if (type == "catboost") - { - return std::make_unique( - name, config.getString(config_prefix + ".path"), - context.getConfigRef().getString("catboost_dynamic_library_path"), - lifetime - ); - } - else - { - throw Exception("Unknown model type: " + type, ErrorCodes::INVALID_CONFIG_PARAMETER); - } -} - -} diff --git a/dbms/src/Interpreters/ExternalModels.h b/dbms/src/Interpreters/ExternalModels.h deleted file mode 100644 index 2c4706b0664..00000000000 --- a/dbms/src/Interpreters/ExternalModels.h +++ /dev/null @@ -1,40 +0,0 @@ -#pragma once - -#include -#include -#include -#include - - -namespace DB -{ - -class Context; - -/// Manages user-defined models. -class ExternalModels : public ExternalLoader -{ -public: - using ModelPtr = std::shared_ptr; - - /// Models will be loaded immediately and then will be updated in separate thread, each 'reload_period' seconds. 
- ExternalModels( - std::unique_ptr config_repository, - Context & context_); - - ModelPtr getModel(const std::string & name) const - { - return std::static_pointer_cast(getLoadable(name)); - } - -protected: - LoadablePtr create(const std::string & name, const Poco::Util::AbstractConfiguration & config, - const std::string & key_in_config) const override; - - friend class StorageSystemModels; -private: - - Context & context; -}; - -} diff --git a/dbms/src/Storages/System/StorageSystemModels.cpp b/dbms/src/Storages/System/StorageSystemModels.cpp index 2db690ea4c3..325dd9ebbd0 100644 --- a/dbms/src/Storages/System/StorageSystemModels.cpp +++ b/dbms/src/Storages/System/StorageSystemModels.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include @@ -27,8 +27,8 @@ NamesAndTypesList StorageSystemModels::getNamesAndTypes() void StorageSystemModels::fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo &) const { - const auto & external_models = context.getExternalModels(); - auto load_results = external_models.getCurrentLoadResults(); + const auto & external_models_loader = context.getExternalModelsLoader(); + auto load_results = external_models_loader.getCurrentLoadResults(); for (const auto & [model_name, load_result] : load_results) { From 7c93ef170680151f92d4fd1f324b3694c06eca35 Mon Sep 17 00:00:00 2001 From: Guillaume Tassery Date: Thu, 26 Sep 2019 12:27:22 +0200 Subject: [PATCH 271/309] Add integration tests for auth on HTTP external dictionaries --- .../external_sources.py | 10 ++++++++++ .../http_server.py | 14 ++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/external_sources.py b/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/external_sources.py index 7ff24b4b28c..869fbbb1874 100644 --- a/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/external_sources.py +++ b/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/external_sources.py @@ -331,6 +331,16 @@ class SourceHTTPBase(ExternalSource): {url} TabSeparated + + foo + bar + + +
    + api-key + secret +
    +
    '''.format(url=url) diff --git a/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/http_server.py b/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/http_server.py index e763614ff1c..d8e4865df81 100644 --- a/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/http_server.py +++ b/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/http_server.py @@ -6,12 +6,26 @@ import ssl import csv +# Decorator used to check that authentication works for external dictionaries that use an HTTP source. +def check_auth(fn): + def wrapper(req): + auth_header = req.headers.get('Authorization', None) + api_key = req.headers.get('api-key', None) + if not auth_header or auth_header != 'Zm9vOmJhcg==' or not api_key or api_key != 'secret': + req.send_response(401) + else: + fn(req) + return wrapper + + def start_server(server_address, data_path, schema, cert_path, address_family): class TSVHTTPHandler(BaseHTTPRequestHandler): + @check_auth def do_GET(self): self.__send_headers() self.__send_data() + @check_auth def do_POST(self): ids = self.__read_and_decode_post_ids() print "ids=", ids From 4fdb5a61e58e1821a3da2b1dd1d3518dab93ada0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 26 Sep 2019 13:41:33 +0300 Subject: [PATCH 272/309] Rename ExternalDictionaries to ExternalDictionariesLoader --- dbms/programs/server/Server.cpp | 4 +- dbms/src/Databases/DatabaseDictionary.cpp | 14 +-- .../Functions/FunctionsExternalDictionaries.h | 112 +++++++++--------- dbms/src/Interpreters/Context.cpp | 22 ++-- dbms/src/Interpreters/Context.h | 6 +-- dbms/src/Interpreters/ExpressionAnalyzer.cpp | 2 +- .../src/Interpreters/ExternalDictionaries.cpp | 30 ----- dbms/src/Interpreters/ExternalDictionaries.h | 48 -------- .../Interpreters/InterpreterSystemQuery.cpp | 6 +- dbms/src/Interpreters/SyntaxAnalyzer.cpp | 4 +- dbms/src/Storages/StorageDictionary.cpp | 6 +- .../System/StorageSystemDictionaries.cpp | 4 +- 12 files changed, 90 insertions(+), 168 deletions(-) delete mode 100644 dbms/src/Interpreters/ExternalDictionaries.cpp delete mode 100644 dbms/src/Interpreters/ExternalDictionaries.h diff --git a/dbms/programs/server/Server.cpp b/dbms/programs/server/Server.cpp index 999ef796ca3..84097fe4d7e 100644 --- a/dbms/programs/server/Server.cpp +++ b/dbms/programs/server/Server.cpp @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include @@ -918,7 +918,7 @@ int Server::main(const std::vector & /*args*/) if (!config().getBool("dictionaries_lazy_load", true)) { global_context->tryCreateEmbeddedDictionaries(); - global_context->getExternalDictionaries().enableAlwaysLoadEverything(true); + global_context->getExternalDictionariesLoader().enableAlwaysLoadEverything(true); } } catch (...) diff --git a/dbms/src/Databases/DatabaseDictionary.cpp b/dbms/src/Databases/DatabaseDictionary.cpp index 2bb9bd30238..aecc1b9125f 100644 --- a/dbms/src/Databases/DatabaseDictionary.cpp +++ b/dbms/src/Databases/DatabaseDictionary.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include @@ -38,12 +38,12 @@ Tables DatabaseDictionary::listTables(const Context & context, const FilterByNam if (filter_by_name) { /// If `filter_by_name` is set, we iterate through all dictionaries with such names. That's why we need to load all of them. 
- loadables = context.getExternalDictionaries().loadAndGet(filter_by_name); + loadables = context.getExternalDictionariesLoader().loadAndGet(filter_by_name); } else { /// If `filter_by_name` isn't set, we iterate through only already loaded dictionaries. We don't try to load all dictionaries in this case. - loadables = context.getExternalDictionaries().getCurrentlyLoadedObjects(); + loadables = context.getExternalDictionariesLoader().getCurrentlyLoadedObjects(); } for (const auto & loadable : loadables) @@ -61,14 +61,14 @@ bool DatabaseDictionary::isTableExist( const Context & context, const String & table_name) const { - return context.getExternalDictionaries().getCurrentStatus(table_name) != ExternalLoader::Status::NOT_EXIST; + return context.getExternalDictionariesLoader().getCurrentStatus(table_name) != ExternalLoader::Status::NOT_EXIST; } StoragePtr DatabaseDictionary::tryGetTable( const Context & context, const String & table_name) const { - auto dict_ptr = context.getExternalDictionaries().tryGetDictionary(table_name); + auto dict_ptr = context.getExternalDictionariesLoader().tryGetDictionary(table_name); if (dict_ptr) { const DictionaryStructure & dictionary_structure = dict_ptr->getStructure(); @@ -86,7 +86,7 @@ DatabaseIteratorPtr DatabaseDictionary::getIterator(const Context & context, con bool DatabaseDictionary::empty(const Context & context) const { - return !context.getExternalDictionaries().hasCurrentlyLoadedObjects(); + return !context.getExternalDictionariesLoader().hasCurrentlyLoadedObjects(); } StoragePtr DatabaseDictionary::detachTable(const String & /*table_name*/) @@ -129,7 +129,7 @@ ASTPtr DatabaseDictionary::getCreateTableQueryImpl(const Context & context, { WriteBufferFromString buffer(query); - const auto & dictionaries = context.getExternalDictionaries(); + const auto & dictionaries = context.getExternalDictionariesLoader(); auto dictionary = throw_on_error ? dictionaries.getDictionary(table_name) : dictionaries.tryGetDictionary(table_name); diff --git a/dbms/src/Functions/FunctionsExternalDictionaries.h b/dbms/src/Functions/FunctionsExternalDictionaries.h index 231a4be12b2..a47dacf5deb 100644 --- a/dbms/src/Functions/FunctionsExternalDictionaries.h +++ b/dbms/src/Functions/FunctionsExternalDictionaries.h @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include @@ -51,7 +51,7 @@ namespace ErrorCodes extern const int DICTIONARY_ACCESS_DENIED; } -/** Functions that use plug-ins (external) dictionaries. +/** Functions that use plug-ins (external) dictionaries_loader. * * Get the value of the attribute of the specified type. 
* dictGetType(dictionary, attribute, id), @@ -73,11 +73,11 @@ public: static FunctionPtr create(const Context & context) { - return std::make_shared(context.getExternalDictionaries(), context); + return std::make_shared(context.getExternalDictionariesLoader(), context); } - FunctionDictHas(const ExternalDictionaries & dictionaries_, const Context & context_) - : dictionaries(dictionaries_) + FunctionDictHas(const ExternalDictionariesLoader & dictionaries_loader_, const Context & context_) + : dictionaries_loader(dictionaries_loader_) , context(context_) {} String getName() const override { return name; } @@ -124,7 +124,7 @@ private: return; } - auto dict = dictionaries.getDictionary(dict_name_col->getValue()); + auto dict = dictionaries_loader.getDictionary(dict_name_col->getValue()); const auto dict_ptr = dict.get(); if (!context.hasDictionaryAccessRights(dict_ptr->getName())) @@ -191,12 +191,12 @@ private: return true; } - const ExternalDictionaries & dictionaries; + const ExternalDictionariesLoader & dictionaries_loader; const Context & context; }; -static bool isDictGetFunctionInjective(const ExternalDictionaries & dictionaries, const Block & sample_block) +static bool isDictGetFunctionInjective(const ExternalDictionariesLoader & dictionaries_loader, const Block & sample_block) { if (sample_block.columns() != 3 && sample_block.columns() != 4) throw Exception{"Function dictGet... takes 3 or 4 arguments", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH}; @@ -209,7 +209,7 @@ static bool isDictGetFunctionInjective(const ExternalDictionaries & dictionaries if (!attr_name_col) throw Exception{"Second argument of function dictGet... must be a constant string", ErrorCodes::ILLEGAL_COLUMN}; - return dictionaries.getDictionary(dict_name_col->getValue())->isInjective(attr_name_col->getValue()); + return dictionaries_loader.getDictionary(dict_name_col->getValue())->isInjective(attr_name_col->getValue()); } @@ -227,11 +227,11 @@ public: static FunctionPtr create(const Context & context) { - return std::make_shared(context.getExternalDictionaries(), context); + return std::make_shared(context.getExternalDictionariesLoader(), context); } - FunctionDictGetString(const ExternalDictionaries & dictionaries_, const Context & context_) - : dictionaries(dictionaries_) + FunctionDictGetString(const ExternalDictionariesLoader & dictionaries_loader_, const Context & context_) + : dictionaries_loader(dictionaries_loader_) , context(context_) {} String getName() const override { return name; } @@ -245,7 +245,7 @@ private: bool isInjective(const Block & sample_block) override { - return isDictGetFunctionInjective(dictionaries, sample_block); + return isDictGetFunctionInjective(dictionaries_loader, sample_block); } DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override @@ -273,7 +273,7 @@ private: + ", must be UInt64 or tuple(...).", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; } - /// This is for the case of range dictionaries. + /// This is for the case of range dictionaries_loader. 
if (arguments.size() == 4 && !arguments[3]->isValueRepresentedByInteger()) { throw Exception{"Illegal type " + arguments[3]->getName() + @@ -299,7 +299,7 @@ private: return; } - auto dict = dictionaries.getDictionary(dict_name_col->getValue()); + auto dict = dictionaries_loader.getDictionary(dict_name_col->getValue()); const auto dict_ptr = dict.get(); if (!context.hasDictionaryAccessRights(dict_ptr->getName())) @@ -368,7 +368,7 @@ private: String attr_name = attr_name_col->getValue(); const ColumnWithTypeAndName & key_col_with_type = block.getByPosition(arguments[2]); - /// Functions in external dictionaries only support full-value (not constant) columns with keys. + /// Functions in external dictionaries_loader only support full-value (not constant) columns with keys. ColumnPtr key_col = key_col_with_type.column->convertToFullColumnIfConst(); if (checkColumn(key_col.get())) @@ -419,7 +419,7 @@ private: return true; } - const ExternalDictionaries & dictionaries; + const ExternalDictionariesLoader & dictionaries_loader; const Context & context; }; @@ -431,11 +431,11 @@ public: static FunctionPtr create(const Context & context) { - return std::make_shared(context.getExternalDictionaries(), context); + return std::make_shared(context.getExternalDictionariesLoader(), context); } - FunctionDictGetStringOrDefault(const ExternalDictionaries & dictionaries_, const Context & context_) - : dictionaries(dictionaries_) + FunctionDictGetStringOrDefault(const ExternalDictionariesLoader & dictionaries_loader_, const Context & context_) + : dictionaries_loader(dictionaries_loader_) , context(context_) {} String getName() const override { return name; } @@ -485,7 +485,7 @@ private: return; } - auto dict = dictionaries.getDictionary(dict_name_col->getValue()); + auto dict = dictionaries_loader.getDictionary(dict_name_col->getValue()); const auto dict_ptr = dict.get(); if (!context.hasDictionaryAccessRights(dict_ptr->getName())) @@ -606,7 +606,7 @@ private: String attr_name = attr_name_col->getValue(); const ColumnWithTypeAndName & key_col_with_type = block.getByPosition(arguments[2]); - /// Functions in external dictionaries only support full-value (not constant) columns with keys. + /// Functions in external dictionaries_loader only support full-value (not constant) columns with keys. 
ColumnPtr key_col = key_col_with_type.column->convertToFullColumnIfConst(); const auto & key_columns = typeid_cast(*key_col).getColumnsCopy(); @@ -631,7 +631,7 @@ private: return true; } - const ExternalDictionaries & dictionaries; + const ExternalDictionariesLoader & dictionaries_loader; const Context & context; }; @@ -755,11 +755,11 @@ public: static FunctionPtr create(const Context & context, UInt32 dec_scale = 0) { - return std::make_shared(context.getExternalDictionaries(), context, dec_scale); + return std::make_shared(context.getExternalDictionariesLoader(), context, dec_scale); } - FunctionDictGet(const ExternalDictionaries & dictionaries_, const Context & context_, UInt32 dec_scale = 0) - : dictionaries(dictionaries_) + FunctionDictGet(const ExternalDictionariesLoader & dictionaries_loader_, const Context & context_, UInt32 dec_scale = 0) + : dictionaries_loader(dictionaries_loader_) , context(context_) , decimal_scale(dec_scale) {} @@ -775,7 +775,7 @@ private: bool isInjective(const Block & sample_block) override { - return isDictGetFunctionInjective(dictionaries, sample_block); + return isDictGetFunctionInjective(dictionaries_loader, sample_block); } DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override @@ -827,7 +827,7 @@ private: return; } - auto dict = dictionaries.getDictionary(dict_name_col->getValue()); + auto dict = dictionaries_loader.getDictionary(dict_name_col->getValue()); const auto dict_ptr = dict.get(); if (!context.hasDictionaryAccessRights(dict_ptr->getName())) @@ -922,7 +922,7 @@ private: const ColumnWithTypeAndName & key_col_with_type = block.getByPosition(arguments[2]); - /// Functions in external dictionaries only support full-value (not constant) columns with keys. + /// Functions in external dictionaries_loader only support full-value (not constant) columns with keys. ColumnPtr key_col = key_col_with_type.column->convertToFullColumnIfConst(); if (checkColumn(key_col.get())) @@ -983,7 +983,7 @@ private: return true; } - const ExternalDictionaries & dictionaries; + const ExternalDictionariesLoader & dictionaries_loader; const Context & context; UInt32 decimal_scale; }; @@ -1034,11 +1034,11 @@ public: static FunctionPtr create(const Context & context, UInt32 dec_scale = 0) { - return std::make_shared(context.getExternalDictionaries(), context, dec_scale); + return std::make_shared(context.getExternalDictionariesLoader(), context, dec_scale); } - FunctionDictGetOrDefault(const ExternalDictionaries & dictionaries_, const Context & context_, UInt32 dec_scale = 0) - : dictionaries(dictionaries_) + FunctionDictGetOrDefault(const ExternalDictionariesLoader & dictionaries_loader_, const Context & context_, UInt32 dec_scale = 0) + : dictionaries_loader(dictionaries_loader_) , context(context_) , decimal_scale(dec_scale) {} @@ -1091,7 +1091,7 @@ private: return; } - auto dict = dictionaries.getDictionary(dict_name_col->getValue()); + auto dict = dictionaries_loader.getDictionary(dict_name_col->getValue()); const auto dict_ptr = dict.get(); if (!context.hasDictionaryAccessRights(dict_ptr->getName())) @@ -1248,7 +1248,7 @@ private: const ColumnWithTypeAndName & key_col_with_type = block.getByPosition(arguments[2]); - /// Functions in external dictionaries only support full-value (not constant) columns with keys. + /// Functions in external dictionaries_loader only support full-value (not constant) columns with keys. 
ColumnPtr key_col = key_col_with_type.column->convertToFullColumnIfConst(); const auto & key_columns = typeid_cast(*key_col).getColumnsCopy(); @@ -1284,7 +1284,7 @@ private: return true; } - const ExternalDictionaries & dictionaries; + const ExternalDictionariesLoader & dictionaries_loader; const Context & context; UInt32 decimal_scale; }; @@ -1332,10 +1332,10 @@ public: static FunctionPtr create(const Context & context) { - return std::make_shared(context.getExternalDictionaries(), context); + return std::make_shared(context.getExternalDictionariesLoader(), context); } - FunctionDictGetNoType(const ExternalDictionaries & dictionaries_, const Context & context_) : dictionaries(dictionaries_), context(context_) {} + FunctionDictGetNoType(const ExternalDictionariesLoader & dictionaries_loader_, const Context & context_) : dictionaries_loader(dictionaries_loader_), context(context_) {} String getName() const override { return name; } @@ -1348,7 +1348,7 @@ private: bool isInjective(const Block & sample_block) override { - return isDictGetFunctionInjective(dictionaries, sample_block); + return isDictGetFunctionInjective(dictionaries_loader, sample_block); } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override @@ -1388,7 +1388,7 @@ private: + ", must be convertible to " + TypeName::get() + ".", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; } - auto dict = dictionaries.getDictionary(dict_name); + auto dict = dictionaries_loader.getDictionary(dict_name); const DictionaryStructure & structure = dict->getStructure(); for (const auto idx : ext::range(0, structure.attributes.size())) @@ -1468,7 +1468,7 @@ private: } private: - const ExternalDictionaries & dictionaries; + const ExternalDictionariesLoader & dictionaries_loader; const Context & context; mutable FunctionPtr impl; // underlying function used by dictGet function without explicit type info }; @@ -1481,10 +1481,10 @@ public: static FunctionPtr create(const Context & context) { - return std::make_shared(context.getExternalDictionaries(), context); + return std::make_shared(context.getExternalDictionariesLoader(), context); } - FunctionDictGetNoTypeOrDefault(const ExternalDictionaries & dictionaries_, const Context & context_) : dictionaries(dictionaries_), context(context_) {} + FunctionDictGetNoTypeOrDefault(const ExternalDictionariesLoader & dictionaries_loader_, const Context & context_) : dictionaries_loader(dictionaries_loader_), context(context_) {} String getName() const override { return name; } @@ -1496,7 +1496,7 @@ private: bool isInjective(const Block & sample_block) override { - return isDictGetFunctionInjective(dictionaries, sample_block); + return isDictGetFunctionInjective(dictionaries_loader, sample_block); } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override @@ -1524,7 +1524,7 @@ private: throw Exception{"Illegal type " + arguments[2].type->getName() + " of third argument of function " + getName() + ", must be UInt64 or tuple(...).", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; - auto dict = dictionaries.getDictionary(dict_name); + auto dict = dictionaries_loader.getDictionary(dict_name); const DictionaryStructure & structure = dict->getStructure(); for (const auto idx : ext::range(0, structure.attributes.size())) @@ -1610,7 +1610,7 @@ private: } private: - const ExternalDictionaries & dictionaries; + const ExternalDictionariesLoader & dictionaries_loader; const Context & context; mutable FunctionPtr impl; // underlying function used by dictGet function without explicit 
type info }; @@ -1624,11 +1624,11 @@ public: static FunctionPtr create(const Context & context) { - return std::make_shared(context.getExternalDictionaries(), context); + return std::make_shared(context.getExternalDictionariesLoader(), context); } - FunctionDictGetHierarchy(const ExternalDictionaries & dictionaries_, const Context & context_) - : dictionaries(dictionaries_) + FunctionDictGetHierarchy(const ExternalDictionariesLoader & dictionaries_loader_, const Context & context_) + : dictionaries_loader(dictionaries_loader_) , context(context_) {} String getName() const override { return name; } @@ -1668,7 +1668,7 @@ private: return; } - auto dict = dictionaries.getDictionary(dict_name_col->getValue()); + auto dict = dictionaries_loader.getDictionary(dict_name_col->getValue()); const auto dict_ptr = dict.get(); if (!context.hasDictionaryAccessRights(dict_ptr->getName())) @@ -1778,7 +1778,7 @@ private: return true; } - const ExternalDictionaries & dictionaries; + const ExternalDictionariesLoader & dictionaries_loader; const Context & context; }; @@ -1790,11 +1790,11 @@ public: static FunctionPtr create(const Context & context) { - return std::make_shared(context.getExternalDictionaries(), context); + return std::make_shared(context.getExternalDictionariesLoader(), context); } - FunctionDictIsIn(const ExternalDictionaries & dictionaries_, const Context & context_) - : dictionaries(dictionaries_) + FunctionDictIsIn(const ExternalDictionariesLoader & dictionaries_loader_, const Context & context_) + : dictionaries_loader(dictionaries_loader_) , context(context_) {} String getName() const override { return name; } @@ -1837,7 +1837,7 @@ private: return; } - auto dict = dictionaries.getDictionary(dict_name_col->getValue()); + auto dict = dictionaries_loader.getDictionary(dict_name_col->getValue()); const auto dict_ptr = dict.get(); if (!context.hasDictionaryAccessRights(dict_ptr->getName())) @@ -1949,7 +1949,7 @@ private: return true; } - const ExternalDictionaries & dictionaries; + const ExternalDictionariesLoader & dictionaries_loader; const Context & context; }; diff --git a/dbms/src/Interpreters/Context.cpp b/dbms/src/Interpreters/Context.cpp index 3835f1f6aba..92a0fbf5273 100644 --- a/dbms/src/Interpreters/Context.cpp +++ b/dbms/src/Interpreters/Context.cpp @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include @@ -124,7 +124,7 @@ struct ContextShared Databases databases; /// List of databases and tables in them. mutable std::optional embedded_dictionaries; /// Metrica's dictionaries. Have lazy initialization. - mutable std::optional external_dictionaries; + mutable std::optional external_dictionaries_loader; mutable std::optional external_models_loader; String default_profile_name; /// Default profile name used for default values. 
String system_profile_name; /// Profile used by system processes @@ -282,7 +282,7 @@ struct ContextShared system_logs.reset(); embedded_dictionaries.reset(); - external_dictionaries.reset(); + external_dictionaries_loader.reset(); external_models_loader.reset(); background_pool.reset(); schedule_pool.reset(); @@ -1312,30 +1312,30 @@ EmbeddedDictionaries & Context::getEmbeddedDictionaries() } -const ExternalDictionaries & Context::getExternalDictionaries() const +const ExternalDictionariesLoader & Context::getExternalDictionariesLoader() const { { std::lock_guard lock(shared->external_dictionaries_mutex); - if (shared->external_dictionaries) - return *shared->external_dictionaries; + if (shared->external_dictionaries_loader) + return *shared->external_dictionaries_loader; } const auto & config = getConfigRef(); std::lock_guard lock(shared->external_dictionaries_mutex); - if (!shared->external_dictionaries) + if (!shared->external_dictionaries_loader) { if (!this->global_context) throw Exception("Logical error: there is no global context", ErrorCodes::LOGICAL_ERROR); auto config_repository = shared->runtime_components_factory->createExternalDictionariesConfigRepository(); - shared->external_dictionaries.emplace(std::move(config_repository), config, *this->global_context); + shared->external_dictionaries_loader.emplace(std::move(config_repository), config, *this->global_context); } - return *shared->external_dictionaries; + return *shared->external_dictionaries_loader; } -ExternalDictionaries & Context::getExternalDictionaries() +ExternalDictionariesLoader & Context::getExternalDictionariesLoader() { - return const_cast(const_cast(this)->getExternalDictionaries()); + return const_cast(const_cast(this)->getExternalDictionariesLoader()); } diff --git a/dbms/src/Interpreters/Context.h b/dbms/src/Interpreters/Context.h index ca5da6c68fc..4dd70a55e05 100644 --- a/dbms/src/Interpreters/Context.h +++ b/dbms/src/Interpreters/Context.h @@ -46,7 +46,7 @@ class Context; class RuntimeComponentsFactory; class QuotaForIntervals; class EmbeddedDictionaries; -class ExternalDictionaries; +class ExternalDictionariesLoader; class ExternalModelsLoader; class InterserverIOHandler; class BackgroundProcessingPool; @@ -321,10 +321,10 @@ public: void checkSettingsConstraints(const SettingsChanges & changes); const EmbeddedDictionaries & getEmbeddedDictionaries() const; - const ExternalDictionaries & getExternalDictionaries() const; + const ExternalDictionariesLoader & getExternalDictionariesLoader() const; const ExternalModelsLoader & getExternalModelsLoader() const; EmbeddedDictionaries & getEmbeddedDictionaries(); - ExternalDictionaries & getExternalDictionaries(); + ExternalDictionariesLoader & getExternalDictionariesLoader(); ExternalModelsLoader & getExternalModelsLoader(); void tryCreateEmbeddedDictionaries() const; diff --git a/dbms/src/Interpreters/ExpressionAnalyzer.cpp b/dbms/src/Interpreters/ExpressionAnalyzer.cpp index e4e00375829..50f5e3a0b4b 100644 --- a/dbms/src/Interpreters/ExpressionAnalyzer.cpp +++ b/dbms/src/Interpreters/ExpressionAnalyzer.cpp @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/dbms/src/Interpreters/ExternalDictionaries.cpp b/dbms/src/Interpreters/ExternalDictionaries.cpp deleted file mode 100644 index e1cbd377978..00000000000 --- a/dbms/src/Interpreters/ExternalDictionaries.cpp +++ /dev/null @@ -1,30 +0,0 @@ -#include -#include -#include - -namespace DB -{ - -/// Must not acquire Context lock in constructor to avoid possibility 
of deadlocks. -ExternalDictionaries::ExternalDictionaries( - std::unique_ptr config_repository, - const Poco::Util::AbstractConfiguration & config, - Context & context_) - : ExternalLoader(config, - "external dictionary", - &Logger::get("ExternalDictionaries")), - context(context_) -{ - addConfigRepository(std::move(config_repository), {"dictionary", "name", "dictionaries_config"}); - enableAsyncLoading(true); - enablePeriodicUpdates(true); -} - - -ExternalLoader::LoadablePtr ExternalDictionaries::create( - const std::string & name, const Poco::Util::AbstractConfiguration & config, const std::string & key_in_config) const -{ - return DictionaryFactory::instance().create(name, config, key_in_config, context); -} - -} diff --git a/dbms/src/Interpreters/ExternalDictionaries.h b/dbms/src/Interpreters/ExternalDictionaries.h deleted file mode 100644 index c071349cc97..00000000000 --- a/dbms/src/Interpreters/ExternalDictionaries.h +++ /dev/null @@ -1,48 +0,0 @@ -#pragma once - -#include -#include -#include -#include - - -namespace DB -{ - -class Context; - -/// Manages user-defined dictionaries. -class ExternalDictionaries : public ExternalLoader -{ -public: - using DictPtr = std::shared_ptr; - - /// Dictionaries will be loaded immediately and then will be updated in separate thread, each 'reload_period' seconds. - ExternalDictionaries( - std::unique_ptr config_repository, - const Poco::Util::AbstractConfiguration & config, - Context & context_); - - DictPtr getDictionary(const std::string & name) const - { - return std::static_pointer_cast(getLoadable(name)); - } - - DictPtr tryGetDictionary(const std::string & name) const - { - return std::static_pointer_cast(tryGetLoadable(name)); - } - -protected: - LoadablePtr create(const std::string & name, const Poco::Util::AbstractConfiguration & config, - const std::string & key_in_config) const override; - - friend class StorageSystemDictionaries; - friend class DatabaseDictionary; - -private: - - Context & context; -}; - -} diff --git a/dbms/src/Interpreters/InterpreterSystemQuery.cpp b/dbms/src/Interpreters/InterpreterSystemQuery.cpp index 6d4ad1219f9..d4cdf10fd63 100644 --- a/dbms/src/Interpreters/InterpreterSystemQuery.cpp +++ b/dbms/src/Interpreters/InterpreterSystemQuery.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include @@ -165,11 +165,11 @@ BlockIO InterpreterSystemQuery::execute() break; #endif case Type::RELOAD_DICTIONARY: - system_context.getExternalDictionaries().reload(query.target_dictionary, true /* load the dictionary even if it wasn't loading before */); + system_context.getExternalDictionariesLoader().reload(query.target_dictionary, true /* load the dictionary even if it wasn't loading before */); break; case Type::RELOAD_DICTIONARIES: executeCommandsAndThrowIfError( - [&] () { system_context.getExternalDictionaries().reload(); }, + [&] () { system_context.getExternalDictionariesLoader().reload(); }, [&] () { system_context.getEmbeddedDictionaries().reload(); } ); break; diff --git a/dbms/src/Interpreters/SyntaxAnalyzer.cpp b/dbms/src/Interpreters/SyntaxAnalyzer.cpp index 81b22379e02..89b7343d894 100644 --- a/dbms/src/Interpreters/SyntaxAnalyzer.cpp +++ b/dbms/src/Interpreters/SyntaxAnalyzer.cpp @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -315,7 +315,7 @@ void optimizeGroupBy(ASTSelectQuery * select_query, const NameSet & source_colum } const auto & dict_name = function->arguments->children[0]->as().value.safeGet(); - const auto & dict_ptr = 
context.getExternalDictionaries().getDictionary(dict_name); + const auto & dict_ptr = context.getExternalDictionariesLoader().getDictionary(dict_name); const auto & attr_name = function->arguments->children[1]->as().value.safeGet(); if (!dict_ptr->isInjective(attr_name)) diff --git a/dbms/src/Storages/StorageDictionary.cpp b/dbms/src/Storages/StorageDictionary.cpp index ced0025e36c..6fb2a774812 100644 --- a/dbms/src/Storages/StorageDictionary.cpp +++ b/dbms/src/Storages/StorageDictionary.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include @@ -39,7 +39,7 @@ StorageDictionary::StorageDictionary( if (!attach) { - const auto & dictionary = context.getExternalDictionaries().getDictionary(dictionary_name); + const auto & dictionary = context.getExternalDictionariesLoader().getDictionary(dictionary_name); const DictionaryStructure & dictionary_structure = dictionary->getStructure(); checkNamesAndTypesCompatibleWithDictionary(dictionary_structure); } @@ -53,7 +53,7 @@ BlockInputStreams StorageDictionary::read( const size_t max_block_size, const unsigned /*threads*/) { - auto dictionary = context.getExternalDictionaries().getDictionary(dictionary_name); + auto dictionary = context.getExternalDictionariesLoader().getDictionary(dictionary_name); return BlockInputStreams{dictionary->getBlockInputStream(column_names, max_block_size)}; } diff --git a/dbms/src/Storages/System/StorageSystemDictionaries.cpp b/dbms/src/Storages/System/StorageSystemDictionaries.cpp index 826bb601609..c6f7d4ac9ae 100644 --- a/dbms/src/Storages/System/StorageSystemDictionaries.cpp +++ b/dbms/src/Storages/System/StorageSystemDictionaries.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include @@ -41,7 +41,7 @@ NamesAndTypesList StorageSystemDictionaries::getNamesAndTypes() void StorageSystemDictionaries::fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo &) const { - const auto & external_dictionaries = context.getExternalDictionaries(); + const auto & external_dictionaries = context.getExternalDictionariesLoader(); for (const auto & [dict_name, load_result] : external_dictionaries.getCurrentLoadResults()) { size_t i = 0; From 3a5c644f2ec8e50a24919e91b9c2b45031061f6e Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Thu, 26 Sep 2019 14:18:10 +0300 Subject: [PATCH 273/309] Add link to Singapore meetup --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 93e4af1a43d..8c1a600eb7a 100644 --- a/README.md +++ b/README.md @@ -21,4 +21,4 @@ ClickHouse is an open-source column-oriented database management system that all * [ClickHouse Meetup in Tokyo](https://clickhouse.connpass.com/event/147001/) on November 14. * [ClickHouse Meetup in Istanbul](https://www.eventbrite.com/e/clickhouse-meetup-istanbul-create-blazing-fast-experiences-w-clickhouse-tickets-73101120419) on November 19. * [ClickHouse Meetup in Ankara](https://www.eventbrite.com/e/clickhouse-meetup-ankara-create-blazing-fast-experiences-w-clickhouse-tickets-73100530655) on November 21. - +* [ClickHouse Meetup in Singapore](https://www.meetup.com/Singapore-Clickhouse-Meetup-Group/events/265085331/) on November 23. 
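The loader renames in the patches above are internal to the server; the SQL entry points they serve keep the same syntax. A rough sketch of those entry points, using a hypothetical dictionary `geo_dict` with an attribute `country` standing in for whatever is declared in `dictionaries_config`:

```sql
-- dictGet resolves the dictionary through Context::getExternalDictionariesLoader()
SELECT dictGet('geo_dict', 'country', toUInt64(42));

-- InterpreterSystemQuery reloads a single dictionary via the same loader
SYSTEM RELOAD DICTIONARY geo_dict;

-- StorageSystemDictionaries exposes the loader's current load results
SELECT * FROM system.dictionaries;
```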
From 089e3146c9a7a622e4812e645c7a8769aed806e0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 26 Sep 2019 14:19:10 +0300 Subject: [PATCH 274/309] Add missed files --- .../ExternalDictionariesLoader.cpp | 30 ++++++++++++ .../Interpreters/ExternalDictionariesLoader.h | 48 +++++++++++++++++++ .../src/Interpreters/ExternalModelsLoader.cpp | 46 ++++++++++++++++++ dbms/src/Interpreters/ExternalModelsLoader.h | 40 ++++++++++++++++ 4 files changed, 164 insertions(+) create mode 100644 dbms/src/Interpreters/ExternalDictionariesLoader.cpp create mode 100644 dbms/src/Interpreters/ExternalDictionariesLoader.h create mode 100644 dbms/src/Interpreters/ExternalModelsLoader.cpp create mode 100644 dbms/src/Interpreters/ExternalModelsLoader.h diff --git a/dbms/src/Interpreters/ExternalDictionariesLoader.cpp b/dbms/src/Interpreters/ExternalDictionariesLoader.cpp new file mode 100644 index 00000000000..4cf05491a04 --- /dev/null +++ b/dbms/src/Interpreters/ExternalDictionariesLoader.cpp @@ -0,0 +1,30 @@ +#include +#include +#include + +namespace DB +{ + +/// Must not acquire Context lock in constructor to avoid possibility of deadlocks. +ExternalDictionariesLoader::ExternalDictionariesLoader( + std::unique_ptr config_repository, + const Poco::Util::AbstractConfiguration & config, + Context & context_) + : ExternalLoader(config, + "external dictionary", + &Logger::get("ExternalDictionariesLoader")), + context(context_) +{ + addConfigRepository(std::move(config_repository), {"dictionary", "name", "dictionaries_config"}); + enableAsyncLoading(true); + enablePeriodicUpdates(true); +} + + +ExternalLoader::LoadablePtr ExternalDictionariesLoader::create( + const std::string & name, const Poco::Util::AbstractConfiguration & config, const std::string & key_in_config) const +{ + return DictionaryFactory::instance().create(name, config, key_in_config, context); +} + +} diff --git a/dbms/src/Interpreters/ExternalDictionariesLoader.h b/dbms/src/Interpreters/ExternalDictionariesLoader.h new file mode 100644 index 00000000000..f815e2e6945 --- /dev/null +++ b/dbms/src/Interpreters/ExternalDictionariesLoader.h @@ -0,0 +1,48 @@ +#pragma once + +#include +#include +#include +#include + + +namespace DB +{ + +class Context; + +/// Manages user-defined dictionaries. +class ExternalDictionariesLoader : public ExternalLoader +{ +public: + using DictPtr = std::shared_ptr; + + /// Dictionaries will be loaded immediately and then will be updated in separate thread, each 'reload_period' seconds. 
+ ExternalDictionariesLoader( + std::unique_ptr config_repository, + const Poco::Util::AbstractConfiguration & config, + Context & context_); + + DictPtr getDictionary(const std::string & name) const + { + return std::static_pointer_cast(getLoadable(name)); + } + + DictPtr tryGetDictionary(const std::string & name) const + { + return std::static_pointer_cast(tryGetLoadable(name)); + } + +protected: + LoadablePtr create(const std::string & name, const Poco::Util::AbstractConfiguration & config, + const std::string & key_in_config) const override; + + friend class StorageSystemDictionaries; + friend class DatabaseDictionary; + +private: + + Context & context; +}; + +} diff --git a/dbms/src/Interpreters/ExternalModelsLoader.cpp b/dbms/src/Interpreters/ExternalModelsLoader.cpp new file mode 100644 index 00000000000..624d4bdc9f6 --- /dev/null +++ b/dbms/src/Interpreters/ExternalModelsLoader.cpp @@ -0,0 +1,46 @@ +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INVALID_CONFIG_PARAMETER; +} + + +ExternalModelsLoader::ExternalModelsLoader( + std::unique_ptr config_repository, + Context & context_) + : ExternalLoader(context_.getConfigRef(), + "external model", + &Logger::get("ExternalModelsLoader")), + context(context_) +{ + addConfigRepository(std::move(config_repository), {"model", "name", "models_config"}); + enablePeriodicUpdates(true); +} + +std::shared_ptr ExternalModelsLoader::create( + const std::string & name, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix) const +{ + String type = config.getString(config_prefix + ".type"); + ExternalLoadableLifetime lifetime(config, config_prefix + ".lifetime"); + + /// TODO: add models factory. + if (type == "catboost") + { + return std::make_unique( + name, config.getString(config_prefix + ".path"), + context.getConfigRef().getString("catboost_dynamic_library_path"), + lifetime + ); + } + else + { + throw Exception("Unknown model type: " + type, ErrorCodes::INVALID_CONFIG_PARAMETER); + } +} + +} diff --git a/dbms/src/Interpreters/ExternalModelsLoader.h b/dbms/src/Interpreters/ExternalModelsLoader.h new file mode 100644 index 00000000000..c6324b3c602 --- /dev/null +++ b/dbms/src/Interpreters/ExternalModelsLoader.h @@ -0,0 +1,40 @@ +#pragma once + +#include +#include +#include +#include + + +namespace DB +{ + +class Context; + +/// Manages user-defined models. +class ExternalModelsLoader : public ExternalLoader +{ +public: + using ModelPtr = std::shared_ptr; + + /// Models will be loaded immediately and then will be updated in separate thread, each 'reload_period' seconds. 
+ ExternalModelsLoader( + std::unique_ptr config_repository, + Context & context_); + + ModelPtr getModel(const std::string & name) const + { + return std::static_pointer_cast(getLoadable(name)); + } + +protected: + LoadablePtr create(const std::string & name, const Poco::Util::AbstractConfiguration & config, + const std::string & key_in_config) const override; + + friend class StorageSystemModels; +private: + + Context & context; +}; + +} From a0e11264eeea26e6fb5d8b816948be68a3ff718f Mon Sep 17 00:00:00 2001 From: Sergei Bocharov Date: Thu, 26 Sep 2019 14:39:06 +0300 Subject: [PATCH 275/309] Fix startsWith function --- .../functions/string_functions.md | 29 ++++++++++++++++-- .../functions/string_functions.md | 30 +++++++++++++++++-- 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/docs/en/query_language/functions/string_functions.md b/docs/en/query_language/functions/string_functions.md index 0f60749d307..89114703530 100644 --- a/docs/en/query_language/functions/string_functions.md +++ b/docs/en/query_language/functions/string_functions.md @@ -152,9 +152,34 @@ Similar to base64Decode, but in case of error an empty string would be returned. Returns whether to end with the specified suffix. Returns 1 if the string ends with the specified suffix, otherwise it returns 0. -## startsWith(s, prefix) {#function-startswith} +## startsWith(str, prefix) {#function-startswith} -Returns whether to start with the specified prefix. Returns 1 if the string starts with the specified prefix, otherwise it returns 0. +Returns 1 whether string starts with the specified prefix, otherwise it returns 0. + +```sql +SELECT startsWith('string', 'str'); +``` + +**Returned values** + +- 1, if the string starts with the specified prefix. +- 0, if the string isn't start with the specified prefix. + +**Example** + +Query: + +```sql +SELECT startsWith('Hello, world!', 'He'); +``` + +Result: + +```text +┌─startsWith('Hello, world!', 'He')─┐ +│ 1 │ +└───────────────────────────────────┘ +``` ## trimLeft(s) diff --git a/docs/ru/query_language/functions/string_functions.md b/docs/ru/query_language/functions/string_functions.md index f514ac1cbd3..e8fcd737c61 100644 --- a/docs/ru/query_language/functions/string_functions.md +++ b/docs/ru/query_language/functions/string_functions.md @@ -124,11 +124,37 @@ SELECT format('{} {}', 'Hello', 'World') Возвращает 1, если строка завершается указанным суффиксом, и 0 в противном случае. -## startsWith(s, prefix) {#function-startswith} +## startsWith(str, prefix) {#function-startswith} -Возвращает 1, если строка начинается указанным префиксом, и 0 в противном случае. +Возвращает 1, если строка начинается указанным префиксом, в противном случае 0. + +```sql +SELECT startsWith('string', 'str'); +``` + +**Возвращаемые значения** + +- 1, если строка начинается указанным префиксом. +- 0, если строка не начинается указанным префиксом. + +**Пример** + +Запрос: + +```sql +SELECT startsWith('Hello, world!', 'He'); +``` + +Ответ: + +```text +┌─startsWith('Hello, world!', 'He')─┐ +│ 1 │ +└───────────────────────────────────┘ +``` ## CRC32(s) + Возвращает чексумму CRC32 данной строки. Тип результата - UInt32. From 259be751eed1cbe31e769d2dd5aa9f76a5538093 Mon Sep 17 00:00:00 2001 From: millb Date: Thu, 26 Sep 2019 15:06:52 +0300 Subject: [PATCH 276/309] Fixed metric name. Add metric description. Add test. 
--- dbms/src/Common/ProfileEvents.cpp | 2 +- .../MergeTree/MergeTreeDataMergerMutator.cpp | 4 ++-- .../01014_count_of_merges_metrics.reference | 1 + .../0_stateless/01014_count_of_merges_metrics.sql | 13 +++++++++++++ 4 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/01014_count_of_merges_metrics.reference create mode 100644 dbms/tests/queries/0_stateless/01014_count_of_merges_metrics.sql diff --git a/dbms/src/Common/ProfileEvents.cpp b/dbms/src/Common/ProfileEvents.cpp index 89e15f73f54..586aa158bdf 100644 --- a/dbms/src/Common/ProfileEvents.cpp +++ b/dbms/src/Common/ProfileEvents.cpp @@ -114,7 +114,7 @@ M(SelectedRanges, "Number of (non-adjacent) ranges in all data parts selected to read from a MergeTree table.") \ M(SelectedMarks, "Number of marks (index granules) selected to read from a MergeTree table.") \ \ - M(CountOfMerges, "") \ + M(Merge, "Number of launches background merges.") \ M(MergedRows, "Rows read for background merges. This is the number of rows before merge.") \ M(MergedUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) that was read for background merges. This is the number before merge.") \ M(MergesTimeMilliseconds, "Total time spent for background merges.")\ diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index abdcd2d0f98..df3720359d3 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -39,7 +39,7 @@ namespace ProfileEvents extern const Event MergedRows; extern const Event MergedUncompressedBytes; extern const Event MergesTimeMilliseconds; - extern const Event CountOfMerges; + extern const Event Merge; } namespace CurrentMetrics @@ -511,7 +511,7 @@ public: if (stage.is_first) { ProfileEvents::increment(ProfileEvents::MergedRows, value.read_rows); - ProfileEvents::increment(ProfileEvents::CountOfMerges); + ProfileEvents::increment(ProfileEvents::Merge); } updateWatch(); diff --git a/dbms/tests/queries/0_stateless/01014_count_of_merges_metrics.reference b/dbms/tests/queries/0_stateless/01014_count_of_merges_metrics.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01014_count_of_merges_metrics.reference @@ -0,0 +1 @@ +1 diff --git a/dbms/tests/queries/0_stateless/01014_count_of_merges_metrics.sql b/dbms/tests/queries/0_stateless/01014_count_of_merges_metrics.sql new file mode 100644 index 00000000000..609722fcf7f --- /dev/null +++ b/dbms/tests/queries/0_stateless/01014_count_of_merges_metrics.sql @@ -0,0 +1,13 @@ +DROP TABLE IF EXISTs new_table_test; +DROP TABLE IF EXISTS check_table_test; + +CREATE TABLE IF NOT EXISTS new_table_test(name String) ENGINE = MergeTree Order By name; +CREATE TABLE IF NOT EXISTS check_table_test(value1 UInt64, value2 UInt64) ENGINE = MergeTree Order By tuple(); +INSERT INTO check_table_test (value1) SELECT value from system.events WHERE event = 'Merge'; +OPTIMIZE TABLE new_table_test FINAL; +INSERT INTO check_table_test (value2) SELECT value from system.events WHERE event = 'Merge'; +SELECT count() FROM check_table_test WHERE value2 > value1; + + +DROP TABLE new_table_test; +DROP TABLE check_table_test; From 5e5be6cbc735b1d664e53d189a34d19cd2f67834 Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Thu, 26 Sep 2019 16:09:13 +0300 Subject: [PATCH 277/309] Change redirect to presentations GitHub pages (#7116) --- 
website/nginx/default.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/nginx/default.conf b/website/nginx/default.conf index 1600225836d..98edad41055 100644 --- a/website/nginx/default.conf +++ b/website/nginx/default.conf @@ -16,7 +16,7 @@ server { rewrite ^/docs/$ https://clickhouse.yandex/docs/en/ permanent; rewrite ^/reference_en.html$ https://clickhouse.yandex/docs/en/single/ permanent; rewrite ^/reference_ru.html$ https://clickhouse.yandex/docs/ru/single/ permanent; - rewrite ^/presentations/(.*)$ https://yandex.github.io/clickhouse-presentations/$1 permanent; + rewrite ^/presentations/(.*)$ https://clickhouse.github.io/clickhouse-presentations/$1 permanent; include /usr/share/nginx/html/docs/redirects.conf; From ea013cf1a2348aeaee95b924450a511d68cf73ed Mon Sep 17 00:00:00 2001 From: millb Date: Thu, 26 Sep 2019 17:44:04 +0300 Subject: [PATCH 278/309] Fixed test --- .../0_stateless/01014_count_of_merges_metrics.sql | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/dbms/tests/queries/0_stateless/01014_count_of_merges_metrics.sql b/dbms/tests/queries/0_stateless/01014_count_of_merges_metrics.sql index 609722fcf7f..85dd8707a90 100644 --- a/dbms/tests/queries/0_stateless/01014_count_of_merges_metrics.sql +++ b/dbms/tests/queries/0_stateless/01014_count_of_merges_metrics.sql @@ -1,11 +1,12 @@ -DROP TABLE IF EXISTs new_table_test; +DROP TABLE IF EXISTS new_table_test; DROP TABLE IF EXISTS check_table_test; -CREATE TABLE IF NOT EXISTS new_table_test(name String) ENGINE = MergeTree Order By name; -CREATE TABLE IF NOT EXISTS check_table_test(value1 UInt64, value2 UInt64) ENGINE = MergeTree Order By tuple(); -INSERT INTO check_table_test (value1) SELECT value from system.events WHERE event = 'Merge'; +CREATE TABLE new_table_test(name String) ENGINE = MergeTree ORDER BY name; +INSERT INTO new_table_test VALUES ('test'); +CREATE TABLE check_table_test(value1 UInt64, value2 UInt64) ENGINE = MergeTree ORDER BY tuple(); +INSERT INTO check_table_test (value1) SELECT value FROM system.events WHERE event = 'Merge'; OPTIMIZE TABLE new_table_test FINAL; -INSERT INTO check_table_test (value2) SELECT value from system.events WHERE event = 'Merge'; +INSERT INTO check_table_test (value2) SELECT value FROM system.events WHERE event = 'Merge'; SELECT count() FROM check_table_test WHERE value2 > value1; From 3e07e144a448d4de25d6d75669a087a7de54a507 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 26 Sep 2019 19:10:22 +0300 Subject: [PATCH 279/309] update docs --- docs/en/interfaces/formats.md | 71 +++++++++++++++++-------- docs/en/operations/settings/settings.md | 2 +- docs/ru/interfaces/formats.md | 57 +++++++++++++------- docs/ru/operations/settings/settings.md | 2 +- 4 files changed, 88 insertions(+), 44 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 67fe9762ffb..dda14eed8cf 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -15,6 +15,7 @@ The supported formats are: | [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | | [CSV](#csv) | ✔ | ✔ | | [CSVWithNames](#csvwithnames) | ✔ | ✔ | +| [CustomSeparated](#format-customseparated) | ✔ | ✔ | | [Values](#data-format-values) | ✔ | ✔ | | [Vertical](#vertical) | ✗ | ✔ | | [JSON](#json) | ✗ | ✔ | @@ -126,14 +127,14 @@ This format is also available under the name `TSVWithNamesAndTypes`. This format allows to specify a custom format string with placeholders for values with specified escaping rule. 
-It uses settings `format_schema`, `format_schema_rows`, `format_schema_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) +It uses settings `format_template_resultset`, `format_template_row`, `format_template_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) -Format string `format_schema_rows` specifies rows format with the following syntax: +Setting `format_template_row` specifies path to file, which contains format string for rows with the following syntax: `delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`, where `delimiter_i` is a delimiter between values (`$` symbol can be escaped as `$$`), - `column_i` is a name of a column whose values are to be selected or inserted (if empty, then column will be skipped), + `column_i` is a name or index of a column whose values are to be selected or inserted (if empty, then column will be skipped), `serializeAs_i` is an escaping rule for the column values. The following escaping rules are supported: - `CSV`, `JSON`, `XML` (similarly to the formats of the same names) @@ -152,14 +153,14 @@ Format string `format_schema_rows` specifies rows format with the following synt `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` - The `format_schema_rows_between_delimiter` setting specifies delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default) + The `format_template_rows_between_delimiter` setting specifies delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default) -Format string `format_schema` has the same syntax as `format_schema_rows` and allows to specify a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names: +Setting `format_template_resultset` specifies path to file, which contains format string for resultset. Format string for resultset has the same syntax as format string for row and allows to specify a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names: - - `data` is the rows with data in `format_schema_rows` format, separated by `format_schema_rows_between_delimiter`. This placeholder must be the first placeholder in the format string. - - `totals` is the row with total values in `format_schema_rows` format (when using WITH TOTALS) - - `min` is the row with minimum values in `format_schema_rows` format (when extremes is set to 1) - - `max` is the row with maximum values in `format_schema_rows` format (when extremes is set to 1) + - `data` is the rows with data in `format_template_row` format, separated by `format_template_rows_between_delimiter`. This placeholder must be the first placeholder in the format string. + - `totals` is the row with total values in `format_template_row` format (when using WITH TOTALS) + - `min` is the row with minimum values in `format_template_row` format (when extremes is set to 1) + - `max` is the row with maximum values in `format_template_row` format (when extremes is set to 1) - `rows` is the total number of output rows - `rows_before_limit` is the minimal number of rows there would have been without LIMIT. Output only if the query contains LIMIT. 
If the query contains GROUP BY, rows_before_limit_at_least is the exact number of rows there would have been without a LIMIT. - `time` is the request execution time in seconds @@ -167,14 +168,17 @@ Format string `format_schema` has the same syntax as `format_schema_rows` and al - `bytes_read` is the number of bytes (uncompressed) have been read The placeholders `data`, `totals`, `min` and `max` must not have escaping rule specified (or `None` must be specified explicitly). The remaining placeholders may have any escaping rule specified. - If the `format_schema` setting is an empty string, `${data}` is used as default value. + If the `format_template_resultset` setting is an empty string, `${data}` is used as default value. For insert queries format allows to skip some columns or some fields if prefix or suffix (see example). - `Select` example: + Select example: ```sql -SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 5 -FORMAT Template -SETTINGS format_schema = ' +SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 5 FORMAT Template SETTINGS +format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format', format_template_rows_between_delimiter = '\n ' +``` +`/some/path/resultset.format`: +``` + Search phrases
  • @@ -186,10 +190,13 @@ SETTINGS format_schema = '
    Search phrases
    Processed ${rows_read:XML} rows in ${time:XML} sec -', -format_schema_rows = ' ${SearchPhrase:XML} ${с:XML} ', -format_schema_rows_between_delimiter = '\n ' + ``` +`/some/path/row.format`: +``` + ${0:XML} ${1:XML} +``` +Result: ```html Search phrases @@ -210,7 +217,7 @@ format_schema_rows_between_delimiter = '\n ' ``` -`Insert` example: +Insert example: ``` Some header Page views: 5, User id: 4324182021466249494, Useless field: hello, Duration: 146, Sign: -1 @@ -219,8 +226,15 @@ Total rows: 2 ``` ```sql INSERT INTO UserActivity FORMAT Template SETTINGS -format_schema = 'Some header\n${data}\nTotal rows: ${:CSV}\n', -format_schema_rows = 'Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV}' +format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format' +``` +`/some/path/resultset.format`: +``` +Some header\n${data}\nTotal rows: ${:CSV}\n +``` +`/some/path/row.format`: +``` +Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV} ``` `PageViews`, `UserID`, `Duration` and `Sign` inside placeholders are names of columns in the table. Values after `Useless field` in rows and after `\nTotal rows: ` in suffix will be ignored. All delimiters in the input data must be strictly equal to delimiters in specified format strings. @@ -232,9 +246,15 @@ Similar to `Template`, but skips whitespace characters between delimiters and v It's possible to read `JSON` using this format, if values of columns have the same order in all rows. For example, the following request can be used for inserting data from output example of format [JSON](#json): ```sql INSERT INTO table_name FORMAT TemplateIgnoreSpaces SETTINGS -format_schema = '{${}"meta"${}:${:JSON},${}"data"${}:${}[${data}]${},${}"totals"${}:${:JSON},${}"extremes"${}:${:JSON},${}"rows"${}:${:JSON},${}"rows_before_limit_at_least"${}:${:JSON}${}}', -format_schema_rows = '{${}"SearchPhrase"${}:${}${phrase:JSON}${},${}"c"${}:${}${cnt:JSON}${}}', -format_schema_rows_between_delimiter = ',' +format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format', format_template_rows_between_delimiter = ',' +``` +`/some/path/resultset.format`: +``` +{${}"meta"${}:${:JSON},${}"data"${}:${}[${data}]${},${}"totals"${}:${:JSON},${}"extremes"${}:${:JSON},${}"rows"${}:${:JSON},${}"rows_before_limit_at_least"${}:${:JSON}${}} +``` +`/some/path/row.format`: +``` +{${}"SearchPhrase"${}:${}${phrase:JSON}${},${}"c"${}:${}${cnt:JSON}${}} ``` ## TSKV {#tskv} @@ -296,6 +316,11 @@ The CSV format supports the output of totals and extremes the same way as `TabSe Also prints the header row, similar to `TabSeparatedWithNames`. +## CustomSeparated {#format-customseparated} + +Similar to [Template](#format-template), but it prints or reads all columns and uses escaping rule from setting `format_custom_escaping_rule` and delimiters from settings `format_custom_field_delimiter`, `format_custom_row_before_delimiter`, `format_custom_row_after_delimiter`, `format_custom_row_between_delimiter`, `format_custom_result_before_delimiter` and `format_custom_result_after_delimiter`, not from format strings. +There is also `CustomSeparatedIgnoreSpaces` format, which is similar to `TemplateIgnoreSpaces`. + ## JSON {#json} Outputs data in JSON format. 
Besides data tables, it also outputs column names and types, along with some additional information: the total number of output rows, and the number of rows that could have been output if there weren't a LIMIT. Example: diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index d4e433803ae..4882fc6c987 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -74,7 +74,7 @@ If `force_primary_key=1`, ClickHouse checks to see if the query has a primary ke ## format_schema -This parameter is useful when you are using formats that require a schema definition, such as [Cap'n Proto](https://capnproto.org/), [Protobuf](https://developers.google.com/protocol-buffers/) or [Template](../../interfaces/formats.md#format-template). The value depends on the format. +This parameter is useful when you are using formats that require a schema definition, such as [Cap'n Proto](https://capnproto.org/) or [Protobuf](https://developers.google.com/protocol-buffers/). The value depends on the format. ## fsync_metadata diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 9acf2d67e4a..b13420827ad 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -14,6 +14,7 @@ ClickHouse может принимать (`INSERT`) и отдавать (`SELECT | [TemplateIgnoreSpaces](#templateignorespaces) | ✔ | ✗ | | [CSV](#csv) | ✔ | ✔ | | [CSVWithNames](#csvwithnames) | ✔ | ✔ | +| [CustomSeparated](#format-customseparated) | ✔ | ✔ | | [Values](#data-format-values) | ✔ | ✔ | | [Vertical](#vertical) | ✗ | ✔ | | [JSON](#json) | ✗ | ✔ | @@ -125,14 +126,14 @@ world Этот формат позволяет указать произвольную форматную строку, в которую подставляются значения, сериализованные выбранным способом. -Для этого используются настройки `format_schema`, `format_schema_rows`, `format_schema_rows_between_delimiter` и настройки экранирования других форматов (например, `output_format_json_quote_64bit_integers` при экранировании как в `JSON`, см. далее) +Для этого используются настройки `format_template_resultset`, `format_template_row`, `format_template_rows_between_delimiter` и настройки экранирования других форматов (например, `output_format_json_quote_64bit_integers` при экранировании как в `JSON`, см. далее) -Форматная строка `format_schema_rows` задаёт формат для строк таблицы и должна иметь вид: +Настройка `format_template_row` задаёт путь к файлу, содержащему форматную строку для строк таблицы, которая должна иметь вид: `delimiter_1${column_1:serializeAs_1}delimiter_2${column_2:serializeAs_2} ... delimiter_N`, где `delimiter_i` - разделители между значениями (символ `$` в разделителе экранируется как `$$`), - `column_i` - имена столбцов, значения которых должны быть выведены или считаны (если имя не указано - столбец пропускается), + `column_i` - имена или номера столбцов, значения которых должны быть выведены или считаны (если имя не указано - столбец пропускается), `serializeAs_i` - тип экранирования для значений соответствующего столбца. Поддерживаются следующие типы экранирования: - `CSV`, `JSON`, `XML` (как в одноимённых форматах) @@ -151,14 +152,14 @@ world `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` - Настройка `format_schema_rows_between_delimiter` задаёт разделитель между строками, который выводится (или ожмдается при вводе) после каждой строки, кроме последней. По умолчанию `\n`. 
+ Настройка `format_template_rows_between_delimiter` задаёт разделитель между строками, который выводится (или ожмдается при вводе) после каждой строки, кроме последней. По умолчанию `\n`. -Форматная строка `format_schema` имеет аналогичный `format_schema_rows` синтаксис и позволяет указать префикс, суффикс и способ вывода дополнительной информации. Вместо имён столбцов в ней указываются следующие имена подстановок: +Настройка `format_template_resultset` задаёт путь к файлу, содержащему форматную строку для результата. Форматная строка для результата имеет синтаксис аналогичный форматной строке для строк таблицы и позволяет указать префикс, суффикс и способ вывода дополнительной информации. Вместо имён столбцов в ней указываются следующие имена подстановок: - - `data` - строки с данными в формате `format_schema_rows`, разделённые `format_schema_rows_between_delimiter`. Эта подстановка должна быть первой подстановкой в форматной строке. - - `totals` - строка с тотальными значениями в формате `format_schema_rows` (при использовании WITH TOTALS) - - `min` - строка с минимальными значениями в формате `format_schema_rows` (при настройке extremes, выставленной в 1) - - `max` - строка с максимальными значениями в формате `format_schema_rows` (при настройке extremes, выставленной в 1) + - `data` - строки с данными в формате `format_template_row`, разделённые `format_template_rows_between_delimiter`. Эта подстановка должна быть первой подстановкой в форматной строке. + - `totals` - строка с тотальными значениями в формате `format_template_row` (при использовании WITH TOTALS) + - `min` - строка с минимальными значениями в формате `format_template_row` (при настройке extremes, выставленной в 1) + - `max` - строка с максимальными значениями в формате `format_template_row` (при настройке extremes, выставленной в 1) - `rows` - общее количество выведенных стрчек - `rows_before_limit` - не менее скольких строчек получилось бы, если бы не было LIMIT-а. Выводится только если запрос содержит LIMIT. В случае, если запрос содержит GROUP BY, `rows_before_limit` - точное число строк, которое получилось бы, если бы не было LIMIT-а. - `time` - время выполнения запроса в секундах @@ -166,15 +167,18 @@ world - `bytes_read` - сколько байт (несжатых) было прочитано при выполнении запроса У подстановок `data`, `totals`, `min` и `max` не должны быть указаны типы экранирования (или должен быть указан `None`). Остальные подстановки - это отдельные значения, для них может быть указан любой тип экранирования. - Если строка `format_schema` пустая, то по-умолчанию используется `${data}`. - Из всех перечисленных подстановок форматная строка `format_schema` для ввода может содержать только `data`. + Если строка `format_template_resultset` пустая, то по-умолчанию используется `${data}`. + Из всех перечисленных подстановок форматная строка `format_template_resultset` для ввода может содержать только `data`. Также при вводе формат поддерживает пропуск значений столбцов и пропуск значений в префиксе и суффиксе (см. пример). 
Пример вывода: ```sql -SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 5 -FORMAT Template -SETTINGS format_schema = ' +SELECT SearchPhrase, count() AS c FROM test.hits GROUP BY SearchPhrase ORDER BY c DESC LIMIT 5 FORMAT Template SETTINGS +format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format', format_template_rows_between_delimiter = '\n ' +``` +`/some/path/resultset.format`: +``` + Search phrases @@ -186,10 +190,13 @@ SETTINGS format_schema = '
    Search phrases
    Processed ${rows_read:XML} rows in ${time:XML} sec -', -format_schema_rows = ' ${SearchPhrase:XML} ${с:XML} ', -format_schema_rows_between_delimiter = '\n ' + ``` +`/some/path/row.format`: +``` + ${0:XML} ${1:XML} +``` +Резутьтат: ```html Search phrases @@ -219,8 +226,15 @@ Total rows: 2 ``` ```sql INSERT INTO UserActivity FORMAT Template SETTINGS -format_schema = 'Some header\n${data}\nTotal rows: ${:CSV}\n', -format_schema_rows = 'Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV}' +format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format' +``` +`/some/path/resultset.format`: +``` +Some header\n${data}\nTotal rows: ${:CSV}\n +``` +`/some/path/row.format`: +``` +Page views: ${PageViews:CSV}, User id: ${UserID:CSV}, Useless field: ${:CSV}, Duration: ${Duration:CSV}, Sign: ${Sign:CSV} ``` `PageViews`, `UserID`, `Duration` и `Sign` внутри подстановок - имена столбцов в таблице, в которую вставляются данные. Значения после `Useless field` в строках и значение после `\nTotal rows: ` в суффиксе будут проигнорированы. Все разделители во входных данных должны строго соответствовать разделителям в форматных строках. @@ -292,6 +306,11 @@ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMA Выводит также заголовок, аналогично `TabSeparatedWithNames`. +## CustomSeparated {#format-customseparated} + +Аналогичен [Template](#format-template), но выводит (или считывает) все столбцы, используя для них правило экранирования из настройки `format_custom_escaping_rule` и разделители из настроек `format_custom_field_delimiter`, `format_custom_row_before_delimiter`, `format_custom_row_after_delimiter`, `format_custom_row_between_delimiter`, `format_custom_result_before_delimiter` и `format_custom_result_after_delimiter`, а не из форматных строк. +Также существует формат `CustomSeparatedIgnoreSpaces`, аналогичный `TemplateIgnoreSpaces`. + ## JSON {#json} Выводит данные в формате JSON. Кроме таблицы с данными, также выводятся имена и типы столбцов, и некоторая дополнительная информация - общее количество выведенных строк, а также количество строк, которое могло бы быть выведено, если бы не было LIMIT-а. Пример: diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 20017e88af4..bbcfd96c069 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -72,7 +72,7 @@ ClickHouse применяет настройку в тех случаях, ко ## format_schema -Параметр применяется в том случае, когда используются форматы, требующие определения схемы, например [Cap'n Proto](https://capnproto.org/), [Protobuf](https://developers.google.com/protocol-buffers/) или [Template](../../interfaces/formats.md#format-template). Значение параметра зависит от формата. +Параметр применяется в том случае, когда используются форматы, требующие определения схемы, например [Cap'n Proto](https://capnproto.org/) или [Protobuf](https://developers.google.com/protocol-buffers/). Значение параметра зависит от формата. 
## fsync_metadata From 0e28dc4e51d8aa9dd66f2bf425316a40c83af204 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 26 Sep 2019 19:12:15 +0300 Subject: [PATCH 280/309] Remove redundant classes --- .../Embedded/GeoDictionariesLoader.h | 11 ++++-- .../Embedded/IGeoDictionariesLoader.h | 28 -------------- dbms/src/Interpreters/Context.cpp | 32 ++++++---------- dbms/src/Interpreters/Context.h | 2 - .../src/Interpreters/EmbeddedDictionaries.cpp | 4 +- dbms/src/Interpreters/EmbeddedDictionaries.h | 6 +-- .../ExternalDictionariesLoader.cpp | 2 +- .../Interpreters/ExternalDictionariesLoader.h | 2 +- dbms/src/Interpreters/ExternalLoader.cpp | 8 ++-- dbms/src/Interpreters/ExternalLoader.h | 4 +- .../ExternalLoaderConfigRepository.h | 20 ++++++---- .../src/Interpreters/ExternalModelsLoader.cpp | 2 +- dbms/src/Interpreters/ExternalModelsLoader.h | 2 +- .../IExternalLoaderConfigRepository.h | 31 --------------- dbms/src/Interpreters/IUsersManager.h | 38 ------------------- .../Interpreters/RuntimeComponentsFactory.h | 37 ------------------ dbms/src/Interpreters/UsersManager.h | 17 +++++---- 17 files changed, 56 insertions(+), 190 deletions(-) delete mode 100644 dbms/src/Dictionaries/Embedded/IGeoDictionariesLoader.h delete mode 100644 dbms/src/Interpreters/IExternalLoaderConfigRepository.h delete mode 100644 dbms/src/Interpreters/IUsersManager.h delete mode 100644 dbms/src/Interpreters/RuntimeComponentsFactory.h diff --git a/dbms/src/Dictionaries/Embedded/GeoDictionariesLoader.h b/dbms/src/Dictionaries/Embedded/GeoDictionariesLoader.h index ed99d0b7ac7..3231a92b724 100644 --- a/dbms/src/Dictionaries/Embedded/GeoDictionariesLoader.h +++ b/dbms/src/Dictionaries/Embedded/GeoDictionariesLoader.h @@ -1,13 +1,16 @@ #pragma once -#include "IGeoDictionariesLoader.h" +#include +#include "RegionsHierarchies.h" +#include "RegionsNames.h" +#include // Default implementation of geo dictionaries loader used by native server application -class GeoDictionariesLoader : public IGeoDictionariesLoader +class GeoDictionariesLoader { public: - std::unique_ptr reloadRegionsHierarchies(const Poco::Util::AbstractConfiguration & config) override; + std::unique_ptr reloadRegionsHierarchies(const Poco::Util::AbstractConfiguration & config); - std::unique_ptr reloadRegionsNames(const Poco::Util::AbstractConfiguration & config) override; + std::unique_ptr reloadRegionsNames(const Poco::Util::AbstractConfiguration & config); }; diff --git a/dbms/src/Dictionaries/Embedded/IGeoDictionariesLoader.h b/dbms/src/Dictionaries/Embedded/IGeoDictionariesLoader.h deleted file mode 100644 index 77aacebab66..00000000000 --- a/dbms/src/Dictionaries/Embedded/IGeoDictionariesLoader.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include -#include "RegionsHierarchies.h" -#include "RegionsNames.h" - -namespace Poco -{ -namespace Util -{ - class AbstractConfiguration; -} - -class Logger; -} - - -// Provides actual versions of geo dictionaries (regions hierarchies, regions names) -// Bind data structures (RegionsHierarchies, RegionsNames) with data providers -class IGeoDictionariesLoader -{ -public: - virtual std::unique_ptr reloadRegionsHierarchies(const Poco::Util::AbstractConfiguration & config) = 0; - - virtual std::unique_ptr reloadRegionsNames(const Poco::Util::AbstractConfiguration & config) = 0; - - virtual ~IGeoDictionariesLoader() {} -}; diff --git a/dbms/src/Interpreters/Context.cpp b/dbms/src/Interpreters/Context.cpp index 92a0fbf5273..40f59af7c3f 100644 --- a/dbms/src/Interpreters/Context.cpp +++ b/dbms/src/Interpreters/Context.cpp @@ 
-25,9 +25,9 @@ #include #include #include -#include -#include +#include #include +#include #include #include #include @@ -97,8 +97,6 @@ struct ContextShared { Logger * log = &Logger::get("Context"); - std::unique_ptr runtime_components_factory; - /// For access of most of shared objects. Recursive mutex. mutable std::recursive_mutex mutex; /// Separate mutex for access of dictionaries. Separate mutex to avoid locks when server doing request to itself. @@ -128,7 +126,7 @@ struct ContextShared mutable std::optional external_models_loader; String default_profile_name; /// Default profile name used for default values. String system_profile_name; /// Profile used by system processes - std::unique_ptr users_manager; /// Known users. + std::unique_ptr users_manager; /// Known users. Quotas quotas; /// Known quotas for resource use. mutable UncompressedCachePtr uncompressed_cache; /// The cache of decompressed blocks. mutable MarkCachePtr mark_cache; /// Cache of marks in compressed files. @@ -210,8 +208,8 @@ struct ContextShared Context::ConfigReloadCallback config_reload_callback; - ContextShared(std::unique_ptr runtime_components_factory_) - : runtime_components_factory(std::move(runtime_components_factory_)), macros(std::make_unique()) + ContextShared() + : macros(std::make_unique()) { /// TODO: make it singleton (?) static std::atomic num_calls{0}; @@ -308,7 +306,7 @@ struct ContextShared private: void initialize() { - users_manager = runtime_components_factory->createUsersManager(); + users_manager = std::make_unique(); } }; @@ -318,17 +316,11 @@ Context::Context(const Context &) = default; Context & Context::operator=(const Context &) = default; -Context Context::createGlobal(std::unique_ptr runtime_components_factory) -{ - Context res; - res.shared = std::make_shared(std::move(runtime_components_factory)); - res.quota = std::make_shared(); - return res; -} - Context Context::createGlobal() { - return createGlobal(std::make_unique()); + Context res; + res.quota = std::make_shared(); + return res; } Context::~Context() = default; @@ -1327,7 +1319,7 @@ const ExternalDictionariesLoader & Context::getExternalDictionariesLoader() cons if (!this->global_context) throw Exception("Logical error: there is no global context", ErrorCodes::LOGICAL_ERROR); - auto config_repository = shared->runtime_components_factory->createExternalDictionariesConfigRepository(); + auto config_repository = std::make_unique(); shared->external_dictionaries_loader.emplace(std::move(config_repository), config, *this->global_context); } return *shared->external_dictionaries_loader; @@ -1347,7 +1339,7 @@ const ExternalModelsLoader & Context::getExternalModelsLoader() const if (!this->global_context) throw Exception("Logical error: there is no global context", ErrorCodes::LOGICAL_ERROR); - auto config_repository = shared->runtime_components_factory->createExternalModelsConfigRepository(); + auto config_repository = std::make_unique(); shared->external_models_loader.emplace(std::move(config_repository), *this->global_context); } return *shared->external_models_loader; @@ -1365,7 +1357,7 @@ EmbeddedDictionaries & Context::getEmbeddedDictionariesImpl(const bool throw_on_ if (!shared->embedded_dictionaries) { - auto geo_dictionaries_loader = shared->runtime_components_factory->createGeoDictionariesLoader(); + auto geo_dictionaries_loader = std::make_unique(); shared->embedded_dictionaries.emplace( std::move(geo_dictionaries_loader), diff --git a/dbms/src/Interpreters/Context.h b/dbms/src/Interpreters/Context.h index 
4dd70a55e05..07de8312647 100644 --- a/dbms/src/Interpreters/Context.h +++ b/dbms/src/Interpreters/Context.h @@ -43,7 +43,6 @@ namespace DB struct ContextShared; class Context; -class RuntimeComponentsFactory; class QuotaForIntervals; class EmbeddedDictionaries; class ExternalDictionariesLoader; @@ -174,7 +173,6 @@ private: public: /// Create initial Context with ContextShared and etc. - static Context createGlobal(std::unique_ptr runtime_components_factory); static Context createGlobal(); Context(const Context &); diff --git a/dbms/src/Interpreters/EmbeddedDictionaries.cpp b/dbms/src/Interpreters/EmbeddedDictionaries.cpp index 4dd9f50b82c..c73850073cd 100644 --- a/dbms/src/Interpreters/EmbeddedDictionaries.cpp +++ b/dbms/src/Interpreters/EmbeddedDictionaries.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include @@ -121,7 +121,7 @@ void EmbeddedDictionaries::reloadPeriodically() EmbeddedDictionaries::EmbeddedDictionaries( - std::unique_ptr geo_dictionaries_loader_, + std::unique_ptr geo_dictionaries_loader_, Context & context_, const bool throw_on_error) : log(&Logger::get("EmbeddedDictionaries")) diff --git a/dbms/src/Interpreters/EmbeddedDictionaries.h b/dbms/src/Interpreters/EmbeddedDictionaries.h index 56abfe12aaa..69b21d4b39a 100644 --- a/dbms/src/Interpreters/EmbeddedDictionaries.h +++ b/dbms/src/Interpreters/EmbeddedDictionaries.h @@ -11,7 +11,7 @@ namespace Poco { class Logger; namespace Util { class AbstractConfiguration; } } class RegionsHierarchies; class RegionsNames; -class IGeoDictionariesLoader; +class GeoDictionariesLoader; namespace DB @@ -31,7 +31,7 @@ private: MultiVersion regions_hierarchies; MultiVersion regions_names; - std::unique_ptr geo_dictionaries_loader; + std::unique_ptr geo_dictionaries_loader; /// Directories' updating periodicity (in seconds). int reload_period; @@ -68,7 +68,7 @@ private: public: /// Every reload_period seconds directories are updated inside a separate thread. EmbeddedDictionaries( - std::unique_ptr geo_dictionaries_loader, + std::unique_ptr geo_dictionaries_loader, Context & context, const bool throw_on_error); diff --git a/dbms/src/Interpreters/ExternalDictionariesLoader.cpp b/dbms/src/Interpreters/ExternalDictionariesLoader.cpp index 4cf05491a04..a9f4cd3bd81 100644 --- a/dbms/src/Interpreters/ExternalDictionariesLoader.cpp +++ b/dbms/src/Interpreters/ExternalDictionariesLoader.cpp @@ -7,7 +7,7 @@ namespace DB /// Must not acquire Context lock in constructor to avoid possibility of deadlocks. ExternalDictionariesLoader::ExternalDictionariesLoader( - std::unique_ptr config_repository, + std::unique_ptr config_repository, const Poco::Util::AbstractConfiguration & config, Context & context_) : ExternalLoader(config, diff --git a/dbms/src/Interpreters/ExternalDictionariesLoader.h b/dbms/src/Interpreters/ExternalDictionariesLoader.h index f815e2e6945..e2f53d1cb53 100644 --- a/dbms/src/Interpreters/ExternalDictionariesLoader.h +++ b/dbms/src/Interpreters/ExternalDictionariesLoader.h @@ -19,7 +19,7 @@ public: /// Dictionaries will be loaded immediately and then will be updated in separate thread, each 'reload_period' seconds. 
ExternalDictionariesLoader( - std::unique_ptr config_repository, + std::unique_ptr config_repository, const Poco::Util::AbstractConfiguration & config, Context & context_); diff --git a/dbms/src/Interpreters/ExternalLoader.cpp b/dbms/src/Interpreters/ExternalLoader.cpp index fb6464fb217..9fe7120fd17 100644 --- a/dbms/src/Interpreters/ExternalLoader.cpp +++ b/dbms/src/Interpreters/ExternalLoader.cpp @@ -42,7 +42,7 @@ public: } ~ConfigFilesReader() = default; - void addConfigRepository(std::unique_ptr repository, const ExternalLoaderConfigSettings & settings) + void addConfigRepository(std::unique_ptr repository, const ExternalLoaderConfigSettings & settings) { std::lock_guard lock{mutex}; repositories.emplace_back(std::move(repository), std::move(settings)); @@ -140,7 +140,7 @@ private: } bool readFileInfo( - IExternalLoaderConfigRepository & repository, + ExternalLoaderConfigRepository & repository, const String & path, const ExternalLoaderConfigSettings & settings, bool ignore_last_modification_time, @@ -205,7 +205,7 @@ private: Logger * log; std::mutex mutex; - std::vector, ExternalLoaderConfigSettings>> repositories; + std::vector, ExternalLoaderConfigSettings>> repositories; ObjectConfigs configs; std::unordered_map file_infos; }; @@ -1031,7 +1031,7 @@ ExternalLoader::ExternalLoader(const Poco::Util::AbstractConfiguration & main_co ExternalLoader::~ExternalLoader() = default; void ExternalLoader::addConfigRepository( - std::unique_ptr config_repository, const ExternalLoaderConfigSettings & config_settings) + std::unique_ptr config_repository, const ExternalLoaderConfigSettings & config_settings) { config_files_reader->addConfigRepository(std::move(config_repository), config_settings); loading_dispatcher->setConfiguration(config_files_reader->read()); diff --git a/dbms/src/Interpreters/ExternalLoader.h b/dbms/src/Interpreters/ExternalLoader.h index 4976c28d8e6..35e31dc613c 100644 --- a/dbms/src/Interpreters/ExternalLoader.h +++ b/dbms/src/Interpreters/ExternalLoader.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include @@ -83,7 +83,7 @@ public: /// Adds a repository which will be used to read configurations from. void addConfigRepository( - std::unique_ptr config_repository, const ExternalLoaderConfigSettings & config_settings); + std::unique_ptr config_repository, const ExternalLoaderConfigSettings & config_settings); /// Sets whether all the objects from the configuration should be always loaded (even those which are never used). void enableAlwaysLoadEverything(bool enable); diff --git a/dbms/src/Interpreters/ExternalLoaderConfigRepository.h b/dbms/src/Interpreters/ExternalLoaderConfigRepository.h index a1b1606dd2c..06280803a43 100644 --- a/dbms/src/Interpreters/ExternalLoaderConfigRepository.h +++ b/dbms/src/Interpreters/ExternalLoaderConfigRepository.h @@ -1,25 +1,31 @@ #pragma once -#include +#include +#include +#include + +#include +#include namespace DB { -/** Default implementation of config repository used by native server application. +/** Config repository used by native server application. * Represents files in local filesystem. 
*/ -class ExternalLoaderConfigRepository : public IExternalLoaderConfigRepository +class ExternalLoaderConfigRepository { public: + using Files = std::set; Files list( const Poco::Util::AbstractConfiguration & config, - const std::string & path_key) const override; + const std::string & path_key) const; - bool exists(const std::string & config_file) const override; + bool exists(const std::string & config_file) const; - Poco::Timestamp getLastModificationTime(const std::string & config_file) const override; + Poco::Timestamp getLastModificationTime(const std::string & config_file) const; - Poco::AutoPtr load(const std::string & config_file, const std::string & preprocessed_dir = "") const override; + Poco::AutoPtr load(const std::string & config_file, const std::string & preprocessed_dir = "") const; }; } diff --git a/dbms/src/Interpreters/ExternalModelsLoader.cpp b/dbms/src/Interpreters/ExternalModelsLoader.cpp index 624d4bdc9f6..77ee6147524 100644 --- a/dbms/src/Interpreters/ExternalModelsLoader.cpp +++ b/dbms/src/Interpreters/ExternalModelsLoader.cpp @@ -11,7 +11,7 @@ namespace ErrorCodes ExternalModelsLoader::ExternalModelsLoader( - std::unique_ptr config_repository, + std::unique_ptr config_repository, Context & context_) : ExternalLoader(context_.getConfigRef(), "external model", diff --git a/dbms/src/Interpreters/ExternalModelsLoader.h b/dbms/src/Interpreters/ExternalModelsLoader.h index c6324b3c602..d1523fabf29 100644 --- a/dbms/src/Interpreters/ExternalModelsLoader.h +++ b/dbms/src/Interpreters/ExternalModelsLoader.h @@ -19,7 +19,7 @@ public: /// Models will be loaded immediately and then will be updated in separate thread, each 'reload_period' seconds. ExternalModelsLoader( - std::unique_ptr config_repository, + std::unique_ptr config_repository, Context & context_); ModelPtr getModel(const std::string & name) const diff --git a/dbms/src/Interpreters/IExternalLoaderConfigRepository.h b/dbms/src/Interpreters/IExternalLoaderConfigRepository.h deleted file mode 100644 index d0caaf1b921..00000000000 --- a/dbms/src/Interpreters/IExternalLoaderConfigRepository.h +++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -#include -#include -#include - -#include -#include - -namespace DB -{ - -/** Repository with configurations of user-defined objects (dictionaries, models). - * Used by ExternalLoader. 
- */ -class IExternalLoaderConfigRepository -{ -public: - using Files = std::set; - virtual Files list(const Poco::Util::AbstractConfiguration & config, const std::string & path_key) const = 0; - - virtual bool exists(const std::string & config_file) const = 0; - - virtual Poco::Timestamp getLastModificationTime(const std::string & config_file) const = 0; - - virtual Poco::AutoPtr load(const std::string & config_file, const std::string & preprocessed_dir = "") const = 0; - - virtual ~IExternalLoaderConfigRepository() {} -}; - -} diff --git a/dbms/src/Interpreters/IUsersManager.h b/dbms/src/Interpreters/IUsersManager.h deleted file mode 100644 index 5cfaa413975..00000000000 --- a/dbms/src/Interpreters/IUsersManager.h +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - -/** Duties of users manager: - * 1) Authenticate users - * 2) Provide user settings (profile, quota, ACLs) - * 3) Grant access to databases - */ -class IUsersManager -{ -public: - using UserPtr = std::shared_ptr; - - virtual ~IUsersManager() = default; - - virtual void loadFromConfig(const Poco::Util::AbstractConfiguration & config) = 0; - - /// Find user and make authorize checks - virtual UserPtr authorizeAndGetUser( - const String & user_name, - const String & password, - const Poco::Net::IPAddress & address) const = 0; - - /// Just find user - virtual UserPtr getUser(const String & user_name) const = 0; - - /// Check if the user has access to the database. - virtual bool hasAccessToDatabase(const String & user_name, const String & database_name) const = 0; - - // Check if the user has access to the dictionary - virtual bool hasAccessToDictionary(const String & user_name, const String & dictionary_name) const = 0; -}; - -} diff --git a/dbms/src/Interpreters/RuntimeComponentsFactory.h b/dbms/src/Interpreters/RuntimeComponentsFactory.h deleted file mode 100644 index e2b8310dd8c..00000000000 --- a/dbms/src/Interpreters/RuntimeComponentsFactory.h +++ /dev/null @@ -1,37 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace DB -{ - -/** Default implementation of runtime components factory - * used by native server application. - */ -class RuntimeComponentsFactory -{ -public: - std::unique_ptr createUsersManager() - { - return std::make_unique(); - } - - std::unique_ptr createGeoDictionariesLoader() - { - return std::make_unique(); - } - - std::unique_ptr createExternalDictionariesConfigRepository() - { - return std::make_unique(); - } - - std::unique_ptr createExternalModelsConfigRepository() - { - return std::make_unique(); - } -}; - -} diff --git a/dbms/src/Interpreters/UsersManager.h b/dbms/src/Interpreters/UsersManager.h index 94d35a8231e..17cdc62c209 100644 --- a/dbms/src/Interpreters/UsersManager.h +++ b/dbms/src/Interpreters/UsersManager.h @@ -1,8 +1,7 @@ #pragma once -#include - #include +#include namespace DB { @@ -10,20 +9,22 @@ namespace DB /** Default implementation of users manager used by native server application. * Manages fixed set of users listed in 'Users' configuration file. 
*/ -class UsersManager : public IUsersManager +class UsersManager { public: - void loadFromConfig(const Poco::Util::AbstractConfiguration & config) override; + using UserPtr = std::shared_ptr; + + void loadFromConfig(const Poco::Util::AbstractConfiguration & config); UserPtr authorizeAndGetUser( const String & user_name, const String & password, - const Poco::Net::IPAddress & address) const override; + const Poco::Net::IPAddress & address) const; - UserPtr getUser(const String & user_name) const override; + UserPtr getUser(const String & user_name) const; - bool hasAccessToDatabase(const String & user_name, const String & database_name) const override; - bool hasAccessToDictionary(const String & user_name, const String & dictionary_name) const override; + bool hasAccessToDatabase(const String & user_name, const String & database_name) const; + bool hasAccessToDictionary(const String & user_name, const String & dictionary_name) const; private: using Container = std::map; From bba3a96ff48b740f79c7c6e61ee2a3fe99def334 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 26 Sep 2019 20:24:55 +0300 Subject: [PATCH 281/309] Remove redundant argument --- dbms/src/Interpreters/ExternalLoader.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/dbms/src/Interpreters/ExternalLoader.cpp b/dbms/src/Interpreters/ExternalLoader.cpp index 9fe7120fd17..e3a764abd2f 100644 --- a/dbms/src/Interpreters/ExternalLoader.cpp +++ b/dbms/src/Interpreters/ExternalLoader.cpp @@ -224,18 +224,13 @@ public: /// Called after loading/reloading an object to calculate the time of the next update. using CalculateNextUpdateTimeFunction = std::function; - /// Called on the time of each update to decide if we should reload an object. - using IsObjectModifiedFunction = std::function; - LoadingDispatcher( const CreateObjectFunction & create_object_function_, const CalculateNextUpdateTimeFunction & calculate_next_update_time_function_, - const IsObjectModifiedFunction & is_object_modified_function_, const String & type_name_, Logger * log_) : create_object(create_object_function_) , calculate_next_update_time(calculate_next_update_time_function_) - , is_object_modified(is_object_modified_function_) , type_name(type_name_) , log(log_) { @@ -556,7 +551,7 @@ public: { try { - is_modified_flag = is_object_modified(object); + is_modified_flag = object->isModified(); } catch (...) { @@ -914,7 +909,6 @@ private: const CreateObjectFunction create_object; const CalculateNextUpdateTimeFunction calculate_next_update_time; - const IsObjectModifiedFunction is_object_modified; const String type_name; Logger * log; @@ -1020,7 +1014,6 @@ ExternalLoader::ExternalLoader(const Poco::Util::AbstractConfiguration & main_co , loading_dispatcher(std::make_unique( std::bind(&ExternalLoader::createObject, this, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4), std::bind(&ExternalLoader::calculateNextUpdateTime, this, std::placeholders::_1, std::placeholders::_2), - std::bind(&IExternalLoadable::isModified, std::placeholders::_1), type_name_, log)) , periodic_updater(std::make_unique(*config_files_reader, *loading_dispatcher)) From 38f95e9fbc5cd855cc46d9f7b7cd4b5f0cbffd66 Mon Sep 17 00:00:00 2001 From: BayoNet Date: Thu, 26 Sep 2019 21:24:47 +0300 Subject: [PATCH 282/309] DOCAPI-8299: Added a link with metrics for system.metrics (#7110) * Typo fix. * Links fix. * Fixed links in docs. * More fixes. * Link fixes. * DOCAPI-8299: Reference to source code. 
--- docs/en/operations/system_tables.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/en/operations/system_tables.md b/docs/en/operations/system_tables.md index 5eae7ecd544..36008cffdc6 100644 --- a/docs/en/operations/system_tables.md +++ b/docs/en/operations/system_tables.md @@ -251,6 +251,8 @@ Columns: - `value` ([Int64](../data_types/int_uint.md)) — Metric value. - `description` ([String](../data_types/string.md)) — Metric description. +The list of supported metrics you can find in the [dbms/src/Common/CurrentMetrics.cpp](https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Common/CurrentMetrics.cpp) source file of ClickHouse. + **Example** ```sql From 87a6d3b222e274d2703b9c7adc838416fe589645 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 26 Sep 2019 21:49:30 +0300 Subject: [PATCH 283/309] Update ProfileEvents.cpp --- dbms/src/Common/ProfileEvents.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Common/ProfileEvents.cpp b/dbms/src/Common/ProfileEvents.cpp index 586aa158bdf..6cbbc07d8d8 100644 --- a/dbms/src/Common/ProfileEvents.cpp +++ b/dbms/src/Common/ProfileEvents.cpp @@ -114,7 +114,7 @@ M(SelectedRanges, "Number of (non-adjacent) ranges in all data parts selected to read from a MergeTree table.") \ M(SelectedMarks, "Number of marks (index granules) selected to read from a MergeTree table.") \ \ - M(Merge, "Number of launches background merges.") \ + M(Merge, "Number of launched background merges.") \ M(MergedRows, "Rows read for background merges. This is the number of rows before merge.") \ M(MergedUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) that was read for background merges. This is the number before merge.") \ M(MergesTimeMilliseconds, "Total time spent for background merges.")\ From 004455a02952d5cacf91d34a89e72c2dfde47948 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 27 Sep 2019 00:14:52 +0300 Subject: [PATCH 284/309] fixes --- dbms/src/Core/Settings.h | 6 +++--- dbms/src/Formats/ParsedTemplateFormatString.cpp | 9 ++++----- .../queries/0_stateless/01014_format_custom_separated.sh | 1 + 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dbms/src/Core/Settings.h b/dbms/src/Core/Settings.h index 84ce0f0469a..e2bc24059dd 100644 --- a/dbms/src/Core/Settings.h +++ b/dbms/src/Core/Settings.h @@ -225,11 +225,11 @@ struct Settings : public SettingsCollection M(SettingString, format_template_row, "", "Path to file which contains format string for rows (for Template format)") \ M(SettingString, format_template_rows_between_delimiter, "\n", "Delimiter between rows (for Template format)") \ \ - M(SettingString, format_custom_escaping_rule, "", "Field escaping rule (for CustomSeparated format)") \ + M(SettingString, format_custom_escaping_rule, "Escaped", "Field escaping rule (for CustomSeparated format)") \ M(SettingString, format_custom_field_delimiter, "\t", "Delimiter between fields (for CustomSeparated format)") \ M(SettingString, format_custom_row_before_delimiter, "", "Delimiter before field of the first column (for CustomSeparated format)") \ - M(SettingString, format_custom_row_after_delimiter, "", "Delimiter after field of the last column (for CustomSeparated format)") \ - M(SettingString, format_custom_row_between_delimiter, "\n", "Delimiter between rows (for CustomSeparated format)") \ + M(SettingString, format_custom_row_after_delimiter, "\n", "Delimiter after field of the last column (for CustomSeparated format)") \ + M(SettingString, 
format_custom_row_between_delimiter, "", "Delimiter between rows (for CustomSeparated format)") \ M(SettingString, format_custom_result_before_delimiter, "", "Prefix before result set (for CustomSeparated format)") \ M(SettingString, format_custom_result_after_delimiter, "", "Suffix after result set (for CustomSeparated format)") \ \ diff --git a/dbms/src/Formats/ParsedTemplateFormatString.cpp b/dbms/src/Formats/ParsedTemplateFormatString.cpp index e02b972345e..d6773970c0c 100644 --- a/dbms/src/Formats/ParsedTemplateFormatString.cpp +++ b/dbms/src/Formats/ParsedTemplateFormatString.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include #include @@ -19,10 +18,10 @@ ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & { try { - ReadBufferFromFile schema_file(schema.absoluteSchemaPath()); - WriteBufferFromOwnString format_string; - copyData(schema_file, format_string); - parse(format_string.str(), idx_by_name); + ReadBufferFromFile schema_file(schema.absoluteSchemaPath(), 4096); + String format_string; + readStringUntilEOF(format_string, schema_file); + parse(format_string, idx_by_name); } catch (DB::Exception & e) { diff --git a/dbms/tests/queries/0_stateless/01014_format_custom_separated.sh b/dbms/tests/queries/0_stateless/01014_format_custom_separated.sh index 672e3686bc2..8880c9f86c0 100755 --- a/dbms/tests/queries/0_stateless/01014_format_custom_separated.sh +++ b/dbms/tests/queries/0_stateless/01014_format_custom_separated.sh @@ -12,6 +12,7 @@ format_custom_escaping_rule = 'CSV', \ format_custom_field_delimiter = '\t|\t', \ format_custom_row_before_delimiter = '||', \ format_custom_row_after_delimiter = '\t||', \ +format_custom_row_between_delimiter = '\n', \ format_custom_result_before_delimiter = '========== result ==========\n', \ format_custom_result_after_delimiter = '\n============================\n'" From b1182cdc3dc286a44d3a0ae00a7abdeda3eb9c50 Mon Sep 17 00:00:00 2001 From: Guillaume Tassery Date: Fri, 27 Sep 2019 05:15:03 +0200 Subject: [PATCH 285/309] uupdate tests --- .../http_server.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/http_server.py b/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/http_server.py index d8e4865df81..5eb1d3cca64 100644 --- a/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/http_server.py +++ b/dbms/tests/integration/test_dictionaries_all_layouts_and_sources/http_server.py @@ -9,9 +9,9 @@ import csv # Decorator used to see if authentification works for external dictionary who use a HTTP source. 
def check_auth(fn): def wrapper(req): - auth_header = self.headers.get('Authorization', None) - api_key = self.headers.get('api-key', None) - if not auth_header or auth_header != 'Zm9vOmJhcg==' or not api_key or api_key != 'secret': + auth_header = req.headers.get('authorization', None) + api_key = req.headers.get('api-key', None) + if not auth_header or auth_header != 'Basic Zm9vOmJhcg==' or not api_key or api_key != 'secret': req.send_response(401) else: fn(req) From 506dbd49cb40715ec9a018b0634a20ceee5a2253 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 27 Sep 2019 12:02:06 +0300 Subject: [PATCH 286/309] Add missed row --- dbms/src/Interpreters/Context.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dbms/src/Interpreters/Context.cpp b/dbms/src/Interpreters/Context.cpp index 40f59af7c3f..6e0fe3ba3ec 100644 --- a/dbms/src/Interpreters/Context.cpp +++ b/dbms/src/Interpreters/Context.cpp @@ -320,6 +320,7 @@ Context Context::createGlobal() { Context res; res.quota = std::make_shared(); + res.shared = std::make_shared(); return res; } From 4cb53093a0aede40132dd4d3d40e5827a5c9b923 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Fri, 27 Sep 2019 20:19:26 +0800 Subject: [PATCH 287/309] Get rid of malloc symbols in libcommon (#7065) --- CMakeLists.txt | 27 +++++++++++++++++++++--- dbms/CMakeLists.txt | 16 +++++--------- dbms/programs/CMakeLists.txt | 6 +++--- dbms/src/Client/tests/CMakeLists.txt | 2 +- dbms/src/Processors/tests/CMakeLists.txt | 14 ++++++------ libs/libcommon/CMakeLists.txt | 24 --------------------- libs/libcommon/src/tests/CMakeLists.txt | 18 ++++++++-------- libs/libmysqlxx/src/tests/CMakeLists.txt | 2 +- 8 files changed, 50 insertions(+), 59 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bb7387ddeb0..a5460cd1c76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,3 @@ -project(ClickHouse) cmake_minimum_required(VERSION 3.3) foreach(policy @@ -13,6 +12,7 @@ foreach(policy endif() endforeach() +project(ClickHouse) include (cmake/target.cmake) # Ignore export() since we don't use it, @@ -348,7 +348,7 @@ include (libs/libcommon/cmake/find_jemalloc.cmake) include (libs/libcommon/cmake/find_cctz.cmake) include (libs/libmysqlxx/cmake/find_mysqlclient.cmake) -# When testing for memory leaks with Valgrind, dont link tcmalloc or jemalloc. +# When testing for memory leaks with Valgrind, don't link tcmalloc or jemalloc. if (USE_JEMALLOC) message (STATUS "Link jemalloc: ${JEMALLOC_LIBRARIES}") @@ -367,7 +367,7 @@ elseif (USE_TCMALLOC) endif () elseif (SANITIZE) message (STATUS "Will use ${SANITIZE} sanitizer.") -else () +elseif (OS_LINUX) message (WARNING "Non default allocator is disabled. 
This is not recommended for production Linux builds.") endif () @@ -376,8 +376,29 @@ include (cmake/print_flags.cmake) install (EXPORT global DESTINATION cmake) add_subdirectory (contrib EXCLUDE_FROM_ALL) + +macro (add_executable target) + # invoke built-in add_executable + _add_executable (${ARGV}) + get_target_property (type ${target} TYPE) + if (${type} STREQUAL EXECUTABLE) + set_property (GLOBAL APPEND PROPERTY CLICKHOUSE_EXECUTABLES ${target}) + endif() +endmacro() + +set_property (GLOBAL PROPERTY CLICKHOUSE_EXECUTABLES "") add_subdirectory (libs) add_subdirectory (utils) +get_property (executables GLOBAL PROPERTY CLICKHOUSE_EXECUTABLES) +foreach (executable ${executables}) + target_link_libraries (${executable} PRIVATE ${MALLOC_LIBRARIES}) +endforeach () + +set_property (GLOBAL PROPERTY CLICKHOUSE_EXECUTABLES "") add_subdirectory (dbms) +get_property (executables GLOBAL PROPERTY CLICKHOUSE_EXECUTABLES) +foreach (executable ${executables}) + target_link_libraries (${executable} PRIVATE clickhouse_new_delete ${MALLOC_LIBRARIES}) +endforeach () include (cmake/print_include_directories.cmake) diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index 22a3111a70e..30e73815bc7 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -100,6 +100,7 @@ set(dbms_sources) add_headers_and_sources(clickhouse_common_io src/Common) add_headers_and_sources(clickhouse_common_io src/Common/HashTable) add_headers_and_sources(clickhouse_common_io src/IO) +list (REMOVE_ITEM clickhouse_common_io_sources src/Common/new_delete.cpp) if(USE_RDKAFKA) add_headers_and_sources(dbms src/Storages/Kafka) @@ -139,6 +140,9 @@ endif () add_library(clickhouse_common_io ${clickhouse_common_io_headers} ${clickhouse_common_io_sources}) +add_library (clickhouse_new_delete STATIC src/Common/new_delete.cpp) +target_link_libraries (clickhouse_new_delete clickhouse_common_io) + if (OS_FREEBSD) target_compile_definitions (clickhouse_common_io PUBLIC CLOCK_MONOTONIC_COARSE=CLOCK_MONOTONIC_FAST) endif () @@ -419,17 +423,7 @@ endif() if (USE_JEMALLOC) dbms_target_include_directories (SYSTEM BEFORE PRIVATE ${JEMALLOC_INCLUDE_DIR}) # used in Interpreters/AsynchronousMetrics.cpp - target_include_directories (clickhouse_common_io SYSTEM BEFORE PRIVATE ${JEMALLOC_INCLUDE_DIR}) # new_delete.cpp - # common/memory.h - if (MAKE_STATIC_LIBRARIES OR NOT SPLIT_SHARED_LIBRARIES) - # skip if we have bundled build, since jemalloc is static in this case - elseif (${JEMALLOC_LIBRARIES} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$") - # if the library is static we do not need to link with it, - # since in this case it will be in libs/libcommon, - # and we do not want to link with jemalloc multiple times. 
- else() - target_link_libraries(clickhouse_common_io PRIVATE ${JEMALLOC_LIBRARIES}) - endif() + target_include_directories (clickhouse_new_delete SYSTEM BEFORE PRIVATE ${JEMALLOC_INCLUDE_DIR}) endif () dbms_target_include_directories (PUBLIC ${DBMS_INCLUDE_DIR} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src/Formats/include) diff --git a/dbms/programs/CMakeLists.txt b/dbms/programs/CMakeLists.txt index bac3269468e..138321360f3 100644 --- a/dbms/programs/CMakeLists.txt +++ b/dbms/programs/CMakeLists.txt @@ -24,9 +24,9 @@ configure_file (config_tools.h.in ${CMAKE_CURRENT_BINARY_DIR}/config_tools.h) macro(clickhouse_target_link_split_lib target name) if(NOT CLICKHOUSE_ONE_SHARED) - target_link_libraries(${target} PRIVATE clickhouse-${name}-lib ${MALLOC_LIBRARIES}) + target_link_libraries(${target} PRIVATE clickhouse-${name}-lib) else() - target_link_libraries(${target} PRIVATE clickhouse-lib ${MALLOC_LIBRARIES}) + target_link_libraries(${target} PRIVATE clickhouse-lib) endif() endmacro() @@ -111,7 +111,7 @@ if (CLICKHOUSE_SPLIT_BINARY) install(PROGRAMS clickhouse-split-helper DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME clickhouse COMPONENT clickhouse) else () add_executable (clickhouse main.cpp) - target_link_libraries (clickhouse PRIVATE clickhouse_common_io string_utils ${MALLOC_LIBRARIES}) + target_link_libraries (clickhouse PRIVATE clickhouse_common_io string_utils) target_include_directories (clickhouse BEFORE PRIVATE ${COMMON_INCLUDE_DIR}) target_include_directories (clickhouse PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/dbms/src/Client/tests/CMakeLists.txt b/dbms/src/Client/tests/CMakeLists.txt index f4471136a8a..d952c006bb5 100644 --- a/dbms/src/Client/tests/CMakeLists.txt +++ b/dbms/src/Client/tests/CMakeLists.txt @@ -1,2 +1,2 @@ add_executable(test-connect test_connect.cpp) -target_link_libraries (test-connect dbms) +target_link_libraries (test-connect PRIVATE dbms) diff --git a/dbms/src/Processors/tests/CMakeLists.txt b/dbms/src/Processors/tests/CMakeLists.txt index 5f44ec2a8fd..4ddb6c68416 100644 --- a/dbms/src/Processors/tests/CMakeLists.txt +++ b/dbms/src/Processors/tests/CMakeLists.txt @@ -6,10 +6,10 @@ add_executable (processors_test_merge_sorting_transform processors_test_merge_so add_executable (processors_test_expand_pipeline processors_test_expand_pipeline.cpp) add_executable (processors_test_aggregation processors_test_aggregation.cpp) -target_link_libraries (processors_test dbms) -target_link_libraries (processors_test_chain dbms) -target_link_libraries (processors_test_merge dbms) -target_link_libraries (processors_test_expand_pipeline dbms) -target_link_libraries (processors_test_merging_sorted_transform dbms) -target_link_libraries (processors_test_merge_sorting_transform dbms) -target_link_libraries (processors_test_aggregation dbms clickhouse_aggregate_functions) +target_link_libraries (processors_test PRIVATE dbms) +target_link_libraries (processors_test_chain PRIVATE dbms) +target_link_libraries (processors_test_merge PRIVATE dbms) +target_link_libraries (processors_test_expand_pipeline PRIVATE dbms) +target_link_libraries (processors_test_merging_sorted_transform PRIVATE dbms) +target_link_libraries (processors_test_merge_sorting_transform PRIVATE dbms) +target_link_libraries (processors_test_aggregation PRIVATE dbms clickhouse_aggregate_functions) diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index 62c64a9bdb0..357e457b240 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -65,29 +65,6 @@ add_library 
(common ${CONFIG_COMMON}) -# When testing for memory leaks with Valgrind, dont link tcmalloc or jemalloc. - -if (USE_JEMALLOC) - message (STATUS "Link jemalloc: ${JEMALLOC_LIBRARIES}") - set (MALLOC_LIBRARIES ${JEMALLOC_LIBRARIES}) -elseif (USE_TCMALLOC) - if (DEBUG_TCMALLOC AND NOT GPERFTOOLS_TCMALLOC_MINIMAL_DEBUG) - message (FATAL_ERROR "Requested DEBUG_TCMALLOC but debug library is not found. You should install Google Perftools. Example: sudo apt-get install libgoogle-perftools-dev") - endif () - - if (DEBUG_TCMALLOC AND GPERFTOOLS_TCMALLOC_MINIMAL_DEBUG) - message (STATUS "Link libtcmalloc_minimal_debug for testing: ${GPERFTOOLS_TCMALLOC_MINIMAL_DEBUG}") - set (MALLOC_LIBRARIES ${GPERFTOOLS_TCMALLOC_MINIMAL_DEBUG}) - else () - message (STATUS "Link libtcmalloc_minimal: ${GPERFTOOLS_TCMALLOC_MINIMAL}") - set (MALLOC_LIBRARIES ${GPERFTOOLS_TCMALLOC_MINIMAL}) - endif () -elseif (SANITIZE) - message (STATUS "Will use ${SANITIZE} sanitizer.") -elseif (OS_LINUX) - message (WARNING "Non default allocator is disabled. This is not recommended for production Linux builds.") -endif () - if (USE_INTERNAL_MEMCPY) set (MEMCPY_LIBRARIES memcpy) endif () @@ -120,7 +97,6 @@ target_link_libraries (common PUBLIC ${Boost_SYSTEM_LIBRARY} PRIVATE - ${MALLOC_LIBRARIES} ${MEMCPY_LIBRARIES}) if (RT_LIBRARY) diff --git a/libs/libcommon/src/tests/CMakeLists.txt b/libs/libcommon/src/tests/CMakeLists.txt index 15d872ac49d..486914e4ca7 100644 --- a/libs/libcommon/src/tests/CMakeLists.txt +++ b/libs/libcommon/src/tests/CMakeLists.txt @@ -10,20 +10,20 @@ add_executable (realloc-perf allocator.cpp) set(PLATFORM_LIBS ${CMAKE_DL_LIBS}) -target_link_libraries (date_lut_init common ${PLATFORM_LIBS}) -target_link_libraries (date_lut2 common ${PLATFORM_LIBS}) -target_link_libraries (date_lut3 common ${PLATFORM_LIBS}) -target_link_libraries (date_lut4 common ${PLATFORM_LIBS}) -target_link_libraries (date_lut_default_timezone common ${PLATFORM_LIBS}) -target_link_libraries (local_date_time_comparison common) -target_link_libraries (realloc-perf common) +target_link_libraries (date_lut_init PRIVATE common ${PLATFORM_LIBS}) +target_link_libraries (date_lut2 PRIVATE common ${PLATFORM_LIBS}) +target_link_libraries (date_lut3 PRIVATE common ${PLATFORM_LIBS}) +target_link_libraries (date_lut4 PRIVATE common ${PLATFORM_LIBS}) +target_link_libraries (date_lut_default_timezone PRIVATE common ${PLATFORM_LIBS}) +target_link_libraries (local_date_time_comparison PRIVATE common) +target_link_libraries (realloc-perf PRIVATE common) add_check(local_date_time_comparison) if(USE_GTEST) add_executable(unit_tests_libcommon gtest_json_test.cpp gtest_strong_typedef.cpp gtest_find_symbols.cpp) - target_link_libraries(unit_tests_libcommon common ${GTEST_MAIN_LIBRARIES} ${GTEST_LIBRARIES}) + target_link_libraries(unit_tests_libcommon PRIVATE common ${GTEST_MAIN_LIBRARIES} ${GTEST_LIBRARIES}) add_check(unit_tests_libcommon) endif() add_executable (dump_variable dump_variable.cpp) -target_link_libraries (dump_variable clickhouse_common_io) +target_link_libraries (dump_variable PRIVATE clickhouse_common_io) diff --git a/libs/libmysqlxx/src/tests/CMakeLists.txt b/libs/libmysqlxx/src/tests/CMakeLists.txt index d2901513808..ec3fdfaa913 100644 --- a/libs/libmysqlxx/src/tests/CMakeLists.txt +++ b/libs/libmysqlxx/src/tests/CMakeLists.txt @@ -1,2 +1,2 @@ add_executable (mysqlxx_test mysqlxx_test.cpp) -target_link_libraries (mysqlxx_test mysqlxx) +target_link_libraries (mysqlxx_test PRIVATE mysqlxx) From 88b9672108c4b0693c3e44d68d34a3457c0a6921 Mon Sep 17 
00:00:00 2001 From: Dario Date: Fri, 27 Sep 2019 14:23:12 +0200 Subject: [PATCH 288/309] Added Redash to the GUI documentation --- docs/en/interfaces/third-party/gui.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/en/interfaces/third-party/gui.md b/docs/en/interfaces/third-party/gui.md index 140cc8a4222..0f99d3fbc95 100644 --- a/docs/en/interfaces/third-party/gui.md +++ b/docs/en/interfaces/third-party/gui.md @@ -48,6 +48,19 @@ Features: - Table preview with filtering and sorting. - Read-only queries execution. +### Redash + +[Redash](https://github.com/getredash/redash) is an insights platform. + +Features: + +- Connect with multiple sources (include ClickHouse). +- Autocompletion. +- Write queries in their natural syntax and explore schemas. +- Data plotting +- Create snippets for elements you frequently use. +- Use query results as data sources to join different databases. + ### DBeaver [DBeaver](https://dbeaver.io/) - universal desktop database client with ClickHouse support. From baaf0a7c5b7a087a72ec2ba3ab3f9a3b64e5d7ab Mon Sep 17 00:00:00 2001 From: chertus Date: Fri, 27 Sep 2019 15:24:07 +0300 Subject: [PATCH 289/309] partial_merge_join_optimisations -> partial_merge_join_optimizations --- dbms/src/Core/Settings.h | 2 +- dbms/src/Interpreters/AnalyzedJoin.cpp | 2 +- dbms/src/Interpreters/AnalyzedJoin.h | 6 +++--- dbms/src/Interpreters/MergeJoin.cpp | 2 +- dbms/tests/queries/0_stateless/01010_pmj_skip_blocks.sql | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dbms/src/Core/Settings.h b/dbms/src/Core/Settings.h index 7bb7c9e5dba..34b051c0d86 100644 --- a/dbms/src/Core/Settings.h +++ b/dbms/src/Core/Settings.h @@ -292,7 +292,7 @@ struct Settings : public SettingsCollection M(SettingOverflowMode, join_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.") \ M(SettingBool, join_any_take_last_row, false, "When disabled (default) ANY JOIN will take the first found row for a key. 
When enabled, it will take the last row seen if there are multiple rows for the same key.") \ M(SettingBool, partial_merge_join, false, "Use partial merge join instead of hash join for LEFT and INNER JOINs.") \ - M(SettingBool, partial_merge_join_optimisations, false, "Enable optimisations in partial merge join") \ + M(SettingBool, partial_merge_join_optimizations, false, "Enable optimisations in partial merge join") \ M(SettingUInt64, partial_merge_join_rows_in_right_blocks, 10000, "Split right-hand joining data in blocks of specified size.") \ \ M(SettingUInt64, max_rows_to_transfer, 0, "Maximum size (in rows) of the transmitted external table obtained when the GLOBAL IN/JOIN section is executed.") \ diff --git a/dbms/src/Interpreters/AnalyzedJoin.cpp b/dbms/src/Interpreters/AnalyzedJoin.cpp index cdf047fc5e5..072cf352831 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.cpp +++ b/dbms/src/Interpreters/AnalyzedJoin.cpp @@ -27,7 +27,7 @@ AnalyzedJoin::AnalyzedJoin(const Settings & settings) : size_limits(SizeLimits{settings.max_rows_in_join, settings.max_bytes_in_join, settings.join_overflow_mode}) , join_use_nulls(settings.join_use_nulls) , partial_merge_join(settings.partial_merge_join) - , partial_merge_join_optimisations(settings.partial_merge_join_optimisations) + , partial_merge_join_optimizations(settings.partial_merge_join_optimizations) , partial_merge_join_rows_in_right_blocks(settings.partial_merge_join_rows_in_right_blocks) {} diff --git a/dbms/src/Interpreters/AnalyzedJoin.h b/dbms/src/Interpreters/AnalyzedJoin.h index f9d0d9d0f79..2145912c202 100644 --- a/dbms/src/Interpreters/AnalyzedJoin.h +++ b/dbms/src/Interpreters/AnalyzedJoin.h @@ -40,7 +40,7 @@ class AnalyzedJoin const SizeLimits size_limits; const bool join_use_nulls; const bool partial_merge_join; - const bool partial_merge_join_optimisations; + const bool partial_merge_join_optimizations; const size_t partial_merge_join_rows_in_right_blocks; Names key_names_left; @@ -68,7 +68,7 @@ public: : size_limits(limits) , join_use_nulls(use_nulls) , partial_merge_join(false) - , partial_merge_join_optimisations(false) + , partial_merge_join_optimizations(false) , partial_merge_join_rows_in_right_blocks(0) , key_names_right(key_names_right_) { @@ -83,7 +83,7 @@ public: bool forceNullableRight() const { return join_use_nulls && isLeftOrFull(table_join.kind); } bool forceNullableLeft() const { return join_use_nulls && isRightOrFull(table_join.kind); } size_t maxRowsInRightBlock() const { return partial_merge_join_rows_in_right_blocks; } - bool enablePartialMergeJoinOptimisations() const { return partial_merge_join_optimisations; } + bool enablePartialMergeJoinOptimizations() const { return partial_merge_join_optimizations; } void addUsingKey(const ASTPtr & ast); void addOnKeys(ASTPtr & left_table_ast, ASTPtr & right_table_ast); diff --git a/dbms/src/Interpreters/MergeJoin.cpp b/dbms/src/Interpreters/MergeJoin.cpp index 2464500957b..430ff6693ab 100644 --- a/dbms/src/Interpreters/MergeJoin.cpp +++ b/dbms/src/Interpreters/MergeJoin.cpp @@ -330,7 +330,7 @@ MergeJoin::MergeJoin(std::shared_ptr table_join_, const Block & ri , is_all(table_join->strictness() == ASTTableJoin::Strictness::All) , is_inner(isInner(table_join->kind())) , is_left(isLeft(table_join->kind())) - , skip_not_intersected(table_join->enablePartialMergeJoinOptimisations()) + , skip_not_intersected(table_join->enablePartialMergeJoinOptimizations()) { if (!isLeft(table_join->kind()) && !isInner(table_join->kind())) throw Exception("Partial merge supported for LEFT and 
INNER JOINs only", ErrorCodes::NOT_IMPLEMENTED); diff --git a/dbms/tests/queries/0_stateless/01010_pmj_skip_blocks.sql b/dbms/tests/queries/0_stateless/01010_pmj_skip_blocks.sql index 3f2cbdd0cc4..3554aabe5f3 100644 --- a/dbms/tests/queries/0_stateless/01010_pmj_skip_blocks.sql +++ b/dbms/tests/queries/0_stateless/01010_pmj_skip_blocks.sql @@ -7,7 +7,7 @@ CREATE TABLE t1 (x UInt32, y UInt64) engine = MergeTree ORDER BY (x,y); CREATE TABLE t2 (x UInt32, y UInt64) engine = MergeTree ORDER BY (x,y); SET partial_merge_join = 1; -SET partial_merge_join_optimisations = 1; +SET partial_merge_join_optimizations = 1; SET partial_merge_join_rows_in_right_blocks = 2; SET any_join_distinct_right_table_keys = 1; From f7bf293cbfff2a3f69040584dcb249658d9159c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dar=C3=ADo?= Date: Fri, 27 Sep 2019 14:28:00 +0200 Subject: [PATCH 290/309] Update gui.md --- docs/en/interfaces/third-party/gui.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/en/interfaces/third-party/gui.md b/docs/en/interfaces/third-party/gui.md index 0f99d3fbc95..3ed0d8924e2 100644 --- a/docs/en/interfaces/third-party/gui.md +++ b/docs/en/interfaces/third-party/gui.md @@ -54,10 +54,10 @@ Features: Features: -- Connect with multiple sources (include ClickHouse). -- Autocompletion. -- Write queries in their natural syntax and explore schemas. -- Data plotting +- Connect with multiple sources (including ClickHouse). +- Autocompletion query editor. +- Schema/Database explorer. +- Data plotting. - Create snippets for elements you frequently use. - Use query results as data sources to join different databases. From 9ee031413c49c9ca86ea6f484bbe0ddedc91c459 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 27 Sep 2019 15:36:54 +0300 Subject: [PATCH 291/309] Remove a lot of unused code --- dbms/src/Interpreters/ExternalLoader.cpp | 219 ++++++----------------- dbms/src/Interpreters/ExternalLoader.h | 22 +-- 2 files changed, 51 insertions(+), 190 deletions(-) diff --git a/dbms/src/Interpreters/ExternalLoader.cpp b/dbms/src/Interpreters/ExternalLoader.cpp index e3a764abd2f..0709abc337e 100644 --- a/dbms/src/Interpreters/ExternalLoader.cpp +++ b/dbms/src/Interpreters/ExternalLoader.cpp @@ -221,16 +221,11 @@ public: using CreateObjectFunction = std::function; - /// Called after loading/reloading an object to calculate the time of the next update. - using CalculateNextUpdateTimeFunction = std::function; - LoadingDispatcher( const CreateObjectFunction & create_object_function_, - const CalculateNextUpdateTimeFunction & calculate_next_update_time_function_, const String & type_name_, Logger * log_) : create_object(create_object_function_) - , calculate_next_update_time(calculate_next_update_time_function_) , type_name(type_name_) , log(log_) { @@ -280,7 +275,7 @@ public: info.config = new_config; info.config_changed = true; - if (info.was_loading()) + if (info.wasLoading()) { /// The object has been tried to load before, so it is currently in use or was in use /// and we should try to reload it with the new config. @@ -325,7 +320,7 @@ public: { /// Start loading all the objects which were not loaded yet. for (auto & [name, info] : infos) - if (!info.was_loading()) + if (!info.wasLoading()) startLoading(name, info); } } @@ -356,30 +351,28 @@ public: const Info * info = getInfo(name); if (!info) return {Status::NOT_EXIST}; - return info->load_result(); + return info->loadResult(); } /// Returns all the load results as a map. 
/// The function doesn't load anything, it just returns the current load results as is. - template - LoadResults getCurrentLoadResults(const FilterByNameType & filter_by_name) const + LoadResults getCurrentLoadResults(const FilterByNameFunction & filter_by_name) const { std::lock_guard lock{mutex}; return collectLoadResults(filter_by_name); } - LoadResults getCurrentLoadResults() const { return getCurrentLoadResults(all_names); } + LoadResults getCurrentLoadResults() const { return getCurrentLoadResults(allNames); } /// Returns all the loaded objects as a map. /// The function doesn't load anything, it just returns the current load results as is. - template - Loadables getCurrentlyLoadedObjects(const FilterByNameType & filter_by_name) const + Loadables getCurrentlyLoadedObjects(const FilterByNameFunction & filter_by_name) const { std::lock_guard lock{mutex}; return collectLoadedObjects(filter_by_name); } - Loadables getCurrentlyLoadedObjects() const { return getCurrentlyLoadedObjects(all_names); } + Loadables getCurrentlyLoadedObjects() const { return getCurrentlyLoadedObjects(allNames); } size_t getNumberOfCurrentlyLoadedObjects() const { @@ -394,10 +387,6 @@ public: return count; } -#if !__clang__ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" -#endif bool hasCurrentlyLoadedObjects() const { std::lock_guard lock{mutex}; @@ -406,16 +395,6 @@ public: return true; return false; } -#if !__clang__ -#pragma GCC diagnostic pop -#endif - - /// Starts loading of a specified object. - void load(const String & name) - { - std::lock_guard lock{mutex}; - startLoading(name); - } /// Tries to load a specified object during the timeout. /// Returns nullptr if the loading is unsuccessful or if there is no such object. @@ -426,13 +405,6 @@ public: loaded_object = (info ? info->object : nullptr); } - void load(const String & name, LoadResult & load_result, Duration timeout = NO_TIMEOUT) - { - std::unique_lock lock{mutex}; - Info * info = loadImpl(name, timeout, lock); - load_result = info ? info->load_result() : LoadResult{Status::NOT_EXIST}; - } - /// Tries to finish loading of a specified object during the timeout. /// Returns nullptr if the loading is unsuccessful or if there is no such object. void loadStrict(const String & name, LoadablePtr & loaded_object) @@ -445,50 +417,25 @@ public: loaded_object = info->object; } - void loadStrict(const String & name, LoadResult & load_result) - { - std::unique_lock lock{mutex}; - Info * info = loadImpl(name, NO_TIMEOUT, lock); - if (!info) - throw Exception("No such " + type_name + " '" + name + "'.", ErrorCodes::BAD_ARGUMENTS); - checkLoaded(name, *info); - load_result = info->load_result(); - } - /// Tries to start loading of the objects for which the specified functor returns true. - template - void load(const FilterByNameType & filter_by_name) + void load(const FilterByNameFunction & filter_by_name) { std::lock_guard lock{mutex}; for (auto & [name, info] : infos) - if (!info.was_loading() && filter_by_name(name)) + if (!info.wasLoading() && filter_by_name(name)) startLoading(name, info); } /// Tries to finish loading of the objects for which the specified function returns true. 
- template - void load(const FilterByNameType & filter_by_name, Loadables & loaded_objects, Duration timeout = NO_TIMEOUT) + void load(const FilterByNameFunction & filter_by_name, Loadables & loaded_objects, Duration timeout = NO_TIMEOUT) { std::unique_lock lock{mutex}; loadImpl(filter_by_name, timeout, lock); loaded_objects = collectLoadedObjects(filter_by_name); } - template - void load(const FilterByNameType & filter_by_name, LoadResults & load_results, Duration timeout = NO_TIMEOUT) - { - std::unique_lock lock{mutex}; - loadImpl(filter_by_name, timeout, lock); - load_results = collectLoadResults(filter_by_name); - } - - - /// Starts loading of all the objects. - void load() { load(all_names); } - /// Tries to finish loading of all the objects during the timeout. - void load(Loadables & loaded_objects, Duration timeout = NO_TIMEOUT) { load(all_names, loaded_objects, timeout); } - void load(LoadResults & load_results, Duration timeout = NO_TIMEOUT) { load(all_names, load_results, timeout); } + void load(Loadables & loaded_objects, Duration timeout = NO_TIMEOUT) { load(allNames, loaded_objects, timeout); } /// Starts reloading a specified object. void reload(const String & name, bool load_never_loading = false) @@ -498,7 +445,7 @@ public: if (!info) return; - if (info->was_loading() || load_never_loading) + if (info->wasLoading() || load_never_loading) { cancelLoading(*info); info->forced_to_reload = true; @@ -507,13 +454,12 @@ public: } /// Starts reloading of the objects which `filter_by_name` returns true for. - template - void reload(const FilterByNameType & filter_by_name, bool load_never_loading = false) + void reload(const FilterByNameFunction & filter_by_name, bool load_never_loading = false) { std::lock_guard lock{mutex}; for (auto & [name, info] : infos) { - if ((info.was_loading() || load_never_loading) && filter_by_name(name)) + if ((info.wasLoading() || load_never_loading) && filter_by_name(name)) { cancelLoading(info); info.forced_to_reload = true; @@ -523,9 +469,7 @@ public: } /// Starts reloading of all the objects. - void reload(bool load_never_loading = false) { reload(all_names, load_never_loading); } - - using IsModifiedFunction = std::function; + void reload(bool load_never_loading = false) { reload(allNames, load_never_loading); } /// Starts reloading all the object which update time is earlier than now. /// The function doesn't touch the objects which were never tried to load. @@ -546,7 +490,7 @@ public: /// Find out which of the loaded objects were modified. /// We couldn't perform these checks while we were building `is_modified_map` because - /// the `mutex` should be unlocked while we're calling the function is_object_modified(). + /// the `mutex` should be unlocked while we're calling the function object->isModified() for (auto & [object, is_modified_flag] : is_modified_map) { try @@ -577,7 +521,7 @@ public: if (!is_modified_flag) { /// Object wasn't modified so we only have to set `next_update_time`. 
- info.next_update_time = calculate_next_update_time(info.object, info.error_count); + info.next_update_time = calculateNextUpdateTime(info.object, info.error_count); continue; } @@ -602,7 +546,7 @@ private: bool loaded() const { return object != nullptr; } bool failed() const { return !object && exception; } bool loading() const { return loading_id != 0; } - bool was_loading() const { return loaded() || failed() || loading(); } + bool wasLoading() const { return loaded() || failed() || loading(); } bool ready() const { return (loaded() || failed()) && !forced_to_reload; } Status status() const @@ -615,20 +559,20 @@ private: return loading() ? Status::LOADING : Status::NOT_LOADED; } - Duration loading_duration() const + Duration loadingDuration() const { if (loading()) return std::chrono::duration_cast(std::chrono::system_clock::now() - loading_start_time); return std::chrono::duration_cast(loading_end_time - loading_start_time); } - LoadResult load_result() const + LoadResult loadResult() const { LoadResult result{status()}; result.object = object; result.exception = exception; result.loading_start_time = loading_start_time; - result.loading_duration = loading_duration(); + result.loading_duration = loadingDuration(); result.origin = config.config_path; return result; } @@ -661,8 +605,7 @@ private: return &it->second; } - template - Loadables collectLoadedObjects(const FilterByNameType & filter_by_name) const + Loadables collectLoadedObjects(const FilterByNameFunction & filter_by_name) const { Loadables objects; objects.reserve(infos.size()); @@ -672,14 +615,13 @@ private: return objects; } - template - LoadResults collectLoadResults(const FilterByNameType & filter_by_name) const + LoadResults collectLoadResults(const FilterByNameFunction & filter_by_name) const { LoadResults load_results; load_results.reserve(infos.size()); for (const auto & [name, info] : infos) if (filter_by_name(name)) - load_results.emplace_back(name, info.load_result()); + load_results.emplace_back(name, info.loadResult()); return load_results; } @@ -704,8 +646,7 @@ private: return info; } - template - void loadImpl(const FilterByNameType & filter_by_name, Duration timeout, std::unique_lock & lock) + void loadImpl(const FilterByNameFunction & filter_by_name, Duration timeout, std::unique_lock & lock) { auto pred = [&]() { @@ -728,13 +669,6 @@ private: event.wait_for(lock, timeout, pred); } - void startLoading(const String & name) - { - Info * info = getInfo(name); - if (info) - startLoading(name, *info); - } - void startLoading(const String & name, Info & info) { if (info.loading()) @@ -823,7 +757,7 @@ private: ++error_count; else error_count = 0; - next_update_time = calculate_next_update_time(new_object, error_count); + next_update_time = calculateNextUpdateTime(new_object, error_count); } catch (...) { @@ -905,10 +839,31 @@ private: } /// Filter by name which matches everything. - static bool all_names(const String &) { return true; } + static bool allNames(const String &) { return true; } + + /// Calculate next update time for loaded_object. Can be called without mutex locking, + /// because single loadable can be loaded in single thread only. 
+ TimePoint calculateNextUpdateTime(const LoadablePtr & loaded_object, size_t error_count) const + { + static constexpr auto never = TimePoint::max(); + if (!error_count) + { + if (!loaded_object->supportUpdates()) + return never; + + /// do not update loadable objects with zero as lifetime + const auto & lifetime = loaded_object->getLifetime(); + if (lifetime.min_sec == 0 || lifetime.max_sec == 0) + return never; + + std::uniform_int_distribution distribution{lifetime.min_sec, lifetime.max_sec}; + return std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)}; + } + + return std::chrono::system_clock::now() + std::chrono::seconds(calculateDurationWithBackoff(rnd_engine, error_count)); + } const CreateObjectFunction create_object; - const CalculateNextUpdateTimeFunction calculate_next_update_time; const String type_name; Logger * log; @@ -920,6 +875,7 @@ private: bool enable_async_loading = false; std::unordered_map loading_ids; size_t next_loading_id = 1; /// should always be > 0 + mutable pcg64 rnd_engine{randomSeed()}; }; @@ -961,26 +917,6 @@ public: } } - TimePoint calculateNextUpdateTime(const LoadablePtr & loaded_object, size_t error_count) const - { - std::lock_guard lock{mutex}; - static constexpr auto never = TimePoint::max(); - if (!error_count) - { - if (!loaded_object->supportUpdates()) - return never; - - /// do not update loadable objects with zero as lifetime - const auto & lifetime = loaded_object->getLifetime(); - if (lifetime.min_sec == 0 || lifetime.max_sec == 0) - return never; - - std::uniform_int_distribution distribution{lifetime.min_sec, lifetime.max_sec}; - return std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)}; - } - - return std::chrono::system_clock::now() + std::chrono::seconds(calculateDurationWithBackoff(rnd_engine, error_count)); - } private: void doPeriodicUpdates() @@ -1005,7 +941,6 @@ private: bool enabled = false; ThreadFromGlobalPool thread; std::condition_variable event; - mutable pcg64 rnd_engine{randomSeed()}; }; @@ -1013,7 +948,6 @@ ExternalLoader::ExternalLoader(const Poco::Util::AbstractConfiguration & main_co : config_files_reader(std::make_unique(main_config, type_name_, log)) , loading_dispatcher(std::make_unique( std::bind(&ExternalLoader::createObject, this, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, std::placeholders::_4), - std::bind(&ExternalLoader::calculateNextUpdateTime, this, std::placeholders::_1, std::placeholders::_2), type_name_, log)) , periodic_updater(std::make_unique(*config_files_reader, *loading_dispatcher)) @@ -1085,36 +1019,16 @@ size_t ExternalLoader::getNumberOfCurrentlyLoadedObjects() const return loading_dispatcher->getNumberOfCurrentlyLoadedObjects(); } -void ExternalLoader::load(const String & name) const -{ - loading_dispatcher->load(name); -} - void ExternalLoader::load(const String & name, LoadablePtr & loaded_object, Duration timeout) const { loading_dispatcher->load(name, loaded_object, timeout); } -void ExternalLoader::load(const String & name, LoadResult & load_result, Duration timeout) const -{ - loading_dispatcher->load(name, load_result, timeout); -} - void ExternalLoader::loadStrict(const String & name, LoadablePtr & loaded_object) const { loading_dispatcher->loadStrict(name, loaded_object); } -void ExternalLoader::loadStrict(const String & name, LoadResult & load_result) const -{ - loading_dispatcher->loadStrict(name, load_result); -} - -void ExternalLoader::load(const FilterByNameFunction & filter_by_name) const -{ - 
loading_dispatcher->load(filter_by_name); -} - void ExternalLoader::load(const FilterByNameFunction & filter_by_name, Loadables & loaded_objects, Duration timeout) const { if (filter_by_name) @@ -1123,44 +1037,17 @@ void ExternalLoader::load(const FilterByNameFunction & filter_by_name, Loadables loading_dispatcher->load(loaded_objects, timeout); } -void ExternalLoader::load(const FilterByNameFunction & filter_by_name, LoadResults & load_results, Duration timeout) const -{ - if (filter_by_name) - loading_dispatcher->load(filter_by_name, load_results, timeout); - else - loading_dispatcher->load(load_results, timeout); -} - -void ExternalLoader::load() const -{ - loading_dispatcher->load(); -} - void ExternalLoader::load(Loadables & loaded_objects, Duration timeout) const { return loading_dispatcher->load(loaded_objects, timeout); } -void ExternalLoader::load(LoadResults & load_results, Duration timeout) const -{ - loading_dispatcher->load(load_results, timeout); -} - void ExternalLoader::reload(const String & name, bool load_never_loading) { loading_dispatcher->setConfiguration(config_files_reader->read()); loading_dispatcher->reload(name, load_never_loading); } -void ExternalLoader::reload(const FilterByNameFunction & filter_by_name, bool load_never_loading) -{ - loading_dispatcher->setConfiguration(config_files_reader->read()); - if (filter_by_name) - loading_dispatcher->reload(filter_by_name, load_never_loading); - else - loading_dispatcher->reload(load_never_loading); -} - void ExternalLoader::reload(bool load_never_loading) { loading_dispatcher->setConfiguration(config_files_reader->read()); @@ -1176,12 +1063,6 @@ ExternalLoader::LoadablePtr ExternalLoader::createObject( return create(name, *config.config, config.key_in_config); } -ExternalLoader::TimePoint ExternalLoader::calculateNextUpdateTime(const LoadablePtr & loaded_object, size_t error_count) const -{ - return periodic_updater->calculateNextUpdateTime(loaded_object, error_count); -} - - std::vector> ExternalLoader::getStatusEnumAllPossibleValues() { return std::vector>{ diff --git a/dbms/src/Interpreters/ExternalLoader.h b/dbms/src/Interpreters/ExternalLoader.h index 35e31dc613c..30b54d4490c 100644 --- a/dbms/src/Interpreters/ExternalLoader.h +++ b/dbms/src/Interpreters/ExternalLoader.h @@ -121,48 +121,29 @@ public: static constexpr Duration NO_TIMEOUT = Duration::max(); - /// Starts loading of a specified object. - void load(const String & name) const; - /// Tries to finish loading of a specified object during the timeout. /// Returns nullptr if the loading is unsuccessful or if there is no such object. void load(const String & name, LoadablePtr & loaded_object, Duration timeout = NO_TIMEOUT) const; - void load(const String & name, LoadResult & load_result, Duration timeout = NO_TIMEOUT) const; LoadablePtr loadAndGet(const String & name, Duration timeout = NO_TIMEOUT) const { LoadablePtr object; load(name, object, timeout); return object; } LoadablePtr tryGetLoadable(const String & name) const { return loadAndGet(name); } /// Tries to finish loading of a specified object during the timeout. /// Throws an exception if the loading is unsuccessful or if there is no such object. 
void loadStrict(const String & name, LoadablePtr & loaded_object) const; - void loadStrict(const String & name, LoadResult & load_result) const; - LoadablePtr loadAndGetStrict(const String & name) const { LoadablePtr object; loadStrict(name, object); return object; } - LoadablePtr getLoadable(const String & name) const { return loadAndGetStrict(name); } - - /// Tries to start loading of the objects for which the specified function returns true. - void load(const FilterByNameFunction & filter_by_name) const; + LoadablePtr getLoadable(const String & name) const { LoadablePtr object; loadStrict(name, object); return object; } /// Tries to finish loading of the objects for which the specified function returns true. void load(const FilterByNameFunction & filter_by_name, Loadables & loaded_objects, Duration timeout = NO_TIMEOUT) const; - void load(const FilterByNameFunction & filter_by_name, LoadResults & load_results, Duration timeout = NO_TIMEOUT) const; Loadables loadAndGet(const FilterByNameFunction & filter_by_name, Duration timeout = NO_TIMEOUT) const { Loadables loaded_objects; load(filter_by_name, loaded_objects, timeout); return loaded_objects; } - /// Starts loading of all the objects. - void load() const; - /// Tries to finish loading of all the objects during the timeout. void load(Loadables & loaded_objects, Duration timeout = NO_TIMEOUT) const; - void load(LoadResults & load_results, Duration timeout = NO_TIMEOUT) const; /// Starts reloading of a specified object. /// `load_never_loading` specifies what to do if the object has never been loading before. /// The function can either skip it (false) or load for the first time (true). void reload(const String & name, bool load_never_loading = false); - /// Starts reloading of the objects for which the specified function returns true. - /// `load_never_loading` specifies what to do with the objects which have never been loading before. - /// The function can either skip them (false) or load for the first time (true). - void reload(const FilterByNameFunction & filter_by_name, bool load_never_loading = false); - /// Starts reloading of all the objects. /// `load_never_loading` specifies what to do with the objects which have never been loading before. /// The function can either skip them (false) or load for the first time (true). @@ -175,7 +156,6 @@ private: struct ObjectConfig; LoadablePtr createObject(const String & name, const ObjectConfig & config, bool config_changed, const LoadablePtr & previous_version) const; - TimePoint calculateNextUpdateTime(const LoadablePtr & loaded_object, size_t error_count) const; class ConfigFilesReader; std::unique_ptr config_files_reader; From 0356bd98ae89d6bb1a66d025cca512ac34d99f2f Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Fri, 27 Sep 2019 14:46:14 +0200 Subject: [PATCH 292/309] Add missing verb in readme Signed-off-by: Philippe Ombredanne --- docker/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/README.md b/docker/README.md index bf7e35f8b34..083c6a1dc28 100644 --- a/docker/README.md +++ b/docker/README.md @@ -2,4 +2,4 @@ This directory contain Dockerfiles for `clickhouse-client` and `clickhouse-server`. They updated each release. -Also there is bunch of images for testing and CI. They listed in `images.json` file and updated on each commit to master. If you need to add another image, place information about it into `images.json`. +Also there is bunch of images for testing and CI. 
They are listed in `images.json` file and updated on each commit to master. If you need to add another image, place information about it into `images.json`. From e5a5091556c488510bd6c664a0ca05bb7f9ea2ad Mon Sep 17 00:00:00 2001 From: Ivan <5627721+abyss7@users.noreply.github.com> Date: Fri, 27 Sep 2019 16:16:53 +0300 Subject: [PATCH 293/309] Revert "Get rid of malloc symbols in libcommon (#7065)" (#7131) This reverts commit 4cb53093a0aede40132dd4d3d40e5827a5c9b923. --- CMakeLists.txt | 27 +++--------------------- dbms/CMakeLists.txt | 16 +++++++++----- dbms/programs/CMakeLists.txt | 6 +++--- dbms/src/Client/tests/CMakeLists.txt | 2 +- dbms/src/Processors/tests/CMakeLists.txt | 14 ++++++------ libs/libcommon/CMakeLists.txt | 24 +++++++++++++++++++++ libs/libcommon/src/tests/CMakeLists.txt | 18 ++++++++-------- libs/libmysqlxx/src/tests/CMakeLists.txt | 2 +- 8 files changed, 59 insertions(+), 50 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a5460cd1c76..bb7387ddeb0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,3 +1,4 @@ +project(ClickHouse) cmake_minimum_required(VERSION 3.3) foreach(policy @@ -12,7 +13,6 @@ foreach(policy endif() endforeach() -project(ClickHouse) include (cmake/target.cmake) # Ignore export() since we don't use it, @@ -348,7 +348,7 @@ include (libs/libcommon/cmake/find_jemalloc.cmake) include (libs/libcommon/cmake/find_cctz.cmake) include (libs/libmysqlxx/cmake/find_mysqlclient.cmake) -# When testing for memory leaks with Valgrind, don't link tcmalloc or jemalloc. +# When testing for memory leaks with Valgrind, dont link tcmalloc or jemalloc. if (USE_JEMALLOC) message (STATUS "Link jemalloc: ${JEMALLOC_LIBRARIES}") @@ -367,7 +367,7 @@ elseif (USE_TCMALLOC) endif () elseif (SANITIZE) message (STATUS "Will use ${SANITIZE} sanitizer.") -elseif (OS_LINUX) +else () message (WARNING "Non default allocator is disabled. 
This is not recommended for production Linux builds.") endif () @@ -376,29 +376,8 @@ include (cmake/print_flags.cmake) install (EXPORT global DESTINATION cmake) add_subdirectory (contrib EXCLUDE_FROM_ALL) - -macro (add_executable target) - # invoke built-in add_executable - _add_executable (${ARGV}) - get_target_property (type ${target} TYPE) - if (${type} STREQUAL EXECUTABLE) - set_property (GLOBAL APPEND PROPERTY CLICKHOUSE_EXECUTABLES ${target}) - endif() -endmacro() - -set_property (GLOBAL PROPERTY CLICKHOUSE_EXECUTABLES "") add_subdirectory (libs) add_subdirectory (utils) -get_property (executables GLOBAL PROPERTY CLICKHOUSE_EXECUTABLES) -foreach (executable ${executables}) - target_link_libraries (${executable} PRIVATE ${MALLOC_LIBRARIES}) -endforeach () - -set_property (GLOBAL PROPERTY CLICKHOUSE_EXECUTABLES "") add_subdirectory (dbms) -get_property (executables GLOBAL PROPERTY CLICKHOUSE_EXECUTABLES) -foreach (executable ${executables}) - target_link_libraries (${executable} PRIVATE clickhouse_new_delete ${MALLOC_LIBRARIES}) -endforeach () include (cmake/print_include_directories.cmake) diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index 30e73815bc7..22a3111a70e 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -100,7 +100,6 @@ set(dbms_sources) add_headers_and_sources(clickhouse_common_io src/Common) add_headers_and_sources(clickhouse_common_io src/Common/HashTable) add_headers_and_sources(clickhouse_common_io src/IO) -list (REMOVE_ITEM clickhouse_common_io_sources src/Common/new_delete.cpp) if(USE_RDKAFKA) add_headers_and_sources(dbms src/Storages/Kafka) @@ -140,9 +139,6 @@ endif () add_library(clickhouse_common_io ${clickhouse_common_io_headers} ${clickhouse_common_io_sources}) -add_library (clickhouse_new_delete STATIC src/Common/new_delete.cpp) -target_link_libraries (clickhouse_new_delete clickhouse_common_io) - if (OS_FREEBSD) target_compile_definitions (clickhouse_common_io PUBLIC CLOCK_MONOTONIC_COARSE=CLOCK_MONOTONIC_FAST) endif () @@ -423,7 +419,17 @@ endif() if (USE_JEMALLOC) dbms_target_include_directories (SYSTEM BEFORE PRIVATE ${JEMALLOC_INCLUDE_DIR}) # used in Interpreters/AsynchronousMetrics.cpp - target_include_directories (clickhouse_new_delete SYSTEM BEFORE PRIVATE ${JEMALLOC_INCLUDE_DIR}) + target_include_directories (clickhouse_common_io SYSTEM BEFORE PRIVATE ${JEMALLOC_INCLUDE_DIR}) # new_delete.cpp + # common/memory.h + if (MAKE_STATIC_LIBRARIES OR NOT SPLIT_SHARED_LIBRARIES) + # skip if we have bundled build, since jemalloc is static in this case + elseif (${JEMALLOC_LIBRARIES} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$") + # if the library is static we do not need to link with it, + # since in this case it will be in libs/libcommon, + # and we do not want to link with jemalloc multiple times. 
+ else() + target_link_libraries(clickhouse_common_io PRIVATE ${JEMALLOC_LIBRARIES}) + endif() endif () dbms_target_include_directories (PUBLIC ${DBMS_INCLUDE_DIR} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src/Formats/include) diff --git a/dbms/programs/CMakeLists.txt b/dbms/programs/CMakeLists.txt index 138321360f3..bac3269468e 100644 --- a/dbms/programs/CMakeLists.txt +++ b/dbms/programs/CMakeLists.txt @@ -24,9 +24,9 @@ configure_file (config_tools.h.in ${CMAKE_CURRENT_BINARY_DIR}/config_tools.h) macro(clickhouse_target_link_split_lib target name) if(NOT CLICKHOUSE_ONE_SHARED) - target_link_libraries(${target} PRIVATE clickhouse-${name}-lib) + target_link_libraries(${target} PRIVATE clickhouse-${name}-lib ${MALLOC_LIBRARIES}) else() - target_link_libraries(${target} PRIVATE clickhouse-lib) + target_link_libraries(${target} PRIVATE clickhouse-lib ${MALLOC_LIBRARIES}) endif() endmacro() @@ -111,7 +111,7 @@ if (CLICKHOUSE_SPLIT_BINARY) install(PROGRAMS clickhouse-split-helper DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME clickhouse COMPONENT clickhouse) else () add_executable (clickhouse main.cpp) - target_link_libraries (clickhouse PRIVATE clickhouse_common_io string_utils) + target_link_libraries (clickhouse PRIVATE clickhouse_common_io string_utils ${MALLOC_LIBRARIES}) target_include_directories (clickhouse BEFORE PRIVATE ${COMMON_INCLUDE_DIR}) target_include_directories (clickhouse PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/dbms/src/Client/tests/CMakeLists.txt b/dbms/src/Client/tests/CMakeLists.txt index d952c006bb5..f4471136a8a 100644 --- a/dbms/src/Client/tests/CMakeLists.txt +++ b/dbms/src/Client/tests/CMakeLists.txt @@ -1,2 +1,2 @@ add_executable(test-connect test_connect.cpp) -target_link_libraries (test-connect PRIVATE dbms) +target_link_libraries (test-connect dbms) diff --git a/dbms/src/Processors/tests/CMakeLists.txt b/dbms/src/Processors/tests/CMakeLists.txt index 4ddb6c68416..5f44ec2a8fd 100644 --- a/dbms/src/Processors/tests/CMakeLists.txt +++ b/dbms/src/Processors/tests/CMakeLists.txt @@ -6,10 +6,10 @@ add_executable (processors_test_merge_sorting_transform processors_test_merge_so add_executable (processors_test_expand_pipeline processors_test_expand_pipeline.cpp) add_executable (processors_test_aggregation processors_test_aggregation.cpp) -target_link_libraries (processors_test PRIVATE dbms) -target_link_libraries (processors_test_chain PRIVATE dbms) -target_link_libraries (processors_test_merge PRIVATE dbms) -target_link_libraries (processors_test_expand_pipeline PRIVATE dbms) -target_link_libraries (processors_test_merging_sorted_transform PRIVATE dbms) -target_link_libraries (processors_test_merge_sorting_transform PRIVATE dbms) -target_link_libraries (processors_test_aggregation PRIVATE dbms clickhouse_aggregate_functions) +target_link_libraries (processors_test dbms) +target_link_libraries (processors_test_chain dbms) +target_link_libraries (processors_test_merge dbms) +target_link_libraries (processors_test_expand_pipeline dbms) +target_link_libraries (processors_test_merging_sorted_transform dbms) +target_link_libraries (processors_test_merge_sorting_transform dbms) +target_link_libraries (processors_test_aggregation dbms clickhouse_aggregate_functions) diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index 357e457b240..62c64a9bdb0 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -65,6 +65,29 @@ add_library (common ${CONFIG_COMMON}) +# When testing for memory leaks with Valgrind, dont link tcmalloc or 
jemalloc. + +if (USE_JEMALLOC) + message (STATUS "Link jemalloc: ${JEMALLOC_LIBRARIES}") + set (MALLOC_LIBRARIES ${JEMALLOC_LIBRARIES}) +elseif (USE_TCMALLOC) + if (DEBUG_TCMALLOC AND NOT GPERFTOOLS_TCMALLOC_MINIMAL_DEBUG) + message (FATAL_ERROR "Requested DEBUG_TCMALLOC but debug library is not found. You should install Google Perftools. Example: sudo apt-get install libgoogle-perftools-dev") + endif () + + if (DEBUG_TCMALLOC AND GPERFTOOLS_TCMALLOC_MINIMAL_DEBUG) + message (STATUS "Link libtcmalloc_minimal_debug for testing: ${GPERFTOOLS_TCMALLOC_MINIMAL_DEBUG}") + set (MALLOC_LIBRARIES ${GPERFTOOLS_TCMALLOC_MINIMAL_DEBUG}) + else () + message (STATUS "Link libtcmalloc_minimal: ${GPERFTOOLS_TCMALLOC_MINIMAL}") + set (MALLOC_LIBRARIES ${GPERFTOOLS_TCMALLOC_MINIMAL}) + endif () +elseif (SANITIZE) + message (STATUS "Will use ${SANITIZE} sanitizer.") +elseif (OS_LINUX) + message (WARNING "Non default allocator is disabled. This is not recommended for production Linux builds.") +endif () + if (USE_INTERNAL_MEMCPY) set (MEMCPY_LIBRARIES memcpy) endif () @@ -97,6 +120,7 @@ target_link_libraries (common PUBLIC ${Boost_SYSTEM_LIBRARY} PRIVATE + ${MALLOC_LIBRARIES} ${MEMCPY_LIBRARIES}) if (RT_LIBRARY) diff --git a/libs/libcommon/src/tests/CMakeLists.txt b/libs/libcommon/src/tests/CMakeLists.txt index 486914e4ca7..15d872ac49d 100644 --- a/libs/libcommon/src/tests/CMakeLists.txt +++ b/libs/libcommon/src/tests/CMakeLists.txt @@ -10,20 +10,20 @@ add_executable (realloc-perf allocator.cpp) set(PLATFORM_LIBS ${CMAKE_DL_LIBS}) -target_link_libraries (date_lut_init PRIVATE common ${PLATFORM_LIBS}) -target_link_libraries (date_lut2 PRIVATE common ${PLATFORM_LIBS}) -target_link_libraries (date_lut3 PRIVATE common ${PLATFORM_LIBS}) -target_link_libraries (date_lut4 PRIVATE common ${PLATFORM_LIBS}) -target_link_libraries (date_lut_default_timezone PRIVATE common ${PLATFORM_LIBS}) -target_link_libraries (local_date_time_comparison PRIVATE common) -target_link_libraries (realloc-perf PRIVATE common) +target_link_libraries (date_lut_init common ${PLATFORM_LIBS}) +target_link_libraries (date_lut2 common ${PLATFORM_LIBS}) +target_link_libraries (date_lut3 common ${PLATFORM_LIBS}) +target_link_libraries (date_lut4 common ${PLATFORM_LIBS}) +target_link_libraries (date_lut_default_timezone common ${PLATFORM_LIBS}) +target_link_libraries (local_date_time_comparison common) +target_link_libraries (realloc-perf common) add_check(local_date_time_comparison) if(USE_GTEST) add_executable(unit_tests_libcommon gtest_json_test.cpp gtest_strong_typedef.cpp gtest_find_symbols.cpp) - target_link_libraries(unit_tests_libcommon PRIVATE common ${GTEST_MAIN_LIBRARIES} ${GTEST_LIBRARIES}) + target_link_libraries(unit_tests_libcommon common ${GTEST_MAIN_LIBRARIES} ${GTEST_LIBRARIES}) add_check(unit_tests_libcommon) endif() add_executable (dump_variable dump_variable.cpp) -target_link_libraries (dump_variable PRIVATE clickhouse_common_io) +target_link_libraries (dump_variable clickhouse_common_io) diff --git a/libs/libmysqlxx/src/tests/CMakeLists.txt b/libs/libmysqlxx/src/tests/CMakeLists.txt index ec3fdfaa913..d2901513808 100644 --- a/libs/libmysqlxx/src/tests/CMakeLists.txt +++ b/libs/libmysqlxx/src/tests/CMakeLists.txt @@ -1,2 +1,2 @@ add_executable (mysqlxx_test mysqlxx_test.cpp) -target_link_libraries (mysqlxx_test PRIVATE mysqlxx) +target_link_libraries (mysqlxx_test mysqlxx) From dad401a047722fdef41d688db0aaa54936e05be0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 27 Sep 2019 16:25:51 +0300 Subject: [PATCH 294/309] 
Downgrade cmake version in packager image --- docker/packager/binary/Dockerfile | 5 +---- docker/packager/deb/Dockerfile | 2 -- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index 2ad696cd279..d88a2767efd 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -16,9 +16,6 @@ RUN apt-get --allow-unauthenticated update -y \ RUN echo "deb [trusted=yes] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-7 main" >> /etc/apt/sources.list RUN echo "deb [trusted=yes] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8 main" >> /etc/apt/sources.list RUN add-apt-repository ppa:ubuntu-toolchain-r/test -RUN curl https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - -RUN apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main' - RUN apt-get update -y \ && env DEBIAN_FRONTEND=noninteractive \ @@ -59,7 +56,7 @@ RUN apt-get update -y \ gdb \ rename \ wget - + # Build and install tools for cross-linking to Darwin ENV CC=clang-8 diff --git a/docker/packager/deb/Dockerfile b/docker/packager/deb/Dockerfile index 10263205fb4..986768b7a95 100644 --- a/docker/packager/deb/Dockerfile +++ b/docker/packager/deb/Dockerfile @@ -17,8 +17,6 @@ RUN apt-get --allow-unauthenticated update -y \ RUN echo "deb [trusted=yes] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-7 main" >> /etc/apt/sources.list RUN echo "deb [trusted=yes] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8 main" >> /etc/apt/sources.list RUN add-apt-repository ppa:ubuntu-toolchain-r/test -RUN curl https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - -RUN apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main' RUN apt-get --allow-unauthenticated update -y \ && env DEBIAN_FRONTEND=noninteractive \ From 2f355d716af4194c4be38df3569c3c4555131e33 Mon Sep 17 00:00:00 2001 From: Alexander Kazakov Date: Fri, 27 Sep 2019 16:38:40 +0300 Subject: [PATCH 295/309] Corrections to tests 00992/00993:mutations to be deterministic --- ...2_system_parts_race_condition_zookeeper.sh | 19 ++++++++++++------- ...tem_parts_race_condition_drop_zookeeper.sh | 2 +- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/dbms/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper.sh b/dbms/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper.sh index 99c9e8774fe..2199c99e3be 100755 --- a/dbms/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper.sh +++ b/dbms/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper.sh @@ -5,10 +5,13 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) set -e -$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS alter_table" -$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS alter_table2" -$CLICKHOUSE_CLIENT -q "CREATE TABLE alter_table (a UInt8, b Int16, c Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) ENGINE = ReplicatedMergeTree('/clickhouse/tables/alter_table', 'r1') ORDER BY a PARTITION BY b % 10 SETTINGS old_parts_lifetime = 1" -$CLICKHOUSE_CLIENT -q "CREATE TABLE alter_table2 (a UInt8, b Int16, c Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) ENGINE = ReplicatedMergeTree('/clickhouse/tables/alter_table', 'r2') ORDER BY a PARTITION BY b % 10 SETTINGS old_parts_lifetime = 1" +$CLICKHOUSE_CLIENT -n -q " + DROP TABLE IF EXISTS alter_table; + DROP TABLE IF EXISTS alter_table2; + + CREATE TABLE alter_table (a UInt8, b Int16, c 
Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_DATABASE.alter_table', 'r1') ORDER BY a PARTITION BY b % 10 SETTINGS old_parts_lifetime = 1; + CREATE TABLE alter_table2 (a UInt8, b Int16, c Float32, d String, e Array(UInt8), f Nullable(UUID), g Tuple(UInt8, UInt16)) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_DATABASE.alter_table', 'r2') ORDER BY a PARTITION BY b % 10 SETTINGS old_parts_lifetime = 1 +" function thread1() { @@ -32,7 +35,7 @@ function thread4() function thread5() { - while true; do $CLICKHOUSE_CLIENT -q "ALTER TABLE alter_table DELETE WHERE rand() % 2 = 1"; done + while true; do $CLICKHOUSE_CLIENT -q "ALTER TABLE alter_table DELETE WHERE cityHash64(a,b,c,d,e,g) % 1048576 < 524288"; done } # https://stackoverflow.com/questions/9954794/execute-a-shell-function-with-timeout @@ -70,5 +73,7 @@ timeout $TIMEOUT bash -c thread5 2> /dev/null & wait -$CLICKHOUSE_CLIENT -q "DROP TABLE alter_table" -$CLICKHOUSE_CLIENT -q "DROP TABLE alter_table2" +$CLICKHOUSE_CLIENT -n -q " + DROP TABLE alter_table; + DROP TABLE alter_table2 +" diff --git a/dbms/tests/queries/0_stateless/00993_system_parts_race_condition_drop_zookeeper.sh b/dbms/tests/queries/0_stateless/00993_system_parts_race_condition_drop_zookeeper.sh index 20f848155bc..a9d6e51a28b 100755 --- a/dbms/tests/queries/0_stateless/00993_system_parts_race_condition_drop_zookeeper.sh +++ b/dbms/tests/queries/0_stateless/00993_system_parts_race_condition_drop_zookeeper.sh @@ -41,7 +41,7 @@ function thread5() { while true; do REPLICA=$(($RANDOM % 10)) - $CLICKHOUSE_CLIENT -q "ALTER TABLE alter_table_$REPLICA DELETE WHERE rand() % 2 = 1"; + $CLICKHOUSE_CLIENT -q "ALTER TABLE alter_table_$REPLICA DELETE WHERE cityHash64(a,b,c,d,e,g) % 1048576 < 524288"; sleep 0.$RANDOM; done } From a04756d0dd3a3bc64513657208c782c5fecfa1a0 Mon Sep 17 00:00:00 2001 From: BayoNet Date: Fri, 27 Sep 2019 18:44:36 +0300 Subject: [PATCH 296/309] DOCAPI-8799: Updated SHOW TABLE dscr. Restructured TOC (#7128) * Typo fix. * Links fix. * Fixed links in docs. * More fixes. * Link fixes. * DOCAPI-8799: Updated SHOW TABLES. Restructured TOC. * DOCAPI-8799: Fix. * DOCAPI-8799: Links fix. --- .../en/operations/server_settings/settings.md | 2 +- docs/en/query_language/misc.md | 66 --------------- docs/en/query_language/show.md | 82 +++++++++++++++++++ docs/fa/query_language/show.md | 1 + docs/toc_en.yml | 79 +++++++++--------- docs/toc_fa.yml | 1 + docs/toc_zh.yml | 1 + docs/zh/query_language/show.md | 1 + 8 files changed, 127 insertions(+), 106 deletions(-) create mode 100644 docs/en/query_language/show.md create mode 120000 docs/fa/query_language/show.md create mode 120000 docs/zh/query_language/show.md diff --git a/docs/en/operations/server_settings/settings.md b/docs/en/operations/server_settings/settings.md index 796e4621475..70e46629c32 100644 --- a/docs/en/operations/server_settings/settings.md +++ b/docs/en/operations/server_settings/settings.md @@ -61,7 +61,7 @@ ClickHouse checks `min_part_size` and `min_part_size_ratio` and processes the `c The default database. -To get a list of databases, use the [SHOW DATABASES](../../query_language/misc.md#show-databases) query. +To get a list of databases, use the [SHOW DATABASES](../../query_language/show.md#show-databases) query. 
**Example** diff --git a/docs/en/query_language/misc.md b/docs/en/query_language/misc.md index 4272bb5c155..22d67044619 100644 --- a/docs/en/query_language/misc.md +++ b/docs/en/query_language/misc.md @@ -214,72 +214,6 @@ SET profile = 'profile-name-from-the-settings-file' For more information, see [Settings](../operations/settings/settings.md). -## SHOW CREATE TABLE - -```sql -SHOW CREATE [TEMPORARY] TABLE [db.]table [INTO OUTFILE filename] [FORMAT format] -``` - -Returns a single `String`-type 'statement' column, which contains a single value – the `CREATE` query used for creating the specified table. - -## SHOW DATABASES {#show-databases} - -```sql -SHOW DATABASES [INTO OUTFILE filename] [FORMAT format] -``` - -Prints a list of all databases. -This query is identical to `SELECT name FROM system.databases [INTO OUTFILE filename] [FORMAT format]`. - -See also the section "Formats". - -## SHOW PROCESSLIST - -```sql -SHOW PROCESSLIST [INTO OUTFILE filename] [FORMAT format] -``` - -Outputs a list of queries currently being processed, other than `SHOW PROCESSLIST` queries. - -Prints a table containing the columns: - -**user** – The user who made the query. Keep in mind that for distributed processing, queries are sent to remote servers under the 'default' user. SHOW PROCESSLIST shows the username for a specific query, not for a query that this query initiated. - -**address** – The name of the host that the query was sent from. For distributed processing, on remote servers, this is the name of the query requestor host. To track where a distributed query was originally made from, look at SHOW PROCESSLIST on the query requestor server. - -**elapsed** – The execution time, in seconds. Queries are output in order of decreasing execution time. - -**rows_read**, **bytes_read** – How many rows and bytes of uncompressed data were read when processing the query. For distributed processing, data is totaled from all the remote servers. This is the data used for restrictions and quotas. - -**memory_usage** – Current RAM usage in bytes. See the setting 'max_memory_usage'. - -**query** – The query itself. In INSERT queries, the data for insertion is not output. - -**query_id** – The query identifier. Non-empty only if it was explicitly defined by the user. For distributed processing, the query ID is not passed to remote servers. - -This query is nearly identical to: `SELECT * FROM system.processes`. The difference is that the `SHOW PROCESSLIST` query does not show itself in a list, when the `SELECT .. FROM system.processes` query does. - -Tip (execute in the console): - -```bash -$ watch -n1 "clickhouse-client --query='SHOW PROCESSLIST'" -``` - -## SHOW TABLES - -```sql -SHOW [TEMPORARY] TABLES [FROM db] [LIKE 'pattern'] [INTO OUTFILE filename] [FORMAT format] -``` - -Displays a list of tables - -- Tables from the current database, or from the 'db' database if "FROM db" is specified. -- All tables, or tables whose name matches the pattern, if "LIKE 'pattern'" is specified. - -This query is identical to: `SELECT name FROM system.tables WHERE database = 'db' [AND name LIKE 'pattern'] [INTO OUTFILE filename] [FORMAT format]`. - -See also the section "LIKE operator". 
- ## TRUNCATE ```sql diff --git a/docs/en/query_language/show.md b/docs/en/query_language/show.md new file mode 100644 index 00000000000..04f373a31a9 --- /dev/null +++ b/docs/en/query_language/show.md @@ -0,0 +1,82 @@ +# SHOW Queries + +## SHOW CREATE TABLE + +```sql +SHOW CREATE [TEMPORARY] TABLE [db.]table [INTO OUTFILE filename] [FORMAT format] +``` + +Returns a single `String`-type 'statement' column, which contains a single value – the `CREATE` query used for creating the specified table. + +## SHOW DATABASES {#show-databases} + +```sql +SHOW DATABASES [INTO OUTFILE filename] [FORMAT format] +``` + +Prints a list of all databases. +This query is identical to `SELECT name FROM system.databases [INTO OUTFILE filename] [FORMAT format]`. + +See also the section "Formats". + +## SHOW PROCESSLIST + +```sql +SHOW PROCESSLIST [INTO OUTFILE filename] [FORMAT format] +``` + +Outputs a list of queries currently being processed, other than `SHOW PROCESSLIST` queries. + +Prints a table containing the columns: + +**user** – The user who made the query. Keep in mind that for distributed processing, queries are sent to remote servers under the 'default' user. SHOW PROCESSLIST shows the username for a specific query, not for a query that this query initiated. + +**address** – The name of the host that the query was sent from. For distributed processing, on remote servers, this is the name of the query requestor host. To track where a distributed query was originally made from, look at SHOW PROCESSLIST on the query requestor server. + +**elapsed** – The execution time, in seconds. Queries are output in order of decreasing execution time. + +**rows_read**, **bytes_read** – How many rows and bytes of uncompressed data were read when processing the query. For distributed processing, data is totaled from all the remote servers. This is the data used for restrictions and quotas. + +**memory_usage** – Current RAM usage in bytes. See the setting 'max_memory_usage'. + +**query** – The query itself. In INSERT queries, the data for insertion is not output. + +**query_id** – The query identifier. Non-empty only if it was explicitly defined by the user. For distributed processing, the query ID is not passed to remote servers. + +This query is nearly identical to: `SELECT * FROM system.processes`. The difference is that the `SHOW PROCESSLIST` query does not show itself in a list, when the `SELECT .. FROM system.processes` query does. + +Tip (execute in the console): + +```bash +$ watch -n1 "clickhouse-client --query='SHOW PROCESSLIST'" +``` + +## SHOW TABLES + +Displays a list of tables. + +```sql +SHOW [TEMPORARY] TABLES [FROM ] [LIKE ''] [LIMIT ] [INTO OUTFILE ] [FORMAT ] +``` + +If the `FROM` clause is not specified, the query returns the list of tables from the current database. + +The same result as the `SHOW TABLES` query returns, you can get by the following way: + +```sql +SELECT name FROM system.tables WHERE database = [AND name LIKE ] [LIMIT ] [INTO OUTFILE ] [FORMAT ] +``` + +**Example** + +The following query selects the first two rows from the list of tables in the `system` database, whose names contain `co`. 
+ +```sql +SHOW TABLES FROM system LIKE '%co%' LIMIT 2 +``` +```text +┌─name───────────────────────────┐ +│ aggregate_function_combinators │ +│ collations │ +└────────────────────────────────┘ +``` diff --git a/docs/fa/query_language/show.md b/docs/fa/query_language/show.md new file mode 120000 index 00000000000..4c2f4cf2c4f --- /dev/null +++ b/docs/fa/query_language/show.md @@ -0,0 +1 @@ +../../en/query_language/show.md \ No newline at end of file diff --git a/docs/toc_en.yml b/docs/toc_en.yml index b3a46303e49..9317cb36729 100644 --- a/docs/toc_en.yml +++ b/docs/toc_en.yml @@ -32,35 +32,6 @@ nav: - 'Visual Interfaces': 'interfaces/third-party/gui.md' - 'Proxies': 'interfaces/third-party/proxy.md' -- 'Data Types': - - 'Introduction': 'data_types/index.md' - - 'UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64': 'data_types/int_uint.md' - - 'Float32, Float64': 'data_types/float.md' - - 'Decimal': 'data_types/decimal.md' - - 'Boolean': 'data_types/boolean.md' - - 'String': 'data_types/string.md' - - 'FixedString(N)': 'data_types/fixedstring.md' - - 'UUID': 'data_types/uuid.md' - - 'Date': 'data_types/date.md' - - 'DateTime': 'data_types/datetime.md' - - 'Enum': 'data_types/enum.md' - - 'Array(T)': 'data_types/array.md' - - 'AggregateFunction(name, types_of_arguments...)': 'data_types/nested_data_structures/aggregatefunction.md' - - 'Tuple(T1, T2, ...)': 'data_types/tuple.md' - - 'Nullable': 'data_types/nullable.md' - - 'Nested Data Structures': - - 'hidden': 'data_types/nested_data_structures/index.md' - - 'Nested(Name1 Type1, Name2 Type2, ...)': 'data_types/nested_data_structures/nested.md' - - 'Special Data Types': - - 'hidden': 'data_types/special_data_types/index.md' - - 'Expression': 'data_types/special_data_types/expression.md' - - 'Set': 'data_types/special_data_types/set.md' - - 'Nothing': 'data_types/special_data_types/nothing.md' - - 'Domains': - - 'Overview': 'data_types/domains/overview.md' - - 'IPv4': 'data_types/domains/ipv4.md' - - 'IPv6': 'data_types/domains/ipv6.md' - - 'Database Engines': - 'Introduction': 'database_engines/index.md' - 'MySQL': 'database_engines/mysql.md' @@ -105,12 +76,15 @@ nav: - 'SQL Reference': - 'hidden': 'query_language/index.md' - - 'SELECT': 'query_language/select.md' - - 'INSERT INTO': 'query_language/insert_into.md' - - 'CREATE': 'query_language/create.md' - - 'ALTER': 'query_language/alter.md' - - 'SYSTEM': 'query_language/system.md' - - 'Other Kinds of Queries': 'query_language/misc.md' + - 'Syntax': 'query_language/syntax.md' + - 'Statements': + - 'SELECT': 'query_language/select.md' + - 'INSERT INTO': 'query_language/insert_into.md' + - 'CREATE': 'query_language/create.md' + - 'ALTER': 'query_language/alter.md' + - 'SYSTEM': 'query_language/system.md' + - 'SHOW': 'query_language/show.md' + - 'Other': 'query_language/misc.md' - 'Functions': - 'Introduction': 'query_language/functions/index.md' - 'Arithmetic': 'query_language/functions/arithmetic_functions.md' @@ -172,7 +146,34 @@ nav: - 'Dictionary Key and Fields': 'query_language/dicts/external_dicts_dict_structure.md' - 'Internal Dictionaries': 'query_language/dicts/internal_dicts.md' - 'Operators': 'query_language/operators.md' - - 'General Syntax': 'query_language/syntax.md' + - 'Data Types': + - 'Introduction': 'data_types/index.md' + - 'UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64': 'data_types/int_uint.md' + - 'Float32, Float64': 'data_types/float.md' + - 'Decimal': 'data_types/decimal.md' + - 'Boolean': 'data_types/boolean.md' + - 'String': 'data_types/string.md' + 
- 'FixedString(N)': 'data_types/fixedstring.md' + - 'UUID': 'data_types/uuid.md' + - 'Date': 'data_types/date.md' + - 'DateTime': 'data_types/datetime.md' + - 'Enum': 'data_types/enum.md' + - 'Array(T)': 'data_types/array.md' + - 'AggregateFunction(name, types_of_arguments...)': 'data_types/nested_data_structures/aggregatefunction.md' + - 'Tuple(T1, T2, ...)': 'data_types/tuple.md' + - 'Nullable': 'data_types/nullable.md' + - 'Nested Data Structures': + - 'hidden': 'data_types/nested_data_structures/index.md' + - 'Nested(Name1 Type1, Name2 Type2, ...)': 'data_types/nested_data_structures/nested.md' + - 'Special Data Types': + - 'hidden': 'data_types/special_data_types/index.md' + - 'Expression': 'data_types/special_data_types/expression.md' + - 'Set': 'data_types/special_data_types/set.md' + - 'Nothing': 'data_types/special_data_types/nothing.md' + - 'Domains': + - 'Overview': 'data_types/domains/overview.md' + - 'IPv4': 'data_types/domains/ipv4.md' + - 'IPv6': 'data_types/domains/ipv6.md' - 'Operations': - 'Introduction': 'operations/index.md' @@ -202,9 +203,6 @@ nav: - 'clickhouse-copier': 'operations/utils/clickhouse-copier.md' - 'clickhouse-local': 'operations/utils/clickhouse-local.md' -- 'F.A.Q.': - - 'General Questions': 'faq/general.md' - - 'Development': - 'hidden': 'development/index.md' - 'Overview of ClickHouse Architecture': 'development/architecture.md' @@ -219,3 +217,6 @@ nav: - 'Roadmap': 'roadmap.md' - 'Changelog': 'changelog.md' - 'Security Changelog': 'security_changelog.md' + +- 'F.A.Q.': + - 'General Questions': 'faq/general.md' diff --git a/docs/toc_fa.yml b/docs/toc_fa.yml index b35ead655f7..fb412f45c9d 100644 --- a/docs/toc_fa.yml +++ b/docs/toc_fa.yml @@ -110,6 +110,7 @@ nav: - 'CREATE': 'query_language/create.md' - 'ALTER': 'query_language/alter.md' - 'SYSTEM': 'query_language/system.md' + - 'SYSTEM': 'query_language/show.md' - 'Other Kinds of Queries': 'query_language/misc.md' - 'Functions': - 'Introduction': 'query_language/functions/index.md' diff --git a/docs/toc_zh.yml b/docs/toc_zh.yml index f90ace045d6..8a40a1fb133 100644 --- a/docs/toc_zh.yml +++ b/docs/toc_zh.yml @@ -109,6 +109,7 @@ nav: - 'CREATE': 'query_language/create.md' - 'ALTER': 'query_language/alter.md' - 'SYSTEM': 'query_language/system.md' + - 'SHOW': 'query_language/show.md' - '其他类型的查询': 'query_language/misc.md' - '函数': - '介绍': 'query_language/functions/index.md' diff --git a/docs/zh/query_language/show.md b/docs/zh/query_language/show.md new file mode 120000 index 00000000000..4c2f4cf2c4f --- /dev/null +++ b/docs/zh/query_language/show.md @@ -0,0 +1 @@ +../../en/query_language/show.md \ No newline at end of file From e97571d5212be441bb13b692a8a49a0581fde5ef Mon Sep 17 00:00:00 2001 From: Denis Zhuravlev Date: Fri, 27 Sep 2019 15:05:11 -0300 Subject: [PATCH 297/309] Update array_functions.md fix arrayDistinct description --- docs/en/query_language/functions/array_functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/query_language/functions/array_functions.md b/docs/en/query_language/functions/array_functions.md index bea2c0a6ec6..d6b589535a1 100644 --- a/docs/en/query_language/functions/array_functions.md +++ b/docs/en/query_language/functions/array_functions.md @@ -665,7 +665,7 @@ SELECT arrayDifference([1, 2, 3, 4]) ## arrayDistinct(arr) -Takes an array, returns an array containing the different elements in all the arrays. For example: +Takes an array, returns an array containing the distinct elements. 
For example: ```sql SELECT arrayDistinct([1, 2, 2, 3, 1]) From c20604bd5e90ce5a580a1c4b7396d0643cb5d2a1 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sat, 28 Sep 2019 01:08:17 +0300 Subject: [PATCH 298/309] Update CHANGELOG.md --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b03364e2d3..e0cdb774a89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -103,7 +103,6 @@ * Avoid possible deadlock in `TRUNCATE` of Replicated table. [#6695](https://github.com/ClickHouse/ClickHouse/pull/6695) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Fix reading in order of sorting key. [#6189](https://github.com/ClickHouse/ClickHouse/pull/6189) ([Anton Popov](https://github.com/CurtizJ)) * Fix `ALTER TABLE ... UPDATE` query for tables with `enable_mixed_granularity_parts=1`. [#6543](https://github.com/ClickHouse/ClickHouse/pull/6543) ([alesapin](https://github.com/alesapin)) -* Fixed the case when server may close listening sockets but not shutdown and continue serving remaining queries. You may end up with two running clickhouse-server processes. Sometimes, the server may return an error `bad_function_call` for remaining queries. [#6231](https://github.com/ClickHouse/ClickHouse/pull/6231) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Fix bug opened by [#4405](https://github.com/ClickHouse/ClickHouse/pull/4405) (since 19.4.0). Reproduces in queries to Distributed tables over MergeTree tables when we doesn't query any columns (`SELECT 1`). [#6236](https://github.com/ClickHouse/ClickHouse/pull/6236) ([alesapin](https://github.com/alesapin)) * Fixed overflow in integer division of signed type to unsigned type. The behaviour was exactly as in C or C++ language (integer promotion rules) that may be surprising. Please note that the overflow is still possible when dividing large signed number to large unsigned number or vice-versa (but that case is less usual). The issue existed in all server versions. [#6214](https://github.com/ClickHouse/ClickHouse/issues/6214) [#6233](https://github.com/ClickHouse/ClickHouse/pull/6233) ([alexey-milovidov](https://github.com/alexey-milovidov)) * Limit maximum sleep time for throttling when `max_execution_speed` or `max_execution_speed_bytes` is set. Fixed false errors like `Estimated query execution time (inf seconds) is too long`. [#5547](https://github.com/ClickHouse/ClickHouse/issues/5547) [#6232](https://github.com/ClickHouse/ClickHouse/pull/6232) ([alexey-milovidov](https://github.com/alexey-milovidov)) From c6b18c9fe8ff2564203e276252a7c2b302d00ae9 Mon Sep 17 00:00:00 2001 From: Yuriy Date: Sat, 28 Sep 2019 03:27:53 +0300 Subject: [PATCH 299/309] disabled auth_gssapi_client plugin --- contrib/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 38f2c96b0f9..06c33fb7e74 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -129,6 +129,7 @@ if (ENABLE_MYSQL AND USE_INTERNAL_MYSQL_LIBRARY) set(CLIENT_PLUGIN_SHA256_PASSWORD STATIC) set(CLIENT_PLUGIN_REMOTE_IO OFF) set(CLIENT_PLUGIN_DIALOG OFF) + set(CLIENT_PLUGIN_AUTH_GSSAPI_CLIENT OFF) set(CLIENT_PLUGIN_CLIENT_ED25519 OFF) set(CLIENT_PLUGIN_MYSQL_CLEAR_PASSWORD OFF) set(SKIP_TESTS 1) From 774e255154f1779bc27a6671162b3ce2a02a0409 Mon Sep 17 00:00:00 2001 From: Denis Zhuravlev Date: Sat, 28 Sep 2019 03:39:25 -0300 Subject: [PATCH 300/309] Doc change. 
translation to Russian some array functions (#7139) --- .../functions/array_functions.md | 145 +++++++++++++++++- 1 file changed, 143 insertions(+), 2 deletions(-) diff --git a/docs/ru/query_language/functions/array_functions.md b/docs/ru/query_language/functions/array_functions.md index 19e3bb965c5..680055fa816 100644 --- a/docs/ru/query_language/functions/array_functions.md +++ b/docs/ru/query_language/functions/array_functions.md @@ -647,15 +647,156 @@ SELECT arrayReverseSort((x, y) -> -y, [4, 3, 5], [1, 2, 3]) AS res; └─────────┘ ``` -## arrayUniq(arr, ...) +## arrayUniq(arr, ...) {#array_functions-arrayuniq} Если передан один аргумент, считает количество разных элементов в массиве. Если передано несколько аргументов, считает количество разных кортежей из элементов на соответствующих позициях в нескольких массивах. Если необходимо получить список уникальных элементов массива, можно воспользоваться arrayReduce('groupUniqArray', arr). -## arrayJoin(arr) +## arrayJoin(arr) {#array_functions-arrayjoin} Особенная функция. Смотрите раздел ["Функция arrayJoin"](array_join.md#functions_arrayjoin). +## arrayDifference(arr) {#array_functions-arraydifference} + +Принимает массив, возвращает массив разностей между соседними элементами. Первым элементом будет 0, вторым разность между вторым и первым элементами исходного массива, и т.д. +Результирующий массив имеет тип Array(Int64) для целых чисел и Array(Float64) для чисел с плавающей точкой. Другие типы, в том числе Decimal, не поддерживаются. + +Пример: + +```sql +SELECT arrayDifference([1, 2, 3, 4]) +``` + +```text +┌─arrayDifference([1, 2, 3, 4])─┐ +│ [0,1,1,1] │ +└───────────────────────────────┘ +``` + +Пример переполнения из-за результирующего типа Int64: + +```sql +SELECT arrayDifference([0, 10000000000000000000]) +``` + +```text +┌─arrayDifference([0, 10000000000000000000])─┐ +│ [0,-8446744073709551616] │ +└────────────────────────────────────────────┘ +``` + +## arrayDistinct(arr) {#array_functions-arraydistinct} + +Принимает массив, возвращает массив, содержащий уникальные элементы. + +Пример: + +```sql +SELECT arrayDistinct([1, 2, 2, 3, 1]) +``` + +```text +┌─arrayDistinct([1, 2, 2, 3, 1])─┐ +│ [1,2,3] │ +└────────────────────────────────┘ +``` + +## arrayEnumerateDense(arr) {#array_functions-arrayenumeratedense} + +Возвращает массив того же размера, что и исходный массив, с индексами исходного массива, указывающими, где каждый элемент впервые появляется в исходном массиве. + +Пример: + +```sql +SELECT arrayEnumerateDense([10, 20, 10, 30]) +``` + +```text +┌─arrayEnumerateDense([10, 20, 10, 30])─┐ +│ [1,2,1,3] │ +└───────────────────────────────────────┘ +``` + +## arrayIntersect(arr) {#array_functions-arrayintersect} + +Принимает несколько массивов, возвращает массив с элементами, присутствующими во всех исходных массивах. Элементы на выходе следуют в порядке следования в первом массиве. + +Пример: + +```sql +SELECT + arrayIntersect([1, 2], [1, 3], [2, 3]) AS no_intersect, + arrayIntersect([1, 2], [1, 3], [1, 4]) AS intersect +``` + +```text +┌─no_intersect─┬─intersect─┐ +│ [] │ [1] │ +└──────────────┴───────────┘ +``` + +## arrayReduce(agg_func, arr1, ...) {#array_functions-arrayreduce} + +Применяет агрегатную функцию к элементам массива и возвращает ее результат. Имя агрегирующей функции передается как строка в одинарных кавычках `'max'`, `'sum'`. При использовании параметрических агрегатных функций, параметр указывается после имени функции в круглых скобках `'uniqUpTo(6)'`. 
+ +Пример: + +```sql +SELECT arrayReduce('max', [1, 2, 3]) +``` + +```text +┌─arrayReduce('max', [1, 2, 3])─┐ +│ 3 │ +└───────────────────────────────┘ +``` + +Если агрегатная функция имеет несколько аргументов, то эту функцию можно применять к нескольким массивам одинакового размера. + +Пример: + +```sql +SELECT arrayReduce('maxIf', [3, 5], [1, 0]) +``` + +```text +┌─arrayReduce('maxIf', [3, 5], [1, 0])─┐ +│ 3 │ +└──────────────────────────────────────┘ +``` + +Пример с параметрической агрегатной функцией: + +```sql +SELECT arrayReduce('uniqUpTo(3)', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) +``` + +```text +┌─arrayReduce('uniqUpTo(3)', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])─┐ +│ 4 │ +└─────────────────────────────────────────────────────────────┘ +``` + +## arrayReverse(arr) {#array_functions-arrayreverse} + +Возвращает массив того же размера, что и исходный массив, содержащий элементы в обратном порядке. + +Пример: +```sql +SELECT arrayReverse([1, 2, 3]) +``` + +```text +┌─arrayReverse([1, 2, 3])─┐ +│ [3,2,1] │ +└─────────────────────────┘ +``` + +# reverse(arr) {#array_functions-reverse} + +Синоним для ["arrayReverse"](#array_functions-arrayreverse) + + [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/array_functions/) From fa31743ace531bcdc817c2396128224964479634 Mon Sep 17 00:00:00 2001 From: BayoNet Date: Sat, 28 Sep 2019 17:55:50 +0300 Subject: [PATCH 301/309] DOCAPI-8259: EN review, RU translation. toInt, toUint docs refactoring and update (#7043) * Typo fix. * Update type_conversion_functions.md (#48) * DOCAPI-8259: RU translation * DOCAPI-8259: EN. Numeric conversion issues. * DOCAPI-8259: Clarification. --- .../functions/type_conversion_functions.md | 43 ++++--- .../functions/type_conversion_functions.md | 114 ++++++++++++++++-- 2 files changed, 126 insertions(+), 31 deletions(-) diff --git a/docs/en/query_language/functions/type_conversion_functions.md b/docs/en/query_language/functions/type_conversion_functions.md index 9245ec00120..f37130ecf33 100644 --- a/docs/en/query_language/functions/type_conversion_functions.md +++ b/docs/en/query_language/functions/type_conversion_functions.md @@ -1,26 +1,31 @@ - # Type Conversion Functions +## Common Issues of Numeric Conversions {#numeric-conversion-issues} + +When you convert a value from one to another data type, you should remember that in common case, it is an unsafe operation that can lead to a data loss. A data loss can occur if you try to fit value from a larger data type to a smaller data type, or if you convert values between different data types. + +ClickHouse has the [same behavior as C++ programs](https://en.cppreference.com/w/cpp/language/implicit_conversion). + ## toInt(8|16|32|64) -Converts an input value to the [Int](../../data_types/int_uint.md) data type. This functions family includes: +Converts an input value to the [Int](../../data_types/int_uint.md) data type. This function family includes: -* `toInt8(expr)` — Results in `Int8` data type. -* `toInt16(expr)` — Results in `Int16` data type. -* `toInt32(expr)` — Results in `Int32` data type. -* `toInt64(expr)` — Results in `Int64` data type. +* `toInt8(expr)` — Results in the `Int8` data type. +* `toInt16(expr)` — Results in the `Int16` data type. +* `toInt32(expr)` — Results in the `Int32` data type. +* `toInt64(expr)` — Results in the `Int64` data type. **Parameters** -- `expr` — [Expression](../syntax.md#syntax-expressions) returning a number or a string with decimal representation of a number. 
Binary, octal, and hexadecimal representations of numbers are not supported. Leading zeroes are stripped. +- `expr` — [Expression](../syntax.md#syntax-expressions) returning a number or a string with the decimal representation of a number. Binary, octal, and hexadecimal representations of numbers are not supported. Leading zeroes are stripped. **Returned value** -Integer value in `Int8`, `Int16`, `Int32` or `Int64` data type. +Integer value in the `Int8`, `Int16`, `Int32`, or `Int64` data type. -Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), they truncate fraction digits of numbers. +Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), meaning they truncate fractional digits of numbers. -The behaviour of functions for the [NaN and Inf](../../data_types/float.md#data_type-float-nan-inf) arguments is undefined. +The behavior of functions for the [NaN and Inf](../../data_types/float.md#data_type-float-nan-inf) arguments is undefined. Remember about [numeric convertions issues](#numeric-conversion-issues), when using the functions. **Example** @@ -39,24 +44,24 @@ SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) ## toUInt(8|16|32|64) -Converts an input value to the [UInt](../../data_types/int_uint.md) data type. This functions family includes: +Converts an input value to the [UInt](../../data_types/int_uint.md) data type. This function family includes: -* `toUInt8(expr)` — Results in `UInt8` data type. -* `toUInt16(expr)` — Results in `UInt16` data type. -* `toUInt32(expr)` — Results in `UInt32` data type. -* `toUInt64(expr)` — Results in `UInt64` data type. +* `toUInt8(expr)` — Results in the `UInt8` data type. +* `toUInt16(expr)` — Results in the `UInt16` data type. +* `toUInt32(expr)` — Results in the `UInt32` data type. +* `toUInt64(expr)` — Results in the `UInt64` data type. **Parameters** -- `expr` — [Expression](../syntax.md#syntax-expressions) returning a number or a string with decimal representation of a number. Binary, octal, and hexadecimal representations of numbers are not supported. Leading zeroes are stripped. +- `expr` — [Expression](../syntax.md#syntax-expressions) returning a number or a string with the decimal representation of a number. Binary, octal, and hexadecimal representations of numbers are not supported. Leading zeroes are stripped. **Returned value** -Integer value in `UInt8`, `UInt16`, `UInt32` or `UInt64` data type. +Integer value in the `UInt8`, `UInt16`, `UInt32`, or `UInt64` data type. -Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), they truncate fraction digits of numbers. +Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), meaning they truncate fractional digits of numbers. -The behaviour of functions for negative agruments and for the [NaN and Inf](../../data_types/float.md#data_type-float-nan-inf) arguments is undefined. If you pass the string with negative number, for example `'-32'`, ClickHouse rises an exception. +The behavior of functions for negative agruments and for the [NaN and Inf](../../data_types/float.md#data_type-float-nan-inf) arguments is undefined. If you pass a string with a negative number, for example `'-32'`, ClickHouse raises an exception. Remember about [numeric convertions issues](#numeric-conversion-issues), when using the functions. 
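As an extra, illustrative check of the wrap-around described in [Common Issues of Numeric Conversions](#numeric-conversion-issues): the values below do not fit into the requested types, so they are silently truncated in the C++ manner rather than rejected (for instance, `toUInt8(300)` is expected to return `44`, i.e. `300 mod 256`; this is a sketch, not part of the original patch).

```sql
SELECT toUInt8(300), toInt8(200), toInt16(40000)
```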
**Example** diff --git a/docs/ru/query_language/functions/type_conversion_functions.md b/docs/ru/query_language/functions/type_conversion_functions.md index 72354e77f8c..af02eeae835 100644 --- a/docs/ru/query_language/functions/type_conversion_functions.md +++ b/docs/ru/query_language/functions/type_conversion_functions.md @@ -1,20 +1,108 @@ # Функции преобразования типов -## toUInt8, toUInt16, toUInt32, toUInt64 +## Общие проблемы преобразования чисел {#numeric-conversion-issues} -## toInt8, toInt16, toInt32, toInt64 +При преобразовании значения из одного типа в другой необходимо помнить, что в общем случае это небезопасная операция, которая может привести к потере данных. Потеря данных может произойти при попытке сконвертировать тип данных значения от большего к меньшему или при конвертировании между различными классами типов данных. -## toFloat32, toFloat64 +Поведение ClickHouse при конвертировании похоже на [поведение C++ программ](https://en.cppreference.com/w/cpp/language/implicit_conversion). -## toDate, toDateTime +## toInt(8|16|32|64) -## toUInt8OrZero, toUInt16OrZero, toUInt32OrZero, toUInt64OrZero, toInt8OrZero, toInt16OrZero, toInt32OrZero, toInt64OrZero, toFloat32OrZero, toFloat64OrZero +Преобразует входное значение к типу [Int](../../data_types/int_uint.md). Семейство функций включает: -## toUInt8OrNull, toUInt16OrNull, toUInt32OrNull, toUInt64OrNull, toInt8OrNull, toInt16OrNull, toInt32OrNull, toInt64OrNull, toFloat32OrNull, toFloat64OrNull, toDateOrNull, toDateTimeOrNull +* `toInt8(expr)` — возвращает значение типа `Int8`. +* `toInt16(expr)` — возвращает значение типа `Int16`. +* `toInt32(expr)` — возвращает значение типа `Int32`. +* `toInt64(expr)` — возвращает значение типа `Int64`. -## toDecimal32(value, S), toDecimal64(value, S), toDecimal128(value, S) +**Параметры** -Преобразует тип `value` в тип [Decimal](../../data_types/decimal.md), имеющий точность `S`. `value` может быть числом или строкой. Параметр `S` (scale) устанавливает количество десятичных знаков. +- `expr` — [выражение](../syntax.md#syntax-expressions) возвращающее число или строку с десятичным представление числа. Бинарное, восьмеричное и шестнадцатеричное представление числа не поддержаны. Ведущие нули обрезаются. + +**Возвращаемое значение** + +Целое число типа `Int8`, `Int16`, `Int32` или `Int64`. + +Функции используют [округление к нулю](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), т.е. обрезают дробную часть числа. + +Поведение функций для аргументов [NaN и Inf](../../data_types/float.md#data_type-float-nan-inf) не определено. При использовании функций помните о возможных проблемах при [преобразовании чисел](#numeric-conversion-issues). + +**Пример** + +```sql +SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) +``` +```text +┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ +│ -9223372036854775808 │ 32 │ 16 │ 8 │ +└──────────────────────┴─────────────┴───────────────┴─────────────┘ +``` + +## toInt(8|16|32|64)OrZero + +## toInt(8|16|32|64)OrNull + +## toUInt(8|16|32|64) + +Преобраует входное значение к типу [UInt](../../data_types/int_uint.md). Семейство функций включает: + +* `toUInt8(expr)` — возвращает значение типа `UInt8`. +* `toUInt16(expr)` — возвращает значение типа `UInt16`. +* `toUInt32(expr)` — возвращает значение типа `UInt32`. +* `toUInt64(expr)` — возвращает значение типа `UInt64`. + +**Параметры** + +- `expr` — [выражение](../syntax.md#syntax-expressions) возвращающее число или строку с десятичным представление числа. 
Бинарное, восьмеричное и шестнадцатеричное представление числа не поддержаны. Ведущие нули обрезаются. + +**Возвращаемое значение** + +Целое число типа `UInt8`, `UInt16`, `UInt32` или `UInt64`. + +Функции используют [округление к нулю](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), т.е. обрезают дробную часть числа. + +Поведение функций для аргументов [NaN и Inf](../../data_types/float.md#data_type-float-nan-inf) не определено. Если передать строку, содержащую отрицательное число, например `'-32'`, ClickHouse генерирует исключение. При использовании функций помните о возможных проблемах при [преобразовании чисел](#numeric-conversion-issues). + +**Пример** + +```sql +SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) +``` +```text +┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ +│ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ +└─────────────────────┴───────────────┴────────────────┴──────────────┘ +``` + +## toUInt(8|16|32|64)OrZero + +## toUInt(8|16|32|64)OrNull + +## toFloat(32|64) + +## toFloat(32|64)OrZero + +## toFloat(32|64)OrNull + +## toDate + +## toDateOrZero + +## toDateOrNull + +## toDateTime + +## toDateTimeOrZero + +## toDateTimeOrNull + +## toDecimal(32|64|128) + +Преобразует `value` к типу данных [Decimal](../../data_types/decimal.md) с точностью `S`. `value` может быть числом или строкой. Параметр `S` (scale) задаёт число десятичных знаков. + +- `toDecimal32(value, S)` +- `toDecimal64(value, S)` +- `toDecimal128(value, S)` ## toDecimal(32|64|128)OrNull @@ -174,13 +262,15 @@ SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut └────────────┴───────┘ ``` -## reinterpretAsUInt8, reinterpretAsUInt16, reinterpretAsUInt32, reinterpretAsUInt64 +## reinterpretAsUInt(8|16|32|64) -## reinterpretAsInt8, reinterpretAsInt16, reinterpretAsInt32, reinterpretAsInt64 +## reinterpretAsInt(8|16|32|64) -## reinterpretAsFloat32, reinterpretAsFloat64 +## reinterpretAsFloat(32|64) -## reinterpretAsDate, reinterpretAsDateTime +## reinterpretAsDate + +## reinterpretAsDateTime Функции принимают строку и интерпретируют байты, расположенные в начале строки, как число в host order (little endian). Если строка имеет недостаточную длину, то функции работают так, как будто строка дополнена необходимым количеством нулевых байт. Если строка длиннее, чем нужно, то лишние байты игнорируются. Дата интерпретируется, как число дней с начала unix-эпохи, а дата-с-временем - как число секунд с начала unix-эпохи. From 34b61e8dfff5d06d2dec4e8d68ba021538dae748 Mon Sep 17 00:00:00 2001 From: BayoNet Date: Sat, 28 Sep 2019 17:58:10 +0300 Subject: [PATCH 302/309] DOCAPI-7442: EN review, RU translation for updated system.parts docs (#7041) * Typo fix. * Update system_tables.md (#40) * DOCAPI-7442: RU translation. * DOCAPI-7442: Fix. --- docs/en/operations/system_tables.md | 66 +++++++++---------- docs/ru/operations/system_tables.md | 59 ++++++++++------- .../functions/hash_functions.md | 2 +- 3 files changed, 69 insertions(+), 58 deletions(-) diff --git a/docs/en/operations/system_tables.md b/docs/en/operations/system_tables.md index 36008cffdc6..47bbf0266ac 100644 --- a/docs/en/operations/system_tables.md +++ b/docs/en/operations/system_tables.md @@ -311,45 +311,45 @@ Columns: - `YYYYMM` for automatic partitioning by month. - `any_string` when partitioning manually. -- `name` (String) – Name of the data part. -- `active` (UInt8) – Flag that indicates whether the part is active. 
If a part is active, it is used in a table; otherwise, it will be deleted. Inactive data parts remain after merging. -- `marks` (UInt64) – The number of marks. To get the approximate number of rows in a data part, multiply `marks` by the index granularity (usually 8192) (this hint doesn't work for adaptive granularity). -- `rows` (UInt64) – The number of rows. -- `bytes_on_disk` (UInt64) – Total size of all the data part files in bytes. -- `data_compressed_bytes` (UInt64) – Total size of compressed data in the data part. All the auxiliary files (for example, files with marks) are not included. -- `data_uncompressed_bytes` (UInt64) – Total size of uncompressed data in the data part. All the auxiliary files (for example, files with marks) are not included. -- `marks_bytes` (UInt64) – The size of the file with marks. -- `modification_time` (DateTime) – The modification time of the directory with the data part. This usually corresponds to the time of data part creation.| -- `remove_time` (DateTime) – The time when the data part became inactive. -- `refcount` (UInt32) – The number of places where the data part is used. A value greater than 2 indicates that the data part is used in queries or merges. -- `min_date` (Date) – The minimum value of the date key in the data part. -- `max_date` (Date) – The maximum value of the date key in the data part. -- `min_time` (DateTime) – The minimum value of the date and time key in the data part. +- `name` (`String`) – Name of the data part. +- `active` (`UInt8`) – Flag that indicates whether the data part is active. If a data part is active, it's used in a table. Otherwise, it's deleted. Inactive data parts remain after merging. +- `marks` (`UInt64`) – The number of marks. To get the approximate number of rows in a data part, multiply `marks` by the index granularity (usually 8192) (this hint doesn't work for adaptive granularity). +- `rows` (`UInt64`) – The number of rows. +- `bytes_on_disk` (`UInt64`) – Total size of all the data part files in bytes. +- `data_compressed_bytes` (`UInt64`) – Total size of compressed data in the data part. All the auxiliary files (for example, files with marks) are not included. +- `data_uncompressed_bytes` (`UInt64`) – Total size of uncompressed data in the data part. All the auxiliary files (for example, files with marks) are not included. +- `marks_bytes` (`UInt64`) – The size of the file with marks. +- `modification_time` (`DateTime`) – The time the directory with the data part was modified. This usually corresponds to the time of data part creation.| +- `remove_time` (`DateTime`) – The time when the data part became inactive. +- `refcount` (`UInt32`) – The number of places where the data part is used. A value greater than 2 indicates that the data part is used in queries or merges. +- `min_date` (`Date`) – The minimum value of the date key in the data part. +- `max_date` (`Date`) – The maximum value of the date key in the data part. +- `min_time` (`DateTime`) – The minimum value of the date and time key in the data part. - `max_time`(`DateTime`) – The maximum value of the date and time key in the data part. -- `partition_id` (String) – Id of the partition. -- `min_block_number` (UInt64) – The minimum number of data parts that make up the current part after merging. -- `max_block_number` (UInt64) – The maximum number of data parts that make up the current part after merging. -- `level` (UInt32) – Depth of the merge tree. Zero means that current part was created by insert rather than by merging other parts. 
-- `data_version` (UInt64) – Number that is used to determine which mutations should be applied to the data part (the mutations with the higher version than `data_version`). -- `primary_key_bytes_in_memory` (UInt64) – The amount of memory (in bytes) used by primary key values. -- `primary_key_bytes_in_memory_allocated` (UInt64) – The amount of memory (in bytes) reserved for primary key values. -- `is_frozen` (UInt8) – Flag that shows partition data backup existence. 1, the backup exists. 0, the backup doesn't exist. For more details, see [FREEZE PARTITION](../query_language/alter.md#alter_freeze-partition) -- `database` (String) – Name of the database. -- `table` (String) – Name of the table. -- `engine` (String) – Name of the table engine without parameters. -- `path` (String) – Absolute path to the folder with data part files. -- `hash_of_all_files` (String) – [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) of compressed files. -- `hash_of_uncompressed_files` (String) – [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) of uncompressed data. -- `uncompressed_hash_of_compressed_files` (String) – [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) of the file with marks. -- `bytes` (UInt64) – Alias for `bytes_on_disk`. -- `marks_size` (UInt64) – Alias for `marks_bytes`. +- `partition_id` (`String`) – ID of the partition. +- `min_block_number` (`UInt64`) – The minimum number of data parts that make up the current part after merging. +- `max_block_number` (`UInt64`) – The maximum number of data parts that make up the current part after merging. +- `level` (`UInt32`) – Depth of the merge tree. Zero means that the current part was created by insert rather than by merging other parts. +- `data_version` (`UInt64`) – Number that is used to determine which mutations should be applied to the data part (mutations with a version higher than `data_version`). +- `primary_key_bytes_in_memory` (`UInt64`) – The amount of memory (in bytes) used by primary key values. +- `primary_key_bytes_in_memory_allocated` (`UInt64`) – The amount of memory (in bytes) reserved for primary key values. +- `is_frozen` (`UInt8`) – Flag that shows that a partition data backup exists. 1, the backup exists. 0, the backup doesn't exist. For more details, see [FREEZE PARTITION](../query_language/alter.md#alter_freeze-partition) +- `database` (`String`) – Name of the database. +- `table` (`String`) – Name of the table. +- `engine` (`String`) – Name of the table engine without parameters. +- `path` (`String`) – Absolute path to the folder with data part files. +- `hash_of_all_files` (`String`) – [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) of compressed files. +- `hash_of_uncompressed_files` (`String`) – [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) of uncompressed files (files with marks, index file etc.). +- `uncompressed_hash_of_compressed_files` (`String`) – [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) of data in the compressed files as if they were uncompressed. +- `bytes` (`UInt64`) – Alias for `bytes_on_disk`. +- `marks_size` (`UInt64`) – Alias for `marks_bytes`. ## system.part_log {#system_tables-part-log} The `system.part_log` table is created only if the [part_log](server_settings/settings.md#server_settings-part-log) server setting is specified. 
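When the setting is enabled, the most recent part events can be inspected with a query along these lines (an illustrative snippet added here for clarity; the column names `event_type`, `database`, `table`, `part_name` and `event_time` are assumed from the column list that follows):

```sql
SELECT event_type, database, table, part_name, event_time
FROM system.part_log
ORDER BY event_time DESC
LIMIT 10
```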
-This table contains information about the events that occurred with the [data parts](table_engines/custom_partitioning_key.md) in the [MergeTree](table_engines/mergetree.md) family tables. For instance, adding or merging data. +This table contains information about events that occurred with [data parts](table_engines/custom_partitioning_key.md) in the [MergeTree](table_engines/mergetree.md) family tables, such as adding or merging data. The `system.part_log` table contains the following columns: @@ -429,7 +429,7 @@ Columns: - `query` (String) — Query string. - `exception` (String) — Exception message. - `stack_trace` (String) — Stack trace (a list of methods called before the error occurred). An empty string, if the query is completed successfully. -- `is_initial_query` (UInt8) — Kind of query. Possible values: +- `is_initial_query` (UInt8) — Query type. Possible values: - 1 — Query was initiated by the client. - 0 — Query was initiated by another query for distributed query execution. - `user` (String) — Name of the user who initiated the current query. diff --git a/docs/ru/operations/system_tables.md b/docs/ru/operations/system_tables.md index 3aa77776c44..ee50dfddce9 100644 --- a/docs/ru/operations/system_tables.md +++ b/docs/ru/operations/system_tables.md @@ -288,41 +288,52 @@ SELECT * FROM system.metrics LIMIT 10 ## system.parts {#system_tables-parts} -Содержит информацию о кусках таблиц семейства [MergeTree](table_engines/mergetree.md). +Содержит информацию о кусках данных таблиц семейства [MergeTree](table_engines/mergetree.md). Каждая строка описывает один кусок данных. Столбцы: -- partition (String) - Имя партиции. Что такое партиция можно узнать из описания запроса [ALTER](../query_language/alter.md#query_language_queries_alter). +- `partition` (`String`) – Имя партиции. Что такое партиция можно узнать из описания запроса [ALTER](../query_language/alter.md#query_language_queries_alter). Форматы: - `YYYYMM` для автоматической схемы партиционирования по месяцам. - `any_string` при партиционировании вручную. -- `name` (String) - имя куска; -- `active` (UInt8) - признак активности. Если кусок активен, то он используется таблицей, в противном случает он будет удален. Неактивные куски остаются после слияний; -- `marks` (UInt64) - количество засечек. Чтобы получить примерное количество строк в куске, умножьте `marks` на гранулированность индекса (обычно 8192); -- `marks_size` (UInt64) - размер файла с засечками; -- `rows` (UInt64) - количество строк; -- `bytes` (UInt64) - количество байт в сжатом виде; -- `modification_time` (DateTime) - время модификации директории с куском. Обычно соответствует времени создания куска; -- `remove_time` (DateTime) - время, когда кусок стал неактивным; -- `refcount` (UInt32) - количество мест, в котором кусок используется. Значение больше 2 говорит о том, что кусок участвует в запросах или в слияниях; -- `min_date` (Date) - минимальное значение ключа даты в куске; -- `max_date` (Date) - максимальное значение ключа даты в куске; -- `min_block_number` (UInt64) - минимальное число кусков, из которых состоит текущий после слияния; -- `max_block_number` (UInt64) - максимальное число кусков, из которых состоит текущий после слияния; -- `level` (UInt32) - глубина дерева слияний. 
Если слияний не было, то `level=0`; -- `primary_key_bytes_in_memory` (UInt64) - объем памяти (в байтах), занимаемой значениями первичных ключей; -- `primary_key_bytes_in_memory_allocated` (UInt64) - выделенный с резервом объем памяти (в байтах) для размещения первичных ключей; -- `database (String)` - имя базы данных; -- `table (String)` - имя таблицы; -- `engine (String)` - имя движка таблицы, без параметров; -- `path (String)` - путь к куску на диске; -- `disk (String)` - имя диска, на котором находится кусок; -- `is_frozen (UInt8)` – Признак, показывающий существование бэкапа партиции. 1, бэкап есть. 0, бэкапа нет. Смотрите раздел [FREEZE PARTITION](../query_language/alter.md#alter_freeze-partition) +- `name` (`String`) – имя куска. +- `active` (`UInt8`) – признак активности. Если кусок активен, то он используется таблицей, в противном случает он будет удален. Неактивные куски остаются после слияний. +- `marks` (`UInt64`) – количество засечек. Чтобы получить примерное количество строк в куске, умножьте `marks` на гранулированность индекса (обычно 8192). +- `rows` (`UInt64`) – количество строк. +- `bytes_on_disk` (`UInt64`) – общий размер всех файлов кусков данных в байтах. +- `data_compressed_bytes` (`UInt64`) – общий размер сжатой информации в куске данных. Размер всех дополнительных файлов (например, файлов с засечками) не учитывается. +- `data_uncompressed_bytes` (`UInt64`) – общий размер распакованной информации куска данных. Размер всех дополнительных файлов (например, файлов с засечками) не учитывается. +- `marks_bytes` (`UInt64`) – размер файла с засечками. +- `modification_time` (`DateTime`) – время модификации директории с куском данных. Обычно соответствует времени создания куска. +- `remove_time` (`DateTime`) – время, когда кусок стал неактивным. +- `refcount` (`UInt32`) – количество мест, в котором кусок используется. Значение больше 2 говорит о том, что кусок участвует в запросах или в слияниях. +- `min_date` (`Date`) – минимальное значение ключа даты в куске данных. +- `max_date` (`Date`) – максимальное значение ключа даты в куске данных. +- `min_time` (`DateTime`) – минимальное значение даты и времени в куске данных. +- `max_time`(`DateTime`) – максимальное значение даты и времени в куске данных. +- `partition_id` (`String`) – ID партиции. +- `min_block_number` (`UInt64`) – минимальное число кусков, из которых состоит текущий после слияния. +- `max_block_number` (`UInt64`) – максимальное число кусков, из которых состоит текущий после слияния. +- `level` (`UInt32`) - глубина дерева слияний. Если слияний не было, то `level=0`. +- `data_version` (`UInt64`) – число, которое используется для определения того, какие мутации необходимо применить к куску данных (мутации с версией большей, чем `data_version`). +- `primary_key_bytes_in_memory` (`UInt64`) – объем памяти (в байтах), занимаемой значениями первичных ключей. +- `primary_key_bytes_in_memory_allocated` (`UInt64`) – объем памяти (в байтах) выделенный для размещения первичных ключей. +- `is_frozen` (`UInt8`) – Признак, показывающий существование бэкапа партиции. 1, бэкап есть. 0, бэкапа нет. Смотрите раздел [FREEZE PARTITION](../query_language/alter.md#alter_freeze-partition). +- `database` (`String`) – имя базы данных. +- `table` (`String`) – имя таблицы. +- `engine` (`String`) – имя движка таблицы, без параметров. +- `path` (`String`) – абсолютный путь к папке с файлами кусков данных.. 
+- `hash_of_all_files` (`String`) – значение [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) для сжатых файлов. +- `hash_of_uncompressed_files` (`String`) – значение [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) несжатых файлов (файлы с засечками, первичным ключом и пр.) +- `uncompressed_hash_of_compressed_files` (`String`) – значение [sipHash128](../query_language/functions/hash_functions.md#hash_functions-siphash128) данных в сжатых файлах как если бы они были разжатыми. +- `bytes` (`UInt64`) – алиас для `bytes_on_disk`. +- `marks_size` (`UInt64`) – алиас для `marks_bytes`. + ## system.part_log {#system_tables-part-log} diff --git a/docs/ru/query_language/functions/hash_functions.md b/docs/ru/query_language/functions/hash_functions.md index 44793b0e290..e66cee3b344 100644 --- a/docs/ru/query_language/functions/hash_functions.md +++ b/docs/ru/query_language/functions/hash_functions.md @@ -76,7 +76,7 @@ SELECT sipHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00 └──────────────────────┴────────┘ ``` -## sipHash128 +## sipHash128 {#hash_functions-siphash128} Вычисляет SipHash от строки. Принимает аргумент типа String. Возвращает FixedString(16). From 0a686b1e8684f23511e82f630863b629c2fe12eb Mon Sep 17 00:00:00 2001 From: Denis Zhuravlev Date: Sat, 28 Sep 2019 18:52:52 -0300 Subject: [PATCH 303/309] Update array_functions.md Fix arrayDifference description. --- docs/ru/query_language/functions/array_functions.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/ru/query_language/functions/array_functions.md b/docs/ru/query_language/functions/array_functions.md index 680055fa816..93c75ac3525 100644 --- a/docs/ru/query_language/functions/array_functions.md +++ b/docs/ru/query_language/functions/array_functions.md @@ -660,8 +660,7 @@ SELECT arrayReverseSort((x, y) -> -y, [4, 3, 5], [1, 2, 3]) AS res; ## arrayDifference(arr) {#array_functions-arraydifference} -Принимает массив, возвращает массив разностей между соседними элементами. Первым элементом будет 0, вторым разность между вторым и первым элементами исходного массива, и т.д. -Результирующий массив имеет тип Array(Int64) для целых чисел и Array(Float64) для чисел с плавающей точкой. Другие типы, в том числе Decimal, не поддерживаются. +Принимает массив, возвращает массив разностей между соседними элементами. Первым элементом будет 0, вторым разность между вторым и первым элементами исходного массива, и т.д. Тип элементов результирующего массива определяется правилами выведения типов при вычитании (напр. UInt8 - UInt8 = Int16). Поддерживаются UInt*/Int*/Float* типы (тип Decimal не поддерживается). 
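Правило выведения типа, описанное выше, можно проверить, например, таким иллюстративным запросом (не входит в исходный патч; для элементов `UInt8` ожидается результат `Array(Int16)`):

```sql
SELECT toTypeName(arrayDifference([toUInt8(1), toUInt8(2)]))
```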
Пример: @@ -794,7 +793,7 @@ SELECT arrayReverse([1, 2, 3]) └─────────────────────────┘ ``` -# reverse(arr) {#array_functions-reverse} +## reverse(arr) {#array_functions-reverse} Синоним для ["arrayReverse"](#array_functions-arrayreverse) From dbc352fdf96ac9017930175585e215eb1486691d Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Sat, 28 Sep 2019 22:36:56 +0800 Subject: [PATCH 304/309] glibc 2.29 compatibility --- cmake/sanitize.cmake | 4 +- libs/libcommon/src/preciseExp10.c | 47 ++- libs/libglibc-compatibility/musl/README | 2 + .../musl/__math_divzero.c | 6 + .../musl/__math_divzerof.c | 13 +- .../musl/__math_invalid.c | 6 + .../musl/__math_invalidf.c | 2 + .../musl/__math_oflow.c | 6 + .../musl/__math_oflowf.c | 6 + .../musl/__math_uflow.c | 6 + .../musl/__math_uflowf.c | 6 + .../musl/__math_xflow.c | 6 + .../musl/__math_xflowf.c | 6 + libs/libglibc-compatibility/musl/exp.c | 134 +++++++ libs/libglibc-compatibility/musl/exp2.c | 121 ++++++ libs/libglibc-compatibility/musl/exp2f.c | 168 +++------ libs/libglibc-compatibility/musl/exp2f_data.c | 35 ++ libs/libglibc-compatibility/musl/exp2f_data.h | 23 ++ libs/libglibc-compatibility/musl/exp_data.c | 182 ++++++++++ libs/libglibc-compatibility/musl/exp_data.h | 26 ++ libs/libglibc-compatibility/musl/libm.h | 249 +++++++++++++ libs/libglibc-compatibility/musl/log.c | 112 ++++++ libs/libglibc-compatibility/musl/log2.c | 122 +++++++ libs/libglibc-compatibility/musl/log2_data.c | 201 ++++++++++ libs/libglibc-compatibility/musl/log2_data.h | 28 ++ libs/libglibc-compatibility/musl/log_data.c | 328 +++++++++++++++++ libs/libglibc-compatibility/musl/log_data.h | 28 ++ libs/libglibc-compatibility/musl/logf.c | 23 +- libs/libglibc-compatibility/musl/logf_data.h | 4 +- .../musl/musl_features.h | 8 + libs/libglibc-compatibility/musl/pow.c | 343 ++++++++++++++++++ libs/libglibc-compatibility/musl/pow_data.c | 180 +++++++++ libs/libglibc-compatibility/musl/pow_data.h | 22 ++ 33 files changed, 2294 insertions(+), 159 deletions(-) create mode 100644 libs/libglibc-compatibility/musl/__math_divzero.c create mode 100644 libs/libglibc-compatibility/musl/__math_invalid.c create mode 100644 libs/libglibc-compatibility/musl/__math_oflow.c create mode 100644 libs/libglibc-compatibility/musl/__math_oflowf.c create mode 100644 libs/libglibc-compatibility/musl/__math_uflow.c create mode 100644 libs/libglibc-compatibility/musl/__math_uflowf.c create mode 100644 libs/libglibc-compatibility/musl/__math_xflow.c create mode 100644 libs/libglibc-compatibility/musl/__math_xflowf.c create mode 100644 libs/libglibc-compatibility/musl/exp.c create mode 100644 libs/libglibc-compatibility/musl/exp2.c create mode 100644 libs/libglibc-compatibility/musl/exp2f_data.c create mode 100644 libs/libglibc-compatibility/musl/exp2f_data.h create mode 100644 libs/libglibc-compatibility/musl/exp_data.c create mode 100644 libs/libglibc-compatibility/musl/exp_data.h create mode 100644 libs/libglibc-compatibility/musl/libm.h create mode 100644 libs/libglibc-compatibility/musl/log.c create mode 100644 libs/libglibc-compatibility/musl/log2.c create mode 100644 libs/libglibc-compatibility/musl/log2_data.c create mode 100644 libs/libglibc-compatibility/musl/log2_data.h create mode 100644 libs/libglibc-compatibility/musl/log_data.c create mode 100644 libs/libglibc-compatibility/musl/log_data.h create mode 100644 libs/libglibc-compatibility/musl/musl_features.h create mode 100644 libs/libglibc-compatibility/musl/pow.c create mode 100644 libs/libglibc-compatibility/musl/pow_data.c create mode 100644 
libs/libglibc-compatibility/musl/pow_data.h diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index 5063e12fc1f..a5d1d0d3055 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -68,8 +68,8 @@ if (SANITIZE) endif () elseif (SANITIZE STREQUAL "undefined") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} -fsanitize=undefined -fno-sanitize-recover=all") - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} -fsanitize=undefined -fno-sanitize-recover=all") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} -fsanitize=undefined -fno-sanitize-recover=all -fno-sanitize=float-divide-by-zero") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} -fsanitize=undefined -fno-sanitize-recover=all -fno-sanitize=float-divide-by-zero") if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined") endif() diff --git a/libs/libcommon/src/preciseExp10.c b/libs/libcommon/src/preciseExp10.c index 49c508e6fed..d24a7e60241 100644 --- a/libs/libcommon/src/preciseExp10.c +++ b/libs/libcommon/src/preciseExp10.c @@ -174,19 +174,44 @@ obstacle to adoption, that text has been removed. double preciseExp10(double x) { - static const double p10[] = { - 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, - 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, - 1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, - 1e10, 1e11, 1e12, 1e13, 1e14, 1e15 - }; + static const double p10[] + = {1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, + 1e+1, 1e+2, 1e+3, 1e+4, 1e+5, 1e+6, 1e+7, 1e+8, 1e+9, 1e+10, 1e+11, 1e+12, 1e+13, 1e+14, 1e+15, 1e+16, + 1e+17, 1e+18, 1e+19, 1e+20, 1e+21, 1e+22, 1e+23, 1e+24, 1e+25, 1e+26, 1e+27, 1e+28, 1e+29, 1e+30, 1e+31, 1e+32, + 1e+33, 1e+34, 1e+35, 1e+36, 1e+37, 1e+38, 1e+39, 1e+40, 1e+41, 1e+42, 1e+43, 1e+44, 1e+45, 1e+46, 1e+47, 1e+48, + 1e+49, 1e+50, 1e+51, 1e+52, 1e+53, 1e+54, 1e+55, 1e+56, 1e+57, 1e+58, 1e+59, 1e+60, 1e+61, 1e+62, 1e+63, 1e+64, + 1e+65, 1e+66, 1e+67, 1e+68, 1e+69, 1e+70, 1e+71, 1e+72, 1e+73, 1e+74, 1e+75, 1e+76, 1e+77, 1e+78, 1e+79, 1e+80, + 1e+81, 1e+82, 1e+83, 1e+84, 1e+85, 1e+86, 1e+87, 1e+88, 1e+89, 1e+90, 1e+91, 1e+92, 1e+93, 1e+94, 1e+95, 1e+96, + 1e+97, 1e+98, 1e+99, 1e+100, 1e+101, 1e+102, 1e+103, 1e+104, 1e+105, 1e+106, 1e+107, 1e+108, 1e+109, 1e+110, 1e+111, 1e+112, + 1e+113, 1e+114, 1e+115, 1e+116, 1e+117, 1e+118, 1e+119, 1e+120, 1e+121, 1e+122, 1e+123, 1e+124, 1e+125, 1e+126, 1e+127, 1e+128, + 1e+129, 1e+130, 1e+131, 1e+132, 1e+133, 1e+134, 1e+135, 1e+136, 1e+137, 1e+138, 1e+139, 1e+140, 1e+141, 1e+142, 1e+143, 1e+144, + 1e+145, 1e+146, 1e+147, 1e+148, 1e+149, 1e+150, 1e+151, 1e+152, 1e+153, 1e+154, 1e+155, 1e+156, 1e+157, 1e+158, 1e+159, 1e+160, + 1e+161, 1e+162, 1e+163, 1e+164, 1e+165, 1e+166, 1e+167, 1e+168, 1e+169, 1e+170, 1e+171, 1e+172, 1e+173, 1e+174, 1e+175, 1e+176, + 1e+177, 1e+178, 1e+179, 1e+180, 1e+181, 1e+182, 1e+183, 1e+184, 1e+185, 1e+186, 1e+187, 1e+188, 1e+189, 1e+190, 1e+191, 1e+192, + 1e+193, 1e+194, 1e+195, 1e+196, 1e+197, 1e+198, 1e+199, 1e+200, 1e+201, 1e+202, 1e+203, 1e+204, 1e+205, 1e+206, 1e+207, 1e+208, + 1e+209, 1e+210, 1e+211, 1e+212, 1e+213, 1e+214, 1e+215, 1e+216, 1e+217, 1e+218, 1e+219, 1e+220, 1e+221, 1e+222, 1e+223, 1e+224, + 1e+225, 1e+226, 1e+227, 1e+228, 1e+229, 1e+230, 1e+231, 1e+232, 1e+233, 1e+234, 1e+235, 1e+236, 1e+237, 1e+238, 1e+239, 1e+240, + 1e+241, 1e+242, 1e+243, 1e+244, 1e+245, 1e+246, 1e+247, 1e+248, 1e+249, 1e+250, 1e+251, 1e+252, 1e+253, 1e+254, 1e+255, 1e+256, + 1e+257, 1e+258, 1e+259, 1e+260, 
1e+261, 1e+262, 1e+263, 1e+264, 1e+265, 1e+266, 1e+267, 1e+268, 1e+269, 1e+270, 1e+271, 1e+272, + 1e+273, 1e+274, 1e+275, 1e+276, 1e+277, 1e+278, 1e+279, 1e+280, 1e+281, 1e+282, 1e+283, 1e+284, 1e+285, 1e+286, 1e+287, 1e+288, + 1e+289, 1e+290, 1e+291, 1e+292, 1e+293, 1e+294, 1e+295, 1e+296, 1e+297, 1e+298, 1e+299, 1e+300, 1e+301, 1e+302, 1e+303, 1e+304, + 1e+305, 1e+306, 1e+307, 1e+308}; + double n, y = modf(x, &n); - union {double f; uint64_t i;} u = {n}; - /* fabs(n) < 16 without raising invalid on nan */ - if ((u.i>>52 & 0x7ff) < 0x3ff+4) { - if (!y) return p10[(int)n+15]; + if (n > 308) + return x > 0 ? INFINITY : -INFINITY; + if (!y) + return p10[(int)n + 15]; + + union + { + double f; + uint64_t i; + } u = {n}; + if ((u.i >> 52 & 0x7ff) < 0x3ff + 4) + { y = exp2(3.32192809488736234787031942948939 * y); - return y * p10[(int)n+15]; + return y * p10[(int)n + 15]; } return pow(10.0, x); } diff --git a/libs/libglibc-compatibility/musl/README b/libs/libglibc-compatibility/musl/README index 11f6caa2d7e..994134b14ef 100644 --- a/libs/libglibc-compatibility/musl/README +++ b/libs/libglibc-compatibility/musl/README @@ -4,3 +4,5 @@ git://git.musl-libc.org/musl c10bc61508dc52b8315084e628f36a6c3c2dabb1 NOTE: Files was edited. + +NOTE: Math related files are pulled from commit 6ad514e4e278f0c3b18eb2db1d45638c9af1c07f. diff --git a/libs/libglibc-compatibility/musl/__math_divzero.c b/libs/libglibc-compatibility/musl/__math_divzero.c new file mode 100644 index 00000000000..59d2135001c --- /dev/null +++ b/libs/libglibc-compatibility/musl/__math_divzero.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_divzero(uint32_t sign) +{ + return fp_barrier(sign ? -1.0 : 1.0) / 0.0; +} diff --git a/libs/libglibc-compatibility/musl/__math_divzerof.c b/libs/libglibc-compatibility/musl/__math_divzerof.c index cd1263fde2a..ce046f3e320 100644 --- a/libs/libglibc-compatibility/musl/__math_divzerof.c +++ b/libs/libglibc-compatibility/musl/__math_divzerof.c @@ -1,15 +1,4 @@ -#include - -/* fp_barrier returns its input, but limits code transformations - as if it had a side-effect (e.g. observable io) and returned - an arbitrary value. 
*/ - -static inline float fp_barrierf(float x) -{ - volatile float y = x; - return y; -} - +#include "libm.h" float __math_divzerof(uint32_t sign) { diff --git a/libs/libglibc-compatibility/musl/__math_invalid.c b/libs/libglibc-compatibility/musl/__math_invalid.c new file mode 100644 index 00000000000..177404900d1 --- /dev/null +++ b/libs/libglibc-compatibility/musl/__math_invalid.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_invalid(double x) +{ + return (x - x) / (x - x); +} diff --git a/libs/libglibc-compatibility/musl/__math_invalidf.c b/libs/libglibc-compatibility/musl/__math_invalidf.c index ee41c32378e..357d4b12117 100644 --- a/libs/libglibc-compatibility/musl/__math_invalidf.c +++ b/libs/libglibc-compatibility/musl/__math_invalidf.c @@ -1,3 +1,5 @@ +#include "libm.h" + float __math_invalidf(float x) { return (x - x) / (x - x); diff --git a/libs/libglibc-compatibility/musl/__math_oflow.c b/libs/libglibc-compatibility/musl/__math_oflow.c new file mode 100644 index 00000000000..c85dbf982a0 --- /dev/null +++ b/libs/libglibc-compatibility/musl/__math_oflow.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_oflow(uint32_t sign) +{ + return __math_xflow(sign, 0x1p769); +} diff --git a/libs/libglibc-compatibility/musl/__math_oflowf.c b/libs/libglibc-compatibility/musl/__math_oflowf.c new file mode 100644 index 00000000000..fa7d06208e4 --- /dev/null +++ b/libs/libglibc-compatibility/musl/__math_oflowf.c @@ -0,0 +1,6 @@ +#include "libm.h" + +float __math_oflowf(uint32_t sign) +{ + return __math_xflowf(sign, 0x1p97f); +} diff --git a/libs/libglibc-compatibility/musl/__math_uflow.c b/libs/libglibc-compatibility/musl/__math_uflow.c new file mode 100644 index 00000000000..b90594aee14 --- /dev/null +++ b/libs/libglibc-compatibility/musl/__math_uflow.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_uflow(uint32_t sign) +{ + return __math_xflow(sign, 0x1p-767); +} diff --git a/libs/libglibc-compatibility/musl/__math_uflowf.c b/libs/libglibc-compatibility/musl/__math_uflowf.c new file mode 100644 index 00000000000..94d50f2bf12 --- /dev/null +++ b/libs/libglibc-compatibility/musl/__math_uflowf.c @@ -0,0 +1,6 @@ +#include "libm.h" + +float __math_uflowf(uint32_t sign) +{ + return __math_xflowf(sign, 0x1p-95f); +} diff --git a/libs/libglibc-compatibility/musl/__math_xflow.c b/libs/libglibc-compatibility/musl/__math_xflow.c new file mode 100644 index 00000000000..744203c4c81 --- /dev/null +++ b/libs/libglibc-compatibility/musl/__math_xflow.c @@ -0,0 +1,6 @@ +#include "libm.h" + +double __math_xflow(uint32_t sign, double y) +{ + return eval_as_double(fp_barrier(sign ? -y : y) * y); +} diff --git a/libs/libglibc-compatibility/musl/__math_xflowf.c b/libs/libglibc-compatibility/musl/__math_xflowf.c new file mode 100644 index 00000000000..f2c84784f81 --- /dev/null +++ b/libs/libglibc-compatibility/musl/__math_xflowf.c @@ -0,0 +1,6 @@ +#include "libm.h" + +float __math_xflowf(uint32_t sign, float y) +{ + return eval_as_float(fp_barrierf(sign ? -y : y) * y); +} diff --git a/libs/libglibc-compatibility/musl/exp.c b/libs/libglibc-compatibility/musl/exp.c new file mode 100644 index 00000000000..b764d73cfe3 --- /dev/null +++ b/libs/libglibc-compatibility/musl/exp.c @@ -0,0 +1,134 @@ +/* + * Double-precision e^x function. + * + * Copyright (c) 2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ + +#include +#include +#include "libm.h" +#include "exp_data.h" + +#define N (1 << EXP_TABLE_BITS) +#define InvLn2N __exp_data.invln2N +#define NegLn2hiN __exp_data.negln2hiN +#define NegLn2loN __exp_data.negln2loN +#define Shift __exp_data.shift +#define T __exp_data.tab +#define C2 __exp_data.poly[5 - EXP_POLY_ORDER] +#define C3 __exp_data.poly[6 - EXP_POLY_ORDER] +#define C4 __exp_data.poly[7 - EXP_POLY_ORDER] +#define C5 __exp_data.poly[8 - EXP_POLY_ORDER] + +/* Handle cases that may overflow or underflow when computing the result that + is scale*(1+TMP) without intermediate rounding. The bit representation of + scale is in SBITS, however it has a computed exponent that may have + overflown into the sign bit so that needs to be adjusted before using it as + a double. (int32_t)KI is the k used in the argument reduction and exponent + adjustment of scale, positive k here means the result may overflow and + negative k means the result may underflow. */ +static inline double specialcase(double_t tmp, uint64_t sbits, uint64_t ki) +{ + double_t scale, y; + + if ((ki & 0x80000000) == 0) { + /* k > 0, the exponent of scale might have overflowed by <= 460. */ + sbits -= 1009ull << 52; + scale = asdouble(sbits); + y = 0x1p1009 * (scale + scale * tmp); + return eval_as_double(y); + } + /* k < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + scale = asdouble(sbits); + y = scale + scale * tmp; + if (y < 1.0) { + /* Round y to the right precision before scaling it into the subnormal + range to avoid double rounding that can cause 0.5+E/2 ulp error where + E is the worst-case ulp error outside the subnormal range. So this + is only useful if the goal is better than 1 ulp worst-case error. */ + double_t hi, lo; + lo = scale - y + scale * tmp; + hi = 1.0 + y; + lo = 1.0 - hi + y + lo; + y = eval_as_double(hi + lo) - 1.0; + /* Avoid -0.0 with downward rounding. */ + if (WANT_ROUNDING && y == 0.0) + y = 0.0; + /* The underflow exception needs to be signaled explicitly. */ + fp_force_eval(fp_barrier(0x1p-1022) * 0x1p-1022); + } + y = 0x1p-1022 * y; + return eval_as_double(y); +} + +/* Top 12 bits of a double (sign and exponent bits). */ +static inline uint32_t top12(double x) +{ + return asuint64(x) >> 52; +} + +double exp(double x) +{ + uint32_t abstop; + uint64_t ki, idx, top, sbits; + double_t kd, z, r, r2, scale, tail, tmp; + + abstop = top12(x) & 0x7ff; + if (predict_false(abstop - top12(0x1p-54) >= top12(512.0) - top12(0x1p-54))) { + if (abstop - top12(0x1p-54) >= 0x80000000) + /* Avoid spurious underflow for tiny x. */ + /* Note: 0 is common input. */ + return WANT_ROUNDING ? 1.0 + x : 1.0; + if (abstop >= top12(1024.0)) { + if (asuint64(x) == asuint64(-INFINITY)) + return 0.0; + if (abstop >= top12(INFINITY)) + return 1.0 + x; + if (asuint64(x) >> 63) + return __math_uflow(0); + else + return __math_oflow(0); + } + /* Large x is special cased below. */ + abstop = 0; + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ + z = InvLn2N * x; +#if TOINT_INTRINSICS + kd = roundtoint(z); + ki = converttoint(z); +#elif EXP_USE_TOINT_NARROW + /* z - kd is in [-0.5-2^-16, 0.5] in all rounding modes. */ + kd = eval_as_double(z + Shift); + ki = asuint64(kd) >> 16; + kd = (double_t)(int32_t)ki; +#else + /* z - kd is in [-1, 1] in non-nearest rounding modes. 
*/ + kd = eval_as_double(z + Shift); + ki = asuint64(kd); + kd -= Shift; +#endif + r = x + kd * NegLn2hiN + kd * NegLn2loN; + /* 2^(k/N) ~= scale * (1 + tail). */ + idx = 2 * (ki % N); + top = ki << (52 - EXP_TABLE_BITS); + tail = asdouble(T[idx]); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + sbits = T[idx + 1] + top; + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */ + /* Evaluation is optimized assuming superscalar pipelined execution. */ + r2 = r * r; + /* Without fma the worst case error is 0.25/N ulp larger. */ + /* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp. */ + tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5); + if (predict_false(abstop == 0)) + return specialcase(tmp, sbits, ki); + scale = asdouble(sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return eval_as_double(scale + scale * tmp); +} diff --git a/libs/libglibc-compatibility/musl/exp2.c b/libs/libglibc-compatibility/musl/exp2.c new file mode 100644 index 00000000000..e0ff54bd85b --- /dev/null +++ b/libs/libglibc-compatibility/musl/exp2.c @@ -0,0 +1,121 @@ +/* + * Double-precision 2^x function. + * + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include "libm.h" +#include "exp_data.h" + +#define N (1 << EXP_TABLE_BITS) +#define Shift __exp_data.exp2_shift +#define T __exp_data.tab +#define C1 __exp_data.exp2_poly[0] +#define C2 __exp_data.exp2_poly[1] +#define C3 __exp_data.exp2_poly[2] +#define C4 __exp_data.exp2_poly[3] +#define C5 __exp_data.exp2_poly[4] + +/* Handle cases that may overflow or underflow when computing the result that + is scale*(1+TMP) without intermediate rounding. The bit representation of + scale is in SBITS, however it has a computed exponent that may have + overflown into the sign bit so that needs to be adjusted before using it as + a double. (int32_t)KI is the k used in the argument reduction and exponent + adjustment of scale, positive k here means the result may overflow and + negative k means the result may underflow. */ +static inline double specialcase(double_t tmp, uint64_t sbits, uint64_t ki) +{ + double_t scale, y; + + if ((ki & 0x80000000) == 0) { + /* k > 0, the exponent of scale might have overflowed by 1. */ + sbits -= 1ull << 52; + scale = asdouble(sbits); + y = 2 * (scale + scale * tmp); + return eval_as_double(y); + } + /* k < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + scale = asdouble(sbits); + y = scale + scale * tmp; + if (y < 1.0) { + /* Round y to the right precision before scaling it into the subnormal + range to avoid double rounding that can cause 0.5+E/2 ulp error where + E is the worst-case ulp error outside the subnormal range. So this + is only useful if the goal is better than 1 ulp worst-case error. */ + double_t hi, lo; + lo = scale - y + scale * tmp; + hi = 1.0 + y; + lo = 1.0 - hi + y + lo; + y = eval_as_double(hi + lo) - 1.0; + /* Avoid -0.0 with downward rounding. */ + if (WANT_ROUNDING && y == 0.0) + y = 0.0; + /* The underflow exception needs to be signaled explicitly. */ + fp_force_eval(fp_barrier(0x1p-1022) * 0x1p-1022); + } + y = 0x1p-1022 * y; + return eval_as_double(y); +} + +/* Top 12 bits of a double (sign and exponent bits). 
*/ +static inline uint32_t top12(double x) +{ + return asuint64(x) >> 52; +} + +double exp2(double x) +{ + uint32_t abstop; + uint64_t ki, idx, top, sbits; + double_t kd, r, r2, scale, tail, tmp; + + abstop = top12(x) & 0x7ff; + if (predict_false(abstop - top12(0x1p-54) >= top12(512.0) - top12(0x1p-54))) { + if (abstop - top12(0x1p-54) >= 0x80000000) + /* Avoid spurious underflow for tiny x. */ + /* Note: 0 is common input. */ + return WANT_ROUNDING ? 1.0 + x : 1.0; + if (abstop >= top12(1024.0)) { + if (asuint64(x) == asuint64(-INFINITY)) + return 0.0; + if (abstop >= top12(INFINITY)) + return 1.0 + x; + if (!(asuint64(x) >> 63)) + return __math_oflow(0); + else if (asuint64(x) >= asuint64(-1075.0)) + return __math_uflow(0); + } + if (2 * asuint64(x) > 2 * asuint64(928.0)) + /* Large x is special cased below. */ + abstop = 0; + } + + /* exp2(x) = 2^(k/N) * 2^r, with 2^r in [2^(-1/2N),2^(1/2N)]. */ + /* x = k/N + r, with int k and r in [-1/2N, 1/2N]. */ + kd = eval_as_double(x + Shift); + ki = asuint64(kd); /* k. */ + kd -= Shift; /* k/N for int k. */ + r = x - kd; + /* 2^(k/N) ~= scale * (1 + tail). */ + idx = 2 * (ki % N); + top = ki << (52 - EXP_TABLE_BITS); + tail = asdouble(T[idx]); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + sbits = T[idx + 1] + top; + /* exp2(x) = 2^(k/N) * 2^r ~= scale + scale * (tail + 2^r - 1). */ + /* Evaluation is optimized assuming superscalar pipelined execution. */ + r2 = r * r; + /* Without fma the worst case error is 0.5/N ulp larger. */ + /* Worst case error is less than 0.5+0.86/N+(abs poly error * 2^53) ulp. */ + tmp = tail + r * C1 + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5); + if (predict_false(abstop == 0)) + return specialcase(tmp, sbits, ki); + scale = asdouble(sbits); + /* Note: tmp == 0 or |tmp| > 2^-65 and scale > 2^-928, so there + is no spurious underflow here even without fma. */ + return eval_as_double(scale + scale * tmp); +} diff --git a/libs/libglibc-compatibility/musl/exp2f.c b/libs/libglibc-compatibility/musl/exp2f.c index 8aaedfb9821..0360482cae0 100644 --- a/libs/libglibc-compatibility/musl/exp2f.c +++ b/libs/libglibc-compatibility/musl/exp2f.c @@ -1,127 +1,69 @@ -/* origin: FreeBSD /usr/src/lib/msun/src/s_exp2f.c */ -/*- - * Copyright (c) 2005 David Schultz - * All rights reserved. +/* + * Single-precision 2^x function. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT */ #include #include - -#define TBLSIZE 16 - -static const float -redux = 0x1.8p23f / TBLSIZE, -P1 = 0x1.62e430p-1f, -P2 = 0x1.ebfbe0p-3f, -P3 = 0x1.c6b348p-5f, -P4 = 0x1.3b2c9cp-7f; - -static const double exp2ft[TBLSIZE] = { - 0x1.6a09e667f3bcdp-1, - 0x1.7a11473eb0187p-1, - 0x1.8ace5422aa0dbp-1, - 0x1.9c49182a3f090p-1, - 0x1.ae89f995ad3adp-1, - 0x1.c199bdd85529cp-1, - 0x1.d5818dcfba487p-1, - 0x1.ea4afa2a490dap-1, - 0x1.0000000000000p+0, - 0x1.0b5586cf9890fp+0, - 0x1.172b83c7d517bp+0, - 0x1.2387a6e756238p+0, - 0x1.306fe0a31b715p+0, - 0x1.3dea64c123422p+0, - 0x1.4bfdad5362a27p+0, - 0x1.5ab07dd485429p+0, -}; +#include "libm.h" +#include "exp2f_data.h" /* - * exp2f(x): compute the base 2 exponential of x - * - * Accuracy: Peak error < 0.501 ulp; location of peak: -0.030110927. - * - * Method: (equally-spaced tables) - * - * Reduce x: - * x = k + y, for integer k and |y| <= 1/2. - * Thus we have exp2f(x) = 2**k * exp2(y). - * - * Reduce y: - * y = i/TBLSIZE + z for integer i near y * TBLSIZE. - * Thus we have exp2(y) = exp2(i/TBLSIZE) * exp2(z), - * with |z| <= 2**-(TBLSIZE+1). - * - * We compute exp2(i/TBLSIZE) via table lookup and exp2(z) via a - * degree-4 minimax polynomial with maximum error under 1.4 * 2**-33. - * Using double precision for everything except the reduction makes - * roundoff error insignificant and simplifies the scaling step. - * - * This method is due to Tang, but I do not use his suggested parameters: - * - * Tang, P. Table-driven Implementation of the Exponential Function - * in IEEE Floating-Point Arithmetic. TOMS 15(2), 144-157 (1989). - */ +EXP2F_TABLE_BITS = 5 +EXP2F_POLY_ORDER = 3 + +ULP error: 0.502 (nearest rounding.) +Relative error: 1.69 * 2^-34 in [-1/64, 1/64] (before rounding.) +Wrong count: 168353 (all nearest rounding wrong results with fma.) +Non-nearest ULP error: 1 (rounded ULP error) +*/ + +#define N (1 << EXP2F_TABLE_BITS) +#define T __exp2f_data.tab +#define C __exp2f_data.poly +#define SHIFT __exp2f_data.shift_scaled + +static inline uint32_t top12(float x) +{ + return asuint(x) >> 20; +} + float exp2f(float x) { - double_t t, r, z; - union {float f; uint32_t i;} u = {x}; - union {double f; uint64_t i;} uk; - uint32_t ix, i0, k; + uint32_t abstop; + uint64_t ki, t; + double_t kd, xd, z, r, r2, y, s; - /* Filter out exceptional cases. */ - ix = u.i & 0x7fffffff; - if (ix > 0x42fc0000) { /* |x| > 126 */ - if (ix > 0x7f800000) /* NaN */ - return x; - if (u.i >= 0x43000000 && u.i < 0x80000000) { /* x >= 128 */ - x *= 0x1p127f; - return x; - } - if (u.i >= 0x80000000) { /* x < -126 */ - if (u.i >= 0xc3160000 || (u.i & 0x0000ffff)) - { volatile float tmp; tmp = (-0x1p-149f/x); (void)tmp; } - if (u.i >= 0xc3160000) /* x <= -150 */ - return 0; - } - } else if (ix <= 0x33000000) { /* |x| <= 0x1p-25 */ - return 1.0f + x; + xd = (double_t)x; + abstop = top12(x) & 0x7ff; + if (predict_false(abstop >= top12(128.0f))) { + /* |x| >= 128 or x is nan. 
*/ + if (asuint(x) == asuint(-INFINITY)) + return 0.0f; + if (abstop >= top12(INFINITY)) + return x + x; + if (x > 0.0f) + return __math_oflowf(0); + if (x <= -150.0f) + return __math_uflowf(0); } - /* Reduce x, computing z, i0, and k. */ - u.f = x + redux; - i0 = u.i; - i0 += TBLSIZE / 2; - k = i0 / TBLSIZE; - uk.i = (uint64_t)(0x3ff + k)<<52; - i0 &= TBLSIZE - 1; - u.f -= redux; - z = x - u.f; - /* Compute r = exp2(y) = exp2ft[i0] * p(z). */ - r = exp2ft[i0]; - t = r * z; - r = r + t * (P1 + z * P2) + t * (z * z) * (P3 + z * P4); + /* x = k/N + r with r in [-1/(2N), 1/(2N)] and int k. */ + kd = eval_as_double(xd + SHIFT); + ki = asuint64(kd); + kd -= SHIFT; /* k/N for int k. */ + r = xd - kd; - /* Scale by 2**k */ - return r * uk.f; + /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1) */ + t = T[ki % N]; + t += ki << (52 - EXP2F_TABLE_BITS); + s = asdouble(t); + z = C[0] * r + C[1]; + r2 = r * r; + y = C[2] * r + 1; + y = z * r2 + y; + y = y * s; + return eval_as_float(y); } diff --git a/libs/libglibc-compatibility/musl/exp2f_data.c b/libs/libglibc-compatibility/musl/exp2f_data.c new file mode 100644 index 00000000000..be324727f5f --- /dev/null +++ b/libs/libglibc-compatibility/musl/exp2f_data.c @@ -0,0 +1,35 @@ +/* + * Shared data between expf, exp2f and powf. + * + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "exp2f_data.h" + +#define N (1 << EXP2F_TABLE_BITS) + +const struct exp2f_data __exp2f_data = { + /* tab[i] = uint(2^(i/N)) - (i << 52-BITS) + used for computing 2^(k/N) for an int |k| < 150 N as + double(tab[k%N] + (k << 52-BITS)) */ + .tab = { +0x3ff0000000000000, 0x3fefd9b0d3158574, 0x3fefb5586cf9890f, 0x3fef9301d0125b51, +0x3fef72b83c7d517b, 0x3fef54873168b9aa, 0x3fef387a6e756238, 0x3fef1e9df51fdee1, +0x3fef06fe0a31b715, 0x3feef1a7373aa9cb, 0x3feedea64c123422, 0x3feece086061892d, +0x3feebfdad5362a27, 0x3feeb42b569d4f82, 0x3feeab07dd485429, 0x3feea47eb03a5585, +0x3feea09e667f3bcd, 0x3fee9f75e8ec5f74, 0x3feea11473eb0187, 0x3feea589994cce13, +0x3feeace5422aa0db, 0x3feeb737b0cdc5e5, 0x3feec49182a3f090, 0x3feed503b23e255d, +0x3feee89f995ad3ad, 0x3feeff76f2fb5e47, 0x3fef199bdd85529c, 0x3fef3720dcef9069, +0x3fef5818dcfba487, 0x3fef7c97337b9b5f, 0x3fefa4afa2a490da, 0x3fefd0765b6e4540, + }, + .shift_scaled = 0x1.8p+52 / N, + .poly = { + 0x1.c6af84b912394p-5, 0x1.ebfce50fac4f3p-3, 0x1.62e42ff0c52d6p-1, + }, + .shift = 0x1.8p+52, + .invln2_scaled = 0x1.71547652b82fep+0 * N, + .poly_scaled = { + 0x1.c6af84b912394p-5/N/N/N, 0x1.ebfce50fac4f3p-3/N/N, 0x1.62e42ff0c52d6p-1/N, + }, +}; diff --git a/libs/libglibc-compatibility/musl/exp2f_data.h b/libs/libglibc-compatibility/musl/exp2f_data.h new file mode 100644 index 00000000000..4df689101c5 --- /dev/null +++ b/libs/libglibc-compatibility/musl/exp2f_data.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2017-2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#ifndef _EXP2F_DATA_H +#define _EXP2F_DATA_H + +#include "musl_features.h" +#include + +/* Shared between expf, exp2f and powf. 
*/ +#define EXP2F_TABLE_BITS 5 +#define EXP2F_POLY_ORDER 3 +extern hidden const struct exp2f_data { + uint64_t tab[1 << EXP2F_TABLE_BITS]; + double shift_scaled; + double poly[EXP2F_POLY_ORDER]; + double shift; + double invln2_scaled; + double poly_scaled[EXP2F_POLY_ORDER]; +} __exp2f_data; + +#endif diff --git a/libs/libglibc-compatibility/musl/exp_data.c b/libs/libglibc-compatibility/musl/exp_data.c new file mode 100644 index 00000000000..21be0146a16 --- /dev/null +++ b/libs/libglibc-compatibility/musl/exp_data.c @@ -0,0 +1,182 @@ +/* + * Shared data between exp, exp2 and pow. + * + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "exp_data.h" + +#define N (1 << EXP_TABLE_BITS) + +const struct exp_data __exp_data = { +// N/ln2 +.invln2N = 0x1.71547652b82fep0 * N, +// -ln2/N +.negln2hiN = -0x1.62e42fefa0000p-8, +.negln2loN = -0x1.cf79abc9e3b3ap-47, +// Used for rounding when !TOINT_INTRINSICS +#if EXP_USE_TOINT_NARROW +.shift = 0x1800000000.8p0, +#else +.shift = 0x1.8p52, +#endif +// exp polynomial coefficients. +.poly = { +// abs error: 1.555*2^-66 +// ulp error: 0.509 (0.511 without fma) +// if |x| < ln2/256+eps +// abs error if |x| < ln2/256+0x1p-15: 1.09*2^-65 +// abs error if |x| < ln2/128: 1.7145*2^-56 +0x1.ffffffffffdbdp-2, +0x1.555555555543cp-3, +0x1.55555cf172b91p-5, +0x1.1111167a4d017p-7, +}, +.exp2_shift = 0x1.8p52 / N, +// exp2 polynomial coefficients. +.exp2_poly = { +// abs error: 1.2195*2^-65 +// ulp error: 0.507 (0.511 without fma) +// if |x| < 1/256 +// abs error if |x| < 1/128: 1.9941*2^-56 +0x1.62e42fefa39efp-1, +0x1.ebfbdff82c424p-3, +0x1.c6b08d70cf4b5p-5, +0x1.3b2abd24650ccp-7, +0x1.5d7e09b4e3a84p-10, +}, +// 2^(k/N) ~= H[k]*(1 + T[k]) for int k in [0,N) +// tab[2*k] = asuint64(T[k]) +// tab[2*k+1] = asuint64(H[k]) - (k << 52)/N +.tab = { +0x0, 0x3ff0000000000000, +0x3c9b3b4f1a88bf6e, 0x3feff63da9fb3335, +0xbc7160139cd8dc5d, 0x3fefec9a3e778061, +0xbc905e7a108766d1, 0x3fefe315e86e7f85, +0x3c8cd2523567f613, 0x3fefd9b0d3158574, +0xbc8bce8023f98efa, 0x3fefd06b29ddf6de, +0x3c60f74e61e6c861, 0x3fefc74518759bc8, +0x3c90a3e45b33d399, 0x3fefbe3ecac6f383, +0x3c979aa65d837b6d, 0x3fefb5586cf9890f, +0x3c8eb51a92fdeffc, 0x3fefac922b7247f7, +0x3c3ebe3d702f9cd1, 0x3fefa3ec32d3d1a2, +0xbc6a033489906e0b, 0x3fef9b66affed31b, +0xbc9556522a2fbd0e, 0x3fef9301d0125b51, +0xbc5080ef8c4eea55, 0x3fef8abdc06c31cc, +0xbc91c923b9d5f416, 0x3fef829aaea92de0, +0x3c80d3e3e95c55af, 0x3fef7a98c8a58e51, +0xbc801b15eaa59348, 0x3fef72b83c7d517b, +0xbc8f1ff055de323d, 0x3fef6af9388c8dea, +0x3c8b898c3f1353bf, 0x3fef635beb6fcb75, +0xbc96d99c7611eb26, 0x3fef5be084045cd4, +0x3c9aecf73e3a2f60, 0x3fef54873168b9aa, +0xbc8fe782cb86389d, 0x3fef4d5022fcd91d, +0x3c8a6f4144a6c38d, 0x3fef463b88628cd6, +0x3c807a05b0e4047d, 0x3fef3f49917ddc96, +0x3c968efde3a8a894, 0x3fef387a6e756238, +0x3c875e18f274487d, 0x3fef31ce4fb2a63f, +0x3c80472b981fe7f2, 0x3fef2b4565e27cdd, +0xbc96b87b3f71085e, 0x3fef24dfe1f56381, +0x3c82f7e16d09ab31, 0x3fef1e9df51fdee1, +0xbc3d219b1a6fbffa, 0x3fef187fd0dad990, +0x3c8b3782720c0ab4, 0x3fef1285a6e4030b, +0x3c6e149289cecb8f, 0x3fef0cafa93e2f56, +0x3c834d754db0abb6, 0x3fef06fe0a31b715, +0x3c864201e2ac744c, 0x3fef0170fc4cd831, +0x3c8fdd395dd3f84a, 0x3feefc08b26416ff, +0xbc86a3803b8e5b04, 0x3feef6c55f929ff1, +0xbc924aedcc4b5068, 0x3feef1a7373aa9cb, +0xbc9907f81b512d8e, 0x3feeecae6d05d866, +0xbc71d1e83e9436d2, 0x3feee7db34e59ff7, +0xbc991919b3ce1b15, 0x3feee32dc313a8e5, +0x3c859f48a72a4c6d, 0x3feedea64c123422, +0xbc9312607a28698a, 0x3feeda4504ac801c, +0xbc58a78f4817895b, 
0x3feed60a21f72e2a, +0xbc7c2c9b67499a1b, 0x3feed1f5d950a897, +0x3c4363ed60c2ac11, 0x3feece086061892d, +0x3c9666093b0664ef, 0x3feeca41ed1d0057, +0x3c6ecce1daa10379, 0x3feec6a2b5c13cd0, +0x3c93ff8e3f0f1230, 0x3feec32af0d7d3de, +0x3c7690cebb7aafb0, 0x3feebfdad5362a27, +0x3c931dbdeb54e077, 0x3feebcb299fddd0d, +0xbc8f94340071a38e, 0x3feeb9b2769d2ca7, +0xbc87deccdc93a349, 0x3feeb6daa2cf6642, +0xbc78dec6bd0f385f, 0x3feeb42b569d4f82, +0xbc861246ec7b5cf6, 0x3feeb1a4ca5d920f, +0x3c93350518fdd78e, 0x3feeaf4736b527da, +0x3c7b98b72f8a9b05, 0x3feead12d497c7fd, +0x3c9063e1e21c5409, 0x3feeab07dd485429, +0x3c34c7855019c6ea, 0x3feea9268a5946b7, +0x3c9432e62b64c035, 0x3feea76f15ad2148, +0xbc8ce44a6199769f, 0x3feea5e1b976dc09, +0xbc8c33c53bef4da8, 0x3feea47eb03a5585, +0xbc845378892be9ae, 0x3feea34634ccc320, +0xbc93cedd78565858, 0x3feea23882552225, +0x3c5710aa807e1964, 0x3feea155d44ca973, +0xbc93b3efbf5e2228, 0x3feea09e667f3bcd, +0xbc6a12ad8734b982, 0x3feea012750bdabf, +0xbc6367efb86da9ee, 0x3fee9fb23c651a2f, +0xbc80dc3d54e08851, 0x3fee9f7df9519484, +0xbc781f647e5a3ecf, 0x3fee9f75e8ec5f74, +0xbc86ee4ac08b7db0, 0x3fee9f9a48a58174, +0xbc8619321e55e68a, 0x3fee9feb564267c9, +0x3c909ccb5e09d4d3, 0x3feea0694fde5d3f, +0xbc7b32dcb94da51d, 0x3feea11473eb0187, +0x3c94ecfd5467c06b, 0x3feea1ed0130c132, +0x3c65ebe1abd66c55, 0x3feea2f336cf4e62, +0xbc88a1c52fb3cf42, 0x3feea427543e1a12, +0xbc9369b6f13b3734, 0x3feea589994cce13, +0xbc805e843a19ff1e, 0x3feea71a4623c7ad, +0xbc94d450d872576e, 0x3feea8d99b4492ed, +0x3c90ad675b0e8a00, 0x3feeaac7d98a6699, +0x3c8db72fc1f0eab4, 0x3feeace5422aa0db, +0xbc65b6609cc5e7ff, 0x3feeaf3216b5448c, +0x3c7bf68359f35f44, 0x3feeb1ae99157736, +0xbc93091fa71e3d83, 0x3feeb45b0b91ffc6, +0xbc5da9b88b6c1e29, 0x3feeb737b0cdc5e5, +0xbc6c23f97c90b959, 0x3feeba44cbc8520f, +0xbc92434322f4f9aa, 0x3feebd829fde4e50, +0xbc85ca6cd7668e4b, 0x3feec0f170ca07ba, +0x3c71affc2b91ce27, 0x3feec49182a3f090, +0x3c6dd235e10a73bb, 0x3feec86319e32323, +0xbc87c50422622263, 0x3feecc667b5de565, +0x3c8b1c86e3e231d5, 0x3feed09bec4a2d33, +0xbc91bbd1d3bcbb15, 0x3feed503b23e255d, +0x3c90cc319cee31d2, 0x3feed99e1330b358, +0x3c8469846e735ab3, 0x3feede6b5579fdbf, +0xbc82dfcd978e9db4, 0x3feee36bbfd3f37a, +0x3c8c1a7792cb3387, 0x3feee89f995ad3ad, +0xbc907b8f4ad1d9fa, 0x3feeee07298db666, +0xbc55c3d956dcaeba, 0x3feef3a2b84f15fb, +0xbc90a40e3da6f640, 0x3feef9728de5593a, +0xbc68d6f438ad9334, 0x3feeff76f2fb5e47, +0xbc91eee26b588a35, 0x3fef05b030a1064a, +0x3c74ffd70a5fddcd, 0x3fef0c1e904bc1d2, +0xbc91bdfbfa9298ac, 0x3fef12c25bd71e09, +0x3c736eae30af0cb3, 0x3fef199bdd85529c, +0x3c8ee3325c9ffd94, 0x3fef20ab5fffd07a, +0x3c84e08fd10959ac, 0x3fef27f12e57d14b, +0x3c63cdaf384e1a67, 0x3fef2f6d9406e7b5, +0x3c676b2c6c921968, 0x3fef3720dcef9069, +0xbc808a1883ccb5d2, 0x3fef3f0b555dc3fa, +0xbc8fad5d3ffffa6f, 0x3fef472d4a07897c, +0xbc900dae3875a949, 0x3fef4f87080d89f2, +0x3c74a385a63d07a7, 0x3fef5818dcfba487, +0xbc82919e2040220f, 0x3fef60e316c98398, +0x3c8e5a50d5c192ac, 0x3fef69e603db3285, +0x3c843a59ac016b4b, 0x3fef7321f301b460, +0xbc82d52107b43e1f, 0x3fef7c97337b9b5f, +0xbc892ab93b470dc9, 0x3fef864614f5a129, +0x3c74b604603a88d3, 0x3fef902ee78b3ff6, +0x3c83c5ec519d7271, 0x3fef9a51fbc74c83, +0xbc8ff7128fd391f0, 0x3fefa4afa2a490da, +0xbc8dae98e223747d, 0x3fefaf482d8e67f1, +0x3c8ec3bc41aa2008, 0x3fefba1bee615a27, +0x3c842b94c3a9eb32, 0x3fefc52b376bba97, +0x3c8a64a931d185ee, 0x3fefd0765b6e4540, +0xbc8e37bae43be3ed, 0x3fefdbfdad9cbe14, +0x3c77893b4d91cd9d, 0x3fefe7c1819e90d8, +0x3c5305c14160cc89, 0x3feff3c22b8f71f1, +}, +}; diff --git 
a/libs/libglibc-compatibility/musl/exp_data.h b/libs/libglibc-compatibility/musl/exp_data.h new file mode 100644 index 00000000000..25361b1a791 --- /dev/null +++ b/libs/libglibc-compatibility/musl/exp_data.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#ifndef _EXP_DATA_H +#define _EXP_DATA_H + +#include "musl_features.h" +#include + +#define EXP_TABLE_BITS 7 +#define EXP_POLY_ORDER 5 +#define EXP_USE_TOINT_NARROW 0 +#define EXP2_POLY_ORDER 5 +extern hidden const struct exp_data { + double invln2N; + double shift; + double negln2hiN; + double negln2loN; + double poly[4]; /* Last four coefficients. */ + double exp2_shift; + double exp2_poly[EXP2_POLY_ORDER]; + uint64_t tab[2*(1 << EXP_TABLE_BITS)]; +} __exp_data; + +#endif diff --git a/libs/libglibc-compatibility/musl/libm.h b/libs/libglibc-compatibility/musl/libm.h new file mode 100644 index 00000000000..55520c2fb03 --- /dev/null +++ b/libs/libglibc-compatibility/musl/libm.h @@ -0,0 +1,249 @@ +#ifndef _LIBM_H +#define _LIBM_H + +#include +#include +#include +#include +#include "musl_features.h" + +#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024 +#elif LDBL_MANT_DIG == 64 && LDBL_MAX_EXP == 16384 && __BYTE_ORDER == __LITTLE_ENDIAN +union ldshape { + long double f; + struct { + uint64_t m; + uint16_t se; + } i; +}; +#elif LDBL_MANT_DIG == 64 && LDBL_MAX_EXP == 16384 && __BYTE_ORDER == __BIG_ENDIAN +/* This is the m68k variant of 80-bit long double, and this definition only works + * on archs where the alignment requirement of uint64_t is <= 4. */ +union ldshape { + long double f; + struct { + uint16_t se; + uint16_t pad; + uint64_t m; + } i; +}; +#elif LDBL_MANT_DIG == 113 && LDBL_MAX_EXP == 16384 && __BYTE_ORDER == __LITTLE_ENDIAN +union ldshape { + long double f; + struct { + uint64_t lo; + uint32_t mid; + uint16_t top; + uint16_t se; + } i; + struct { + uint64_t lo; + uint64_t hi; + } i2; +}; +#elif LDBL_MANT_DIG == 113 && LDBL_MAX_EXP == 16384 && __BYTE_ORDER == __BIG_ENDIAN +union ldshape { + long double f; + struct { + uint16_t se; + uint16_t top; + uint32_t mid; + uint64_t lo; + } i; + struct { + uint64_t hi; + uint64_t lo; + } i2; +}; +#else +#error Unsupported long double representation +#endif + +/* Support non-nearest rounding mode. */ +#define WANT_ROUNDING 1 +/* Support signaling NaNs. */ +#define WANT_SNAN 0 + +#if WANT_SNAN +#error SNaN is unsupported +#else +#define issignalingf_inline(x) 0 +#define issignaling_inline(x) 0 +#endif + +#ifndef TOINT_INTRINSICS +#define TOINT_INTRINSICS 0 +#endif + +#if TOINT_INTRINSICS +/* Round x to nearest int in all rounding modes, ties have to be rounded + consistently with converttoint so the results match. If the result + would be outside of [-2^31, 2^31-1] then the semantics is unspecified. */ +static double_t roundtoint(double_t); + +/* Convert x to nearest int in all rounding modes, ties have to be rounded + consistently with roundtoint. If the result is not representible in an + int32_t then the semantics is unspecified. */ +static int32_t converttoint(double_t); +#endif + +/* Helps static branch prediction so hot path can be better optimized. */ +#ifdef __GNUC__ +#define predict_true(x) __builtin_expect(!!(x), 1) +#define predict_false(x) __builtin_expect(x, 0) +#else +#define predict_true(x) (x) +#define predict_false(x) (x) +#endif + +/* Evaluate an expression as the specified type. 
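(On targets where FLT_EVAL_METHOD is 2, such as x87, intermediates may be kept in wider registers; these helpers force rounding to the declared type.)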
With standard excess + precision handling a type cast or assignment is enough (with + -ffloat-store an assignment is required, in old compilers argument + passing and return statement may not drop excess precision). */ + +static inline float eval_as_float(float x) +{ + float y = x; + return y; +} + +static inline double eval_as_double(double x) +{ + double y = x; + return y; +} + +/* fp_barrier returns its input, but limits code transformations + as if it had a side-effect (e.g. observable io) and returned + an arbitrary value. */ + +#ifndef fp_barrierf +#define fp_barrierf fp_barrierf +static inline float fp_barrierf(float x) +{ + volatile float y = x; + return y; +} +#endif + +#ifndef fp_barrier +#define fp_barrier fp_barrier +static inline double fp_barrier(double x) +{ + volatile double y = x; + return y; +} +#endif + +#ifndef fp_barrierl +#define fp_barrierl fp_barrierl +static inline long double fp_barrierl(long double x) +{ + volatile long double y = x; + return y; +} +#endif + +/* fp_force_eval ensures that the input value is computed when that's + otherwise unused. To prevent the constant folding of the input + expression, an additional fp_barrier may be needed or a compilation + mode that does so (e.g. -frounding-math in gcc). Then it can be + used to evaluate an expression for its fenv side-effects only. */ + +#ifndef fp_force_evalf +#define fp_force_evalf fp_force_evalf +static inline void fp_force_evalf(float x) +{ + volatile float y; + y = x; +} +#endif + +#ifndef fp_force_eval +#define fp_force_eval fp_force_eval +static inline void fp_force_eval(double x) +{ + volatile double y; + y = x; +} +#endif + +#ifndef fp_force_evall +#define fp_force_evall fp_force_evall +static inline void fp_force_evall(long double x) +{ + volatile long double y; + y = x; +} +#endif + +#define FORCE_EVAL(x) do { \ + if (sizeof(x) == sizeof(float)) { \ + fp_force_evalf(x); \ + } else if (sizeof(x) == sizeof(double)) { \ + fp_force_eval(x); \ + } else { \ + fp_force_evall(x); \ + } \ +} while(0) + +#define asuint(f) ((union{float _f; uint32_t _i;}){f})._i +#define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f +#define asuint64(f) ((union{double _f; uint64_t _i;}){f})._i +#define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f + +#define EXTRACT_WORDS(hi,lo,d) \ +do { \ + uint64_t __u = asuint64(d); \ + (hi) = __u >> 32; \ + (lo) = (uint32_t)__u; \ +} while (0) + +#define GET_HIGH_WORD(hi,d) \ +do { \ + (hi) = asuint64(d) >> 32; \ +} while (0) + +#define GET_LOW_WORD(lo,d) \ +do { \ + (lo) = (uint32_t)asuint64(d); \ +} while (0) + +#define INSERT_WORDS(d,hi,lo) \ +do { \ + (d) = asdouble(((uint64_t)(hi)<<32) | (uint32_t)(lo)); \ +} while (0) + +#define SET_HIGH_WORD(d,hi) \ + INSERT_WORDS(d, hi, (uint32_t)asuint64(d)) + +#define SET_LOW_WORD(d,lo) \ + INSERT_WORDS(d, asuint64(d)>>32, lo) + +#define GET_FLOAT_WORD(w,d) \ +do { \ + (w) = asuint(d); \ +} while (0) + +#define SET_FLOAT_WORD(d,w) \ +do { \ + (d) = asfloat(w); \ +} while (0) + +extern int __signgam; +hidden double __lgamma_r(double, int *); +hidden float __lgammaf_r(float, int *); + +/* error handling functions */ +hidden float __math_xflowf(uint32_t, float); +hidden float __math_uflowf(uint32_t); +hidden float __math_oflowf(uint32_t); +hidden float __math_divzerof(uint32_t); +hidden float __math_invalidf(float); +hidden double __math_xflow(uint32_t, double); +hidden double __math_uflow(uint32_t); +hidden double __math_oflow(uint32_t); +hidden double __math_divzero(uint32_t); +hidden double __math_invalid(double); + +#endif diff 
--git a/libs/libglibc-compatibility/musl/log.c b/libs/libglibc-compatibility/musl/log.c new file mode 100644 index 00000000000..cc52585a949 --- /dev/null +++ b/libs/libglibc-compatibility/musl/log.c @@ -0,0 +1,112 @@ +/* + * Double-precision log(x) function. + * + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include "libm.h" +#include "log_data.h" + +#define T __log_data.tab +#define T2 __log_data.tab2 +#define B __log_data.poly1 +#define A __log_data.poly +#define Ln2hi __log_data.ln2hi +#define Ln2lo __log_data.ln2lo +#define N (1 << LOG_TABLE_BITS) +#define OFF 0x3fe6000000000000 + +/* Top 16 bits of a double. */ +static inline uint32_t top16(double x) +{ + return asuint64(x) >> 48; +} + +double log(double x) +{ + double_t w, z, r, r2, r3, y, invc, logc, kd, hi, lo; + uint64_t ix, iz, tmp; + uint32_t top; + int k, i; + + ix = asuint64(x); + top = top16(x); +#define LO asuint64(1.0 - 0x1p-4) +#define HI asuint64(1.0 + 0x1.09p-4) + if (predict_false(ix - LO < HI - LO)) { + /* Handle close to 1.0 inputs separately. */ + /* Fix sign of zero with downward rounding when x==1. */ + if (WANT_ROUNDING && predict_false(ix == asuint64(1.0))) + return 0; + r = x - 1.0; + r2 = r * r; + r3 = r * r2; + y = r3 * + (B[1] + r * B[2] + r2 * B[3] + + r3 * (B[4] + r * B[5] + r2 * B[6] + + r3 * (B[7] + r * B[8] + r2 * B[9] + r3 * B[10]))); + /* Worst-case error is around 0.507 ULP. */ + w = r * 0x1p27; + double_t rhi = r + w - w; + double_t rlo = r - rhi; + w = rhi * rhi * B[0]; /* B[0] == -0.5. */ + hi = r + w; + lo = r - hi + w; + lo += B[0] * rlo * (rhi + r); + y += lo; + y += hi; + return eval_as_double(y); + } + if (predict_false(top - 0x0010 >= 0x7ff0 - 0x0010)) { + /* x < 0x1p-1022 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzero(1); + if (ix == asuint64(INFINITY)) /* log(inf) == inf. */ + return x; + if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) + return __math_invalid(x); + /* x is subnormal, normalize it. */ + ix = asuint64(x * 0x1p52); + ix -= 52ULL << 52; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - LOG_TABLE_BITS)) % N; + k = (int64_t)tmp >> 52; /* arithmetic shift */ + iz = ix - (tmp & 0xfffULL << 52); + invc = T[i].invc; + logc = T[i].logc; + z = asdouble(iz); + + /* log(x) = log1p(z/c-1) + log(c) + k*Ln2. */ + /* r ~= z/c - 1, |r| < 1/(2*N). */ +#if __FP_FAST_FMA + /* rounding error: 0x1p-55/N. */ + r = __builtin_fma(z, invc, -1.0); +#else + /* rounding error: 0x1p-55/N + 0x1p-66. */ + r = (z - T2[i].chi - T2[i].clo) * invc; +#endif + kd = (double_t)k; + + /* hi + lo = r + log(c) + k*Ln2. */ + w = kd * Ln2hi + logc; + hi = w + r; + lo = w - hi + r + kd * Ln2lo; + + /* log(x) = lo + (log1p(r) - r) + hi. */ + r2 = r * r; /* rounding error: 0x1p-54/N^2. */ + /* Worst case error if |y| > 0x1p-5: + 0.5 + 4.13/N + abs-poly-error*2^57 ULP (+ 0.002 ULP without fma) + Worst case error if |y| > 0x1p-4: + 0.5 + 2.06/N + abs-poly-error*2^56 ULP (+ 0.001 ULP without fma). */ + y = lo + r2 * A[0] + + r * r2 * (A[1] + r * A[2] + r2 * (A[3] + r * A[4])) + hi; + return eval_as_double(y); +} diff --git a/libs/libglibc-compatibility/musl/log2.c b/libs/libglibc-compatibility/musl/log2.c new file mode 100644 index 00000000000..1276ed4e310 --- /dev/null +++ b/libs/libglibc-compatibility/musl/log2.c @@ -0,0 +1,122 @@ +/* + * Double-precision log2(x) function. 
+ * + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include "libm.h" +#include "log2_data.h" + +#define T __log2_data.tab +#define T2 __log2_data.tab2 +#define B __log2_data.poly1 +#define A __log2_data.poly +#define InvLn2hi __log2_data.invln2hi +#define InvLn2lo __log2_data.invln2lo +#define N (1 << LOG2_TABLE_BITS) +#define OFF 0x3fe6000000000000 + +/* Top 16 bits of a double. */ +static inline uint32_t top16(double x) +{ + return asuint64(x) >> 48; +} + +double log2(double x) +{ + double_t z, r, r2, r4, y, invc, logc, kd, hi, lo, t1, t2, t3, p; + uint64_t ix, iz, tmp; + uint32_t top; + int k, i; + + ix = asuint64(x); + top = top16(x); +#define LO asuint64(1.0 - 0x1.5b51p-5) +#define HI asuint64(1.0 + 0x1.6ab2p-5) + if (predict_false(ix - LO < HI - LO)) { + /* Handle close to 1.0 inputs separately. */ + /* Fix sign of zero with downward rounding when x==1. */ + if (WANT_ROUNDING && predict_false(ix == asuint64(1.0))) + return 0; + r = x - 1.0; +#if __FP_FAST_FMA + hi = r * InvLn2hi; + lo = r * InvLn2lo + __builtin_fma(r, InvLn2hi, -hi); +#else + double_t rhi, rlo; + rhi = asdouble(asuint64(r) & -1ULL << 32); + rlo = r - rhi; + hi = rhi * InvLn2hi; + lo = rlo * InvLn2hi + r * InvLn2lo; +#endif + r2 = r * r; /* rounding error: 0x1p-62. */ + r4 = r2 * r2; + /* Worst-case error is less than 0.54 ULP (0.55 ULP without fma). */ + p = r2 * (B[0] + r * B[1]); + y = hi + p; + lo += hi - y + p; + lo += r4 * (B[2] + r * B[3] + r2 * (B[4] + r * B[5]) + + r4 * (B[6] + r * B[7] + r2 * (B[8] + r * B[9]))); + y += lo; + return eval_as_double(y); + } + if (predict_false(top - 0x0010 >= 0x7ff0 - 0x0010)) { + /* x < 0x1p-1022 or inf or nan. */ + if (ix * 2 == 0) + return __math_divzero(1); + if (ix == asuint64(INFINITY)) /* log(inf) == inf. */ + return x; + if ((top & 0x8000) || (top & 0x7ff0) == 0x7ff0) + return __math_invalid(x); + /* x is subnormal, normalize it. */ + ix = asuint64(x * 0x1p52); + ix -= 52ULL << 52; + } + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - LOG2_TABLE_BITS)) % N; + k = (int64_t)tmp >> 52; /* arithmetic shift */ + iz = ix - (tmp & 0xfffULL << 52); + invc = T[i].invc; + logc = T[i].logc; + z = asdouble(iz); + kd = (double_t)k; + + /* log2(x) = log2(z/c) + log2(c) + k. */ + /* r ~= z/c - 1, |r| < 1/(2*N). */ +#if __FP_FAST_FMA + /* rounding error: 0x1p-55/N. */ + r = __builtin_fma(z, invc, -1.0); + t1 = r * InvLn2hi; + t2 = r * InvLn2lo + __builtin_fma(r, InvLn2hi, -t1); +#else + double_t rhi, rlo; + /* rounding error: 0x1p-55/N + 0x1p-65. */ + r = (z - T2[i].chi - T2[i].clo) * invc; + rhi = asdouble(asuint64(r) & -1ULL << 32); + rlo = r - rhi; + t1 = rhi * InvLn2hi; + t2 = rlo * InvLn2hi + r * InvLn2lo; +#endif + + /* hi + lo = r/ln2 + log2(c) + k. */ + t3 = kd + logc; + hi = t3 + t1; + lo = t3 - hi + t1 + t2; + + /* log2(r+1) = r/ln2 + r^2*poly(r). */ + /* Evaluation is optimized assuming superscalar pipelined execution. */ + r2 = r * r; /* rounding error: 0x1p-54/N^2. */ + r4 = r2 * r2; + /* Worst-case error if |y| > 0x1p-4: 0.547 ULP (0.550 ULP without fma). + ~ 0.5 + 2/N/ln2 + abs-poly-error*0x1p56 ULP (+ 0.003 ULP without fma). 
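The final sum lo + r2*p + hi below reassembles log2(x) = (k + log2(c)) + r/ln2 + r^2*poly(r), with hi and lo carrying the nearly exact non-polynomial part.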
*/ + p = A[0] + r * A[1] + r2 * (A[2] + r * A[3]) + r4 * (A[4] + r * A[5]); + y = lo + r2 * p + hi; + return eval_as_double(y); +} diff --git a/libs/libglibc-compatibility/musl/log2_data.c b/libs/libglibc-compatibility/musl/log2_data.c new file mode 100644 index 00000000000..3dd1ca5146c --- /dev/null +++ b/libs/libglibc-compatibility/musl/log2_data.c @@ -0,0 +1,201 @@ +/* + * Data for log2. + * + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "log2_data.h" + +#define N (1 << LOG2_TABLE_BITS) + +const struct log2_data __log2_data = { +// First coefficient: 0x1.71547652b82fe1777d0ffda0d24p0 +.invln2hi = 0x1.7154765200000p+0, +.invln2lo = 0x1.705fc2eefa200p-33, +.poly1 = { +// relative error: 0x1.2fad8188p-63 +// in -0x1.5b51p-5 0x1.6ab2p-5 +-0x1.71547652b82fep-1, +0x1.ec709dc3a03f7p-2, +-0x1.71547652b7c3fp-2, +0x1.2776c50f05be4p-2, +-0x1.ec709dd768fe5p-3, +0x1.a61761ec4e736p-3, +-0x1.7153fbc64a79bp-3, +0x1.484d154f01b4ap-3, +-0x1.289e4a72c383cp-3, +0x1.0b32f285aee66p-3, +}, +.poly = { +// relative error: 0x1.a72c2bf8p-58 +// abs error: 0x1.67a552c8p-66 +// in -0x1.f45p-8 0x1.f45p-8 +-0x1.71547652b8339p-1, +0x1.ec709dc3a04bep-2, +-0x1.7154764702ffbp-2, +0x1.2776c50034c48p-2, +-0x1.ec7b328ea92bcp-3, +0x1.a6225e117f92ep-3, +}, +/* Algorithm: + + x = 2^k z + log2(x) = k + log2(c) + log2(z/c) + log2(z/c) = poly(z/c - 1) + +where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls +into the ith one, then table entries are computed as + + tab[i].invc = 1/c + tab[i].logc = (double)log2(c) + tab2[i].chi = (double)c + tab2[i].clo = (double)(c - (double)c) + +where c is near the center of the subinterval and is chosen by trying +-2^29 +floating point invc candidates around 1/center and selecting one for which + + 1) the rounding error in 0x1.8p10 + logc is 0, + 2) the rounding error in z - chi - clo is < 0x1p-64 and + 3) the rounding error in (double)log2(c) is minimized (< 0x1p-68). + +Note: 1) ensures that k + logc can be computed without rounding error, 2) +ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to a +single rounding error when there is no fast fma for z*invc - 1, 3) ensures +that logc + poly(z/c - 1) has small error, however near x == 1 when +|log2(x)| < 0x1p-4, this is not enough so that is special cased. 
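In log2.c the looked-up pair enters as t3 = kd + logc, which 1) keeps exact, and tab2 supplies chi/clo so that r = (z - chi - clo) * invc meets bound 2) when a fast fma is not available.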
*/ +.tab = { +{0x1.724286bb1acf8p+0, -0x1.1095feecdb000p-1}, +{0x1.6e1f766d2cca1p+0, -0x1.08494bd76d000p-1}, +{0x1.6a13d0e30d48ap+0, -0x1.00143aee8f800p-1}, +{0x1.661ec32d06c85p+0, -0x1.efec5360b4000p-2}, +{0x1.623fa951198f8p+0, -0x1.dfdd91ab7e000p-2}, +{0x1.5e75ba4cf026cp+0, -0x1.cffae0cc79000p-2}, +{0x1.5ac055a214fb8p+0, -0x1.c043811fda000p-2}, +{0x1.571ed0f166e1ep+0, -0x1.b0b67323ae000p-2}, +{0x1.53909590bf835p+0, -0x1.a152f5a2db000p-2}, +{0x1.5014fed61adddp+0, -0x1.9217f5af86000p-2}, +{0x1.4cab88e487bd0p+0, -0x1.8304db0719000p-2}, +{0x1.49539b4334feep+0, -0x1.74189f9a9e000p-2}, +{0x1.460cbdfafd569p+0, -0x1.6552bb5199000p-2}, +{0x1.42d664ee4b953p+0, -0x1.56b23a29b1000p-2}, +{0x1.3fb01111dd8a6p+0, -0x1.483650f5fa000p-2}, +{0x1.3c995b70c5836p+0, -0x1.39de937f6a000p-2}, +{0x1.3991c4ab6fd4ap+0, -0x1.2baa1538d6000p-2}, +{0x1.3698e0ce099b5p+0, -0x1.1d98340ca4000p-2}, +{0x1.33ae48213e7b2p+0, -0x1.0fa853a40e000p-2}, +{0x1.30d191985bdb1p+0, -0x1.01d9c32e73000p-2}, +{0x1.2e025cab271d7p+0, -0x1.e857da2fa6000p-3}, +{0x1.2b404cf13cd82p+0, -0x1.cd3c8633d8000p-3}, +{0x1.288b02c7ccb50p+0, -0x1.b26034c14a000p-3}, +{0x1.25e2263944de5p+0, -0x1.97c1c2f4fe000p-3}, +{0x1.234563d8615b1p+0, -0x1.7d6023f800000p-3}, +{0x1.20b46e33eaf38p+0, -0x1.633a71a05e000p-3}, +{0x1.1e2eefdcda3ddp+0, -0x1.494f5e9570000p-3}, +{0x1.1bb4a580b3930p+0, -0x1.2f9e424e0a000p-3}, +{0x1.19453847f2200p+0, -0x1.162595afdc000p-3}, +{0x1.16e06c0d5d73cp+0, -0x1.f9c9a75bd8000p-4}, +{0x1.1485f47b7e4c2p+0, -0x1.c7b575bf9c000p-4}, +{0x1.12358ad0085d1p+0, -0x1.960c60ff48000p-4}, +{0x1.0fef00f532227p+0, -0x1.64ce247b60000p-4}, +{0x1.0db2077d03a8fp+0, -0x1.33f78b2014000p-4}, +{0x1.0b7e6d65980d9p+0, -0x1.0387d1a42c000p-4}, +{0x1.0953efe7b408dp+0, -0x1.a6f9208b50000p-5}, +{0x1.07325cac53b83p+0, -0x1.47a954f770000p-5}, +{0x1.05197e40d1b5cp+0, -0x1.d23a8c50c0000p-6}, +{0x1.03091c1208ea2p+0, -0x1.16a2629780000p-6}, +{0x1.0101025b37e21p+0, -0x1.720f8d8e80000p-8}, +{0x1.fc07ef9caa76bp-1, 0x1.6fe53b1500000p-7}, +{0x1.f4465d3f6f184p-1, 0x1.11ccce10f8000p-5}, +{0x1.ecc079f84107fp-1, 0x1.c4dfc8c8b8000p-5}, +{0x1.e573a99975ae8p-1, 0x1.3aa321e574000p-4}, +{0x1.de5d6f0bd3de6p-1, 0x1.918a0d08b8000p-4}, +{0x1.d77b681ff38b3p-1, 0x1.e72e9da044000p-4}, +{0x1.d0cb5724de943p-1, 0x1.1dcd2507f6000p-3}, +{0x1.ca4b2dc0e7563p-1, 0x1.476ab03dea000p-3}, +{0x1.c3f8ee8d6cb51p-1, 0x1.7074377e22000p-3}, +{0x1.bdd2b4f020c4cp-1, 0x1.98ede8ba94000p-3}, +{0x1.b7d6c006015cap-1, 0x1.c0db86ad2e000p-3}, +{0x1.b20366e2e338fp-1, 0x1.e840aafcee000p-3}, +{0x1.ac57026295039p-1, 0x1.0790ab4678000p-2}, +{0x1.a6d01bc2731ddp-1, 0x1.1ac056801c000p-2}, +{0x1.a16d3bc3ff18bp-1, 0x1.2db11d4fee000p-2}, +{0x1.9c2d14967feadp-1, 0x1.406464ec58000p-2}, +{0x1.970e4f47c9902p-1, 0x1.52dbe093af000p-2}, +{0x1.920fb3982bcf2p-1, 0x1.651902050d000p-2}, +{0x1.8d30187f759f1p-1, 0x1.771d2cdeaf000p-2}, +{0x1.886e5ebb9f66dp-1, 0x1.88e9c857d9000p-2}, +{0x1.83c97b658b994p-1, 0x1.9a80155e16000p-2}, +{0x1.7f405ffc61022p-1, 0x1.abe186ed3d000p-2}, +{0x1.7ad22181415cap-1, 0x1.bd0f2aea0e000p-2}, +{0x1.767dcf99eff8cp-1, 0x1.ce0a43dbf4000p-2}, +}, +#if !__FP_FAST_FMA +.tab2 = { +{0x1.6200012b90a8ep-1, 0x1.904ab0644b605p-55}, +{0x1.66000045734a6p-1, 0x1.1ff9bea62f7a9p-57}, +{0x1.69fffc325f2c5p-1, 0x1.27ecfcb3c90bap-55}, +{0x1.6e00038b95a04p-1, 0x1.8ff8856739326p-55}, +{0x1.71fffe09994e3p-1, 0x1.afd40275f82b1p-55}, +{0x1.7600015590e1p-1, -0x1.2fd75b4238341p-56}, +{0x1.7a00012655bd5p-1, 0x1.808e67c242b76p-56}, +{0x1.7e0003259e9a6p-1, -0x1.208e426f622b7p-57}, +{0x1.81fffedb4b2d2p-1, -0x1.402461ea5c92fp-55}, +{0x1.860002dfafcc3p-1, 
0x1.df7f4a2f29a1fp-57}, +{0x1.89ffff78c6b5p-1, -0x1.e0453094995fdp-55}, +{0x1.8e00039671566p-1, -0x1.a04f3bec77b45p-55}, +{0x1.91fffe2bf1745p-1, -0x1.7fa34400e203cp-56}, +{0x1.95fffcc5c9fd1p-1, -0x1.6ff8005a0695dp-56}, +{0x1.9a0003bba4767p-1, 0x1.0f8c4c4ec7e03p-56}, +{0x1.9dfffe7b92da5p-1, 0x1.e7fd9478c4602p-55}, +{0x1.a1fffd72efdafp-1, -0x1.a0c554dcdae7ep-57}, +{0x1.a5fffde04ff95p-1, 0x1.67da98ce9b26bp-55}, +{0x1.a9fffca5e8d2bp-1, -0x1.284c9b54c13dep-55}, +{0x1.adfffddad03eap-1, 0x1.812c8ea602e3cp-58}, +{0x1.b1ffff10d3d4dp-1, -0x1.efaddad27789cp-55}, +{0x1.b5fffce21165ap-1, 0x1.3cb1719c61237p-58}, +{0x1.b9fffd950e674p-1, 0x1.3f7d94194cep-56}, +{0x1.be000139ca8afp-1, 0x1.50ac4215d9bcp-56}, +{0x1.c20005b46df99p-1, 0x1.beea653e9c1c9p-57}, +{0x1.c600040b9f7aep-1, -0x1.c079f274a70d6p-56}, +{0x1.ca0006255fd8ap-1, -0x1.a0b4076e84c1fp-56}, +{0x1.cdfffd94c095dp-1, 0x1.8f933f99ab5d7p-55}, +{0x1.d1ffff975d6cfp-1, -0x1.82c08665fe1bep-58}, +{0x1.d5fffa2561c93p-1, -0x1.b04289bd295f3p-56}, +{0x1.d9fff9d228b0cp-1, 0x1.70251340fa236p-55}, +{0x1.de00065bc7e16p-1, -0x1.5011e16a4d80cp-56}, +{0x1.e200002f64791p-1, 0x1.9802f09ef62ep-55}, +{0x1.e600057d7a6d8p-1, -0x1.e0b75580cf7fap-56}, +{0x1.ea00027edc00cp-1, -0x1.c848309459811p-55}, +{0x1.ee0006cf5cb7cp-1, -0x1.f8027951576f4p-55}, +{0x1.f2000782b7dccp-1, -0x1.f81d97274538fp-55}, +{0x1.f6000260c450ap-1, -0x1.071002727ffdcp-59}, +{0x1.f9fffe88cd533p-1, -0x1.81bdce1fda8bp-58}, +{0x1.fdfffd50f8689p-1, 0x1.7f91acb918e6ep-55}, +{0x1.0200004292367p+0, 0x1.b7ff365324681p-54}, +{0x1.05fffe3e3d668p+0, 0x1.6fa08ddae957bp-55}, +{0x1.0a0000a85a757p+0, -0x1.7e2de80d3fb91p-58}, +{0x1.0e0001a5f3fccp+0, -0x1.1823305c5f014p-54}, +{0x1.11ffff8afbaf5p+0, -0x1.bfabb6680bac2p-55}, +{0x1.15fffe54d91adp+0, -0x1.d7f121737e7efp-54}, +{0x1.1a00011ac36e1p+0, 0x1.c000a0516f5ffp-54}, +{0x1.1e00019c84248p+0, -0x1.082fbe4da5dap-54}, +{0x1.220000ffe5e6ep+0, -0x1.8fdd04c9cfb43p-55}, +{0x1.26000269fd891p+0, 0x1.cfe2a7994d182p-55}, +{0x1.2a00029a6e6dap+0, -0x1.00273715e8bc5p-56}, +{0x1.2dfffe0293e39p+0, 0x1.b7c39dab2a6f9p-54}, +{0x1.31ffff7dcf082p+0, 0x1.df1336edc5254p-56}, +{0x1.35ffff05a8b6p+0, -0x1.e03564ccd31ebp-54}, +{0x1.3a0002e0eaeccp+0, 0x1.5f0e74bd3a477p-56}, +{0x1.3e000043bb236p+0, 0x1.c7dcb149d8833p-54}, +{0x1.4200002d187ffp+0, 0x1.e08afcf2d3d28p-56}, +{0x1.460000d387cb1p+0, 0x1.20837856599a6p-55}, +{0x1.4a00004569f89p+0, -0x1.9fa5c904fbcd2p-55}, +{0x1.4e000043543f3p+0, -0x1.81125ed175329p-56}, +{0x1.51fffcc027f0fp+0, 0x1.883d8847754dcp-54}, +{0x1.55ffffd87b36fp+0, -0x1.709e731d02807p-55}, +{0x1.59ffff21df7bap+0, 0x1.7f79f68727b02p-55}, +{0x1.5dfffebfc3481p+0, -0x1.180902e30e93ep-54}, +}, +#endif +}; diff --git a/libs/libglibc-compatibility/musl/log2_data.h b/libs/libglibc-compatibility/musl/log2_data.h new file mode 100644 index 00000000000..c4a748cf8e0 --- /dev/null +++ b/libs/libglibc-compatibility/musl/log2_data.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ +#ifndef _LOG2_DATA_H +#define _LOG2_DATA_H + +#include "musl_features.h" + +#define LOG2_TABLE_BITS 6 +#define LOG2_POLY_ORDER 7 +#define LOG2_POLY1_ORDER 11 +extern hidden const struct log2_data { + double invln2hi; + double invln2lo; + double poly[LOG2_POLY_ORDER - 1]; + double poly1[LOG2_POLY1_ORDER - 1]; + struct { + double invc, logc; + } tab[1 << LOG2_TABLE_BITS]; +#if !__FP_FAST_FMA + struct { + double chi, clo; + } tab2[1 << LOG2_TABLE_BITS]; +#endif +} __log2_data; + +#endif diff --git a/libs/libglibc-compatibility/musl/log_data.c b/libs/libglibc-compatibility/musl/log_data.c new file mode 100644 index 00000000000..1a6ec712a0c --- /dev/null +++ b/libs/libglibc-compatibility/musl/log_data.c @@ -0,0 +1,328 @@ +/* + * Data for log. + * + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "log_data.h" + +#define N (1 << LOG_TABLE_BITS) + +const struct log_data __log_data = { +.ln2hi = 0x1.62e42fefa3800p-1, +.ln2lo = 0x1.ef35793c76730p-45, +.poly1 = { +// relative error: 0x1.c04d76cp-63 +// in -0x1p-4 0x1.09p-4 (|log(1+x)| > 0x1p-4 outside the interval) +-0x1p-1, +0x1.5555555555577p-2, +-0x1.ffffffffffdcbp-3, +0x1.999999995dd0cp-3, +-0x1.55555556745a7p-3, +0x1.24924a344de3p-3, +-0x1.fffffa4423d65p-4, +0x1.c7184282ad6cap-4, +-0x1.999eb43b068ffp-4, +0x1.78182f7afd085p-4, +-0x1.5521375d145cdp-4, +}, +.poly = { +// relative error: 0x1.926199e8p-56 +// abs error: 0x1.882ff33p-65 +// in -0x1.fp-9 0x1.fp-9 +-0x1.0000000000001p-1, +0x1.555555551305bp-2, +-0x1.fffffffeb459p-3, +0x1.999b324f10111p-3, +-0x1.55575e506c89fp-3, +}, +/* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + log(z/c) + log(z/c) = poly(z/c - 1) + +where z is in [1.6p-1; 1.6p0] which is split into N subintervals and z falls +into the ith one, then table entries are computed as + + tab[i].invc = 1/c + tab[i].logc = (double)log(c) + tab2[i].chi = (double)c + tab2[i].clo = (double)(c - (double)c) + +where c is near the center of the subinterval and is chosen by trying +-2^29 +floating point invc candidates around 1/center and selecting one for which + + 1) the rounding error in 0x1.8p9 + logc is 0, + 2) the rounding error in z - chi - clo is < 0x1p-66 and + 3) the rounding error in (double)log(c) is minimized (< 0x1p-66). + +Note: 1) ensures that k*ln2hi + logc can be computed without rounding error, +2) ensures that z/c - 1 can be computed as (z - chi - clo)*invc with close to +a single rounding error when there is no fast fma for z*invc - 1, 3) ensures +that logc + poly(z/c - 1) has small error, however near x == 1 when +|log(x)| < 0x1p-4, this is not enough so that is special cased. 
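As a concrete instance, x = 10.0 = 2^3 * 1.25 selects k = 3 and z = 1.25, so log(10) is evaluated as 3*ln2 + log(c) + log1p(1.25/c - 1) with c the tabulated center nearest 1.25.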
*/ +.tab = { +{0x1.734f0c3e0de9fp+0, -0x1.7cc7f79e69000p-2}, +{0x1.713786a2ce91fp+0, -0x1.76feec20d0000p-2}, +{0x1.6f26008fab5a0p+0, -0x1.713e31351e000p-2}, +{0x1.6d1a61f138c7dp+0, -0x1.6b85b38287800p-2}, +{0x1.6b1490bc5b4d1p+0, -0x1.65d5590807800p-2}, +{0x1.69147332f0cbap+0, -0x1.602d076180000p-2}, +{0x1.6719f18224223p+0, -0x1.5a8ca86909000p-2}, +{0x1.6524f99a51ed9p+0, -0x1.54f4356035000p-2}, +{0x1.63356aa8f24c4p+0, -0x1.4f637c36b4000p-2}, +{0x1.614b36b9ddc14p+0, -0x1.49da7fda85000p-2}, +{0x1.5f66452c65c4cp+0, -0x1.445923989a800p-2}, +{0x1.5d867b5912c4fp+0, -0x1.3edf439b0b800p-2}, +{0x1.5babccb5b90dep+0, -0x1.396ce448f7000p-2}, +{0x1.59d61f2d91a78p+0, -0x1.3401e17bda000p-2}, +{0x1.5805612465687p+0, -0x1.2e9e2ef468000p-2}, +{0x1.56397cee76bd3p+0, -0x1.2941b3830e000p-2}, +{0x1.54725e2a77f93p+0, -0x1.23ec58cda8800p-2}, +{0x1.52aff42064583p+0, -0x1.1e9e129279000p-2}, +{0x1.50f22dbb2bddfp+0, -0x1.1956d2b48f800p-2}, +{0x1.4f38f4734ded7p+0, -0x1.141679ab9f800p-2}, +{0x1.4d843cfde2840p+0, -0x1.0edd094ef9800p-2}, +{0x1.4bd3ec078a3c8p+0, -0x1.09aa518db1000p-2}, +{0x1.4a27fc3e0258ap+0, -0x1.047e65263b800p-2}, +{0x1.4880524d48434p+0, -0x1.feb224586f000p-3}, +{0x1.46dce1b192d0bp+0, -0x1.f474a7517b000p-3}, +{0x1.453d9d3391854p+0, -0x1.ea4443d103000p-3}, +{0x1.43a2744b4845ap+0, -0x1.e020d44e9b000p-3}, +{0x1.420b54115f8fbp+0, -0x1.d60a22977f000p-3}, +{0x1.40782da3ef4b1p+0, -0x1.cc00104959000p-3}, +{0x1.3ee8f5d57fe8fp+0, -0x1.c202956891000p-3}, +{0x1.3d5d9a00b4ce9p+0, -0x1.b81178d811000p-3}, +{0x1.3bd60c010c12bp+0, -0x1.ae2c9ccd3d000p-3}, +{0x1.3a5242b75dab8p+0, -0x1.a45402e129000p-3}, +{0x1.38d22cd9fd002p+0, -0x1.9a877681df000p-3}, +{0x1.3755bc5847a1cp+0, -0x1.90c6d69483000p-3}, +{0x1.35dce49ad36e2p+0, -0x1.87120a645c000p-3}, +{0x1.34679984dd440p+0, -0x1.7d68fb4143000p-3}, +{0x1.32f5cceffcb24p+0, -0x1.73cb83c627000p-3}, +{0x1.3187775a10d49p+0, -0x1.6a39a9b376000p-3}, +{0x1.301c8373e3990p+0, -0x1.60b3154b7a000p-3}, +{0x1.2eb4ebb95f841p+0, -0x1.5737d76243000p-3}, +{0x1.2d50a0219a9d1p+0, -0x1.4dc7b8fc23000p-3}, +{0x1.2bef9a8b7fd2ap+0, -0x1.4462c51d20000p-3}, +{0x1.2a91c7a0c1babp+0, -0x1.3b08abc830000p-3}, +{0x1.293726014b530p+0, -0x1.31b996b490000p-3}, +{0x1.27dfa5757a1f5p+0, -0x1.2875490a44000p-3}, +{0x1.268b39b1d3bbfp+0, -0x1.1f3b9f879a000p-3}, +{0x1.2539d838ff5bdp+0, -0x1.160c8252ca000p-3}, +{0x1.23eb7aac9083bp+0, -0x1.0ce7f57f72000p-3}, +{0x1.22a012ba940b6p+0, -0x1.03cdc49fea000p-3}, +{0x1.2157996cc4132p+0, -0x1.f57bdbc4b8000p-4}, +{0x1.201201dd2fc9bp+0, -0x1.e370896404000p-4}, +{0x1.1ecf4494d480bp+0, -0x1.d17983ef94000p-4}, +{0x1.1d8f5528f6569p+0, -0x1.bf9674ed8a000p-4}, +{0x1.1c52311577e7cp+0, -0x1.adc79202f6000p-4}, +{0x1.1b17c74cb26e9p+0, -0x1.9c0c3e7288000p-4}, +{0x1.19e010c2c1ab6p+0, -0x1.8a646b372c000p-4}, +{0x1.18ab07bb670bdp+0, -0x1.78d01b3ac0000p-4}, +{0x1.1778a25efbcb6p+0, -0x1.674f145380000p-4}, +{0x1.1648d354c31dap+0, -0x1.55e0e6d878000p-4}, +{0x1.151b990275fddp+0, -0x1.4485cdea1e000p-4}, +{0x1.13f0ea432d24cp+0, -0x1.333d94d6aa000p-4}, +{0x1.12c8b7210f9dap+0, -0x1.22079f8c56000p-4}, +{0x1.11a3028ecb531p+0, -0x1.10e4698622000p-4}, +{0x1.107fbda8434afp+0, -0x1.ffa6c6ad20000p-5}, +{0x1.0f5ee0f4e6bb3p+0, -0x1.dda8d4a774000p-5}, +{0x1.0e4065d2a9fcep+0, -0x1.bbcece4850000p-5}, +{0x1.0d244632ca521p+0, -0x1.9a1894012c000p-5}, +{0x1.0c0a77ce2981ap+0, -0x1.788583302c000p-5}, +{0x1.0af2f83c636d1p+0, -0x1.5715e67d68000p-5}, +{0x1.09ddb98a01339p+0, -0x1.35c8a49658000p-5}, +{0x1.08cabaf52e7dfp+0, -0x1.149e364154000p-5}, +{0x1.07b9f2f4e28fbp+0, -0x1.e72c082eb8000p-6}, +{0x1.06ab58c358f19p+0, 
-0x1.a55f152528000p-6}, +{0x1.059eea5ecf92cp+0, -0x1.63d62cf818000p-6}, +{0x1.04949cdd12c90p+0, -0x1.228fb8caa0000p-6}, +{0x1.038c6c6f0ada9p+0, -0x1.c317b20f90000p-7}, +{0x1.02865137932a9p+0, -0x1.419355daa0000p-7}, +{0x1.0182427ea7348p+0, -0x1.81203c2ec0000p-8}, +{0x1.008040614b195p+0, -0x1.0040979240000p-9}, +{0x1.fe01ff726fa1ap-1, 0x1.feff384900000p-9}, +{0x1.fa11cc261ea74p-1, 0x1.7dc41353d0000p-7}, +{0x1.f6310b081992ep-1, 0x1.3cea3c4c28000p-6}, +{0x1.f25f63ceeadcdp-1, 0x1.b9fc114890000p-6}, +{0x1.ee9c8039113e7p-1, 0x1.1b0d8ce110000p-5}, +{0x1.eae8078cbb1abp-1, 0x1.58a5bd001c000p-5}, +{0x1.e741aa29d0c9bp-1, 0x1.95c8340d88000p-5}, +{0x1.e3a91830a99b5p-1, 0x1.d276aef578000p-5}, +{0x1.e01e009609a56p-1, 0x1.07598e598c000p-4}, +{0x1.dca01e577bb98p-1, 0x1.253f5e30d2000p-4}, +{0x1.d92f20b7c9103p-1, 0x1.42edd8b380000p-4}, +{0x1.d5cac66fb5ccep-1, 0x1.606598757c000p-4}, +{0x1.d272caa5ede9dp-1, 0x1.7da76356a0000p-4}, +{0x1.cf26e3e6b2ccdp-1, 0x1.9ab434e1c6000p-4}, +{0x1.cbe6da2a77902p-1, 0x1.b78c7bb0d6000p-4}, +{0x1.c8b266d37086dp-1, 0x1.d431332e72000p-4}, +{0x1.c5894bd5d5804p-1, 0x1.f0a3171de6000p-4}, +{0x1.c26b533bb9f8cp-1, 0x1.067152b914000p-3}, +{0x1.bf583eeece73fp-1, 0x1.147858292b000p-3}, +{0x1.bc4fd75db96c1p-1, 0x1.2266ecdca3000p-3}, +{0x1.b951e0c864a28p-1, 0x1.303d7a6c55000p-3}, +{0x1.b65e2c5ef3e2cp-1, 0x1.3dfc33c331000p-3}, +{0x1.b374867c9888bp-1, 0x1.4ba366b7a8000p-3}, +{0x1.b094b211d304ap-1, 0x1.5933928d1f000p-3}, +{0x1.adbe885f2ef7ep-1, 0x1.66acd2418f000p-3}, +{0x1.aaf1d31603da2p-1, 0x1.740f8ec669000p-3}, +{0x1.a82e63fd358a7p-1, 0x1.815c0f51af000p-3}, +{0x1.a5740ef09738bp-1, 0x1.8e92954f68000p-3}, +{0x1.a2c2a90ab4b27p-1, 0x1.9bb3602f84000p-3}, +{0x1.a01a01393f2d1p-1, 0x1.a8bed1c2c0000p-3}, +{0x1.9d79f24db3c1bp-1, 0x1.b5b515c01d000p-3}, +{0x1.9ae2505c7b190p-1, 0x1.c2967ccbcc000p-3}, +{0x1.9852ef297ce2fp-1, 0x1.cf635d5486000p-3}, +{0x1.95cbaeea44b75p-1, 0x1.dc1bd3446c000p-3}, +{0x1.934c69de74838p-1, 0x1.e8c01b8cfe000p-3}, +{0x1.90d4f2f6752e6p-1, 0x1.f5509c0179000p-3}, +{0x1.8e6528effd79dp-1, 0x1.00e6c121fb800p-2}, +{0x1.8bfce9fcc007cp-1, 0x1.071b80e93d000p-2}, +{0x1.899c0dabec30ep-1, 0x1.0d46b9e867000p-2}, +{0x1.87427aa2317fbp-1, 0x1.13687334bd000p-2}, +{0x1.84f00acb39a08p-1, 0x1.1980d67234800p-2}, +{0x1.82a49e8653e55p-1, 0x1.1f8ffe0cc8000p-2}, +{0x1.8060195f40260p-1, 0x1.2595fd7636800p-2}, +{0x1.7e22563e0a329p-1, 0x1.2b9300914a800p-2}, +{0x1.7beb377dcb5adp-1, 0x1.3187210436000p-2}, +{0x1.79baa679725c2p-1, 0x1.377266dec1800p-2}, +{0x1.77907f2170657p-1, 0x1.3d54ffbaf3000p-2}, +{0x1.756cadbd6130cp-1, 0x1.432eee32fe000p-2}, +}, +#if !__FP_FAST_FMA +.tab2 = { +{0x1.61000014fb66bp-1, 0x1.e026c91425b3cp-56}, +{0x1.63000034db495p-1, 0x1.dbfea48005d41p-55}, +{0x1.650000d94d478p-1, 0x1.e7fa786d6a5b7p-55}, +{0x1.67000074e6fadp-1, 0x1.1fcea6b54254cp-57}, +{0x1.68ffffedf0faep-1, -0x1.c7e274c590efdp-56}, +{0x1.6b0000763c5bcp-1, -0x1.ac16848dcda01p-55}, +{0x1.6d0001e5cc1f6p-1, 0x1.33f1c9d499311p-55}, +{0x1.6efffeb05f63ep-1, -0x1.e80041ae22d53p-56}, +{0x1.710000e86978p-1, 0x1.bff6671097952p-56}, +{0x1.72ffffc67e912p-1, 0x1.c00e226bd8724p-55}, +{0x1.74fffdf81116ap-1, -0x1.e02916ef101d2p-57}, +{0x1.770000f679c9p-1, -0x1.7fc71cd549c74p-57}, +{0x1.78ffffa7ec835p-1, 0x1.1bec19ef50483p-55}, +{0x1.7affffe20c2e6p-1, -0x1.07e1729cc6465p-56}, +{0x1.7cfffed3fc9p-1, -0x1.08072087b8b1cp-55}, +{0x1.7efffe9261a76p-1, 0x1.dc0286d9df9aep-55}, +{0x1.81000049ca3e8p-1, 0x1.97fd251e54c33p-55}, +{0x1.8300017932c8fp-1, -0x1.afee9b630f381p-55}, +{0x1.850000633739cp-1, 0x1.9bfbf6b6535bcp-55}, +{0x1.87000204289c6p-1, 
-0x1.bbf65f3117b75p-55}, +{0x1.88fffebf57904p-1, -0x1.9006ea23dcb57p-55}, +{0x1.8b00022bc04dfp-1, -0x1.d00df38e04b0ap-56}, +{0x1.8cfffe50c1b8ap-1, -0x1.8007146ff9f05p-55}, +{0x1.8effffc918e43p-1, 0x1.3817bd07a7038p-55}, +{0x1.910001efa5fc7p-1, 0x1.93e9176dfb403p-55}, +{0x1.9300013467bb9p-1, 0x1.f804e4b980276p-56}, +{0x1.94fffe6ee076fp-1, -0x1.f7ef0d9ff622ep-55}, +{0x1.96fffde3c12d1p-1, -0x1.082aa962638bap-56}, +{0x1.98ffff4458a0dp-1, -0x1.7801b9164a8efp-55}, +{0x1.9afffdd982e3ep-1, -0x1.740e08a5a9337p-55}, +{0x1.9cfffed49fb66p-1, 0x1.fce08c19bep-60}, +{0x1.9f00020f19c51p-1, -0x1.a3faa27885b0ap-55}, +{0x1.a10001145b006p-1, 0x1.4ff489958da56p-56}, +{0x1.a300007bbf6fap-1, 0x1.cbeab8a2b6d18p-55}, +{0x1.a500010971d79p-1, 0x1.8fecadd78793p-55}, +{0x1.a70001df52e48p-1, -0x1.f41763dd8abdbp-55}, +{0x1.a90001c593352p-1, -0x1.ebf0284c27612p-55}, +{0x1.ab0002a4f3e4bp-1, -0x1.9fd043cff3f5fp-57}, +{0x1.acfffd7ae1ed1p-1, -0x1.23ee7129070b4p-55}, +{0x1.aefffee510478p-1, 0x1.a063ee00edea3p-57}, +{0x1.b0fffdb650d5bp-1, 0x1.a06c8381f0ab9p-58}, +{0x1.b2ffffeaaca57p-1, -0x1.9011e74233c1dp-56}, +{0x1.b4fffd995badcp-1, -0x1.9ff1068862a9fp-56}, +{0x1.b7000249e659cp-1, 0x1.aff45d0864f3ep-55}, +{0x1.b8ffff987164p-1, 0x1.cfe7796c2c3f9p-56}, +{0x1.bafffd204cb4fp-1, -0x1.3ff27eef22bc4p-57}, +{0x1.bcfffd2415c45p-1, -0x1.cffb7ee3bea21p-57}, +{0x1.beffff86309dfp-1, -0x1.14103972e0b5cp-55}, +{0x1.c0fffe1b57653p-1, 0x1.bc16494b76a19p-55}, +{0x1.c2ffff1fa57e3p-1, -0x1.4feef8d30c6edp-57}, +{0x1.c4fffdcbfe424p-1, -0x1.43f68bcec4775p-55}, +{0x1.c6fffed54b9f7p-1, 0x1.47ea3f053e0ecp-55}, +{0x1.c8fffeb998fd5p-1, 0x1.383068df992f1p-56}, +{0x1.cb0002125219ap-1, -0x1.8fd8e64180e04p-57}, +{0x1.ccfffdd94469cp-1, 0x1.e7ebe1cc7ea72p-55}, +{0x1.cefffeafdc476p-1, 0x1.ebe39ad9f88fep-55}, +{0x1.d1000169af82bp-1, 0x1.57d91a8b95a71p-56}, +{0x1.d30000d0ff71dp-1, 0x1.9c1906970c7dap-55}, +{0x1.d4fffea790fc4p-1, -0x1.80e37c558fe0cp-58}, +{0x1.d70002edc87e5p-1, -0x1.f80d64dc10f44p-56}, +{0x1.d900021dc82aap-1, -0x1.47c8f94fd5c5cp-56}, +{0x1.dafffd86b0283p-1, 0x1.c7f1dc521617ep-55}, +{0x1.dd000296c4739p-1, 0x1.8019eb2ffb153p-55}, +{0x1.defffe54490f5p-1, 0x1.e00d2c652cc89p-57}, +{0x1.e0fffcdabf694p-1, -0x1.f8340202d69d2p-56}, +{0x1.e2fffdb52c8ddp-1, 0x1.b00c1ca1b0864p-56}, +{0x1.e4ffff24216efp-1, 0x1.2ffa8b094ab51p-56}, +{0x1.e6fffe88a5e11p-1, -0x1.7f673b1efbe59p-58}, +{0x1.e9000119eff0dp-1, -0x1.4808d5e0bc801p-55}, +{0x1.eafffdfa51744p-1, 0x1.80006d54320b5p-56}, +{0x1.ed0001a127fa1p-1, -0x1.002f860565c92p-58}, +{0x1.ef00007babcc4p-1, -0x1.540445d35e611p-55}, +{0x1.f0ffff57a8d02p-1, -0x1.ffb3139ef9105p-59}, +{0x1.f30001ee58ac7p-1, 0x1.a81acf2731155p-55}, +{0x1.f4ffff5823494p-1, 0x1.a3f41d4d7c743p-55}, +{0x1.f6ffffca94c6bp-1, -0x1.202f41c987875p-57}, +{0x1.f8fffe1f9c441p-1, 0x1.77dd1f477e74bp-56}, +{0x1.fafffd2e0e37ep-1, -0x1.f01199a7ca331p-57}, +{0x1.fd0001c77e49ep-1, 0x1.181ee4bceacb1p-56}, +{0x1.feffff7e0c331p-1, -0x1.e05370170875ap-57}, +{0x1.00ffff465606ep+0, -0x1.a7ead491c0adap-55}, +{0x1.02ffff3867a58p+0, -0x1.77f69c3fcb2ep-54}, +{0x1.04ffffdfc0d17p+0, 0x1.7bffe34cb945bp-54}, +{0x1.0700003cd4d82p+0, 0x1.20083c0e456cbp-55}, +{0x1.08ffff9f2cbe8p+0, -0x1.dffdfbe37751ap-57}, +{0x1.0b000010cda65p+0, -0x1.13f7faee626ebp-54}, +{0x1.0d00001a4d338p+0, 0x1.07dfa79489ff7p-55}, +{0x1.0effffadafdfdp+0, -0x1.7040570d66bcp-56}, +{0x1.110000bbafd96p+0, 0x1.e80d4846d0b62p-55}, +{0x1.12ffffae5f45dp+0, 0x1.dbffa64fd36efp-54}, +{0x1.150000dd59ad9p+0, 0x1.a0077701250aep-54}, +{0x1.170000f21559ap+0, 0x1.dfdf9e2e3deeep-55}, +{0x1.18ffffc275426p+0, 
0x1.10030dc3b7273p-54}, +{0x1.1b000123d3c59p+0, 0x1.97f7980030188p-54}, +{0x1.1cffff8299eb7p+0, -0x1.5f932ab9f8c67p-57}, +{0x1.1effff48ad4p+0, 0x1.37fbf9da75bebp-54}, +{0x1.210000c8b86a4p+0, 0x1.f806b91fd5b22p-54}, +{0x1.2300003854303p+0, 0x1.3ffc2eb9fbf33p-54}, +{0x1.24fffffbcf684p+0, 0x1.601e77e2e2e72p-56}, +{0x1.26ffff52921d9p+0, 0x1.ffcbb767f0c61p-56}, +{0x1.2900014933a3cp+0, -0x1.202ca3c02412bp-56}, +{0x1.2b00014556313p+0, -0x1.2808233f21f02p-54}, +{0x1.2cfffebfe523bp+0, -0x1.8ff7e384fdcf2p-55}, +{0x1.2f0000bb8ad96p+0, -0x1.5ff51503041c5p-55}, +{0x1.30ffffb7ae2afp+0, -0x1.10071885e289dp-55}, +{0x1.32ffffeac5f7fp+0, -0x1.1ff5d3fb7b715p-54}, +{0x1.350000ca66756p+0, 0x1.57f82228b82bdp-54}, +{0x1.3700011fbf721p+0, 0x1.000bac40dd5ccp-55}, +{0x1.38ffff9592fb9p+0, -0x1.43f9d2db2a751p-54}, +{0x1.3b00004ddd242p+0, 0x1.57f6b707638e1p-55}, +{0x1.3cffff5b2c957p+0, 0x1.a023a10bf1231p-56}, +{0x1.3efffeab0b418p+0, 0x1.87f6d66b152bp-54}, +{0x1.410001532aff4p+0, 0x1.7f8375f198524p-57}, +{0x1.4300017478b29p+0, 0x1.301e672dc5143p-55}, +{0x1.44fffe795b463p+0, 0x1.9ff69b8b2895ap-55}, +{0x1.46fffe80475ep+0, -0x1.5c0b19bc2f254p-54}, +{0x1.48fffef6fc1e7p+0, 0x1.b4009f23a2a72p-54}, +{0x1.4afffe5bea704p+0, -0x1.4ffb7bf0d7d45p-54}, +{0x1.4d000171027dep+0, -0x1.9c06471dc6a3dp-54}, +{0x1.4f0000ff03ee2p+0, 0x1.77f890b85531cp-54}, +{0x1.5100012dc4bd1p+0, 0x1.004657166a436p-57}, +{0x1.530001605277ap+0, -0x1.6bfcece233209p-54}, +{0x1.54fffecdb704cp+0, -0x1.902720505a1d7p-55}, +{0x1.56fffef5f54a9p+0, 0x1.bbfe60ec96412p-54}, +{0x1.5900017e61012p+0, 0x1.87ec581afef9p-55}, +{0x1.5b00003c93e92p+0, -0x1.f41080abf0ccp-54}, +{0x1.5d0001d4919bcp+0, -0x1.8812afb254729p-54}, +{0x1.5efffe7b87a89p+0, -0x1.47eb780ed6904p-54}, +}, +#endif +}; diff --git a/libs/libglibc-compatibility/musl/log_data.h b/libs/libglibc-compatibility/musl/log_data.h new file mode 100644 index 00000000000..5fb90329a32 --- /dev/null +++ b/libs/libglibc-compatibility/musl/log_data.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ +#ifndef _LOG_DATA_H +#define _LOG_DATA_H + +#include "musl_features.h" + +#define LOG_TABLE_BITS 7 +#define LOG_POLY_ORDER 6 +#define LOG_POLY1_ORDER 12 +extern hidden const struct log_data { + double ln2hi; + double ln2lo; + double poly[LOG_POLY_ORDER - 1]; /* First coefficient is 1. */ + double poly1[LOG_POLY1_ORDER - 1]; + struct { + double invc, logc; + } tab[1 << LOG_TABLE_BITS]; +#if !__FP_FAST_FMA + struct { + double chi, clo; + } tab2[1 << LOG_TABLE_BITS]; +#endif +} __log_data; + +#endif diff --git a/libs/libglibc-compatibility/musl/logf.c b/libs/libglibc-compatibility/musl/logf.c index bb2cf39405b..7ee5d7fe623 100644 --- a/libs/libglibc-compatibility/musl/logf.c +++ b/libs/libglibc-compatibility/musl/logf.c @@ -7,11 +7,9 @@ #include #include +#include "libm.h" #include "logf_data.h" -float __math_invalidf(float); -float __math_divzerof(uint32_t); - /* LOGF_TABLE_BITS = 4 LOGF_POLY_ORDER = 4 @@ -25,21 +23,6 @@ Relative error: 1.957 * 2^-26 (before rounding.) #define Ln2 __logf_data.ln2 #define N (1 << LOGF_TABLE_BITS) #define OFF 0x3f330000 -#define WANT_ROUNDING 1 - -#define asuint(f) ((union{float _f; uint32_t _i;}){f})._i -#define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f - -/* Evaluate an expression as the specified type. With standard excess - precision handling a type cast or assignment is enough (with - -ffloat-store an assignment is required, in old compilers argument - passing and return statement may not drop excess precision). 
*/ - -static inline float eval_as_float(float x) -{ - float y = x; - return y; -} float logf(float x) { @@ -49,9 +32,9 @@ float logf(float x) ix = asuint(x); /* Fix sign of zero with downward rounding when x==1. */ - if (WANT_ROUNDING && __builtin_expect(ix == 0x3f800000, 0)) + if (WANT_ROUNDING && predict_false(ix == 0x3f800000)) return 0; - if (__builtin_expect(ix - 0x00800000 >= 0x7f800000 - 0x00800000, 0)) { + if (predict_false(ix - 0x00800000 >= 0x7f800000 - 0x00800000)) { /* x < 0x1p-126 or inf or nan. */ if (ix * 2 == 0) return __math_divzerof(1); diff --git a/libs/libglibc-compatibility/musl/logf_data.h b/libs/libglibc-compatibility/musl/logf_data.h index a11a8984cc1..278b7a72a16 100644 --- a/libs/libglibc-compatibility/musl/logf_data.h +++ b/libs/libglibc-compatibility/musl/logf_data.h @@ -5,9 +5,11 @@ #ifndef _LOGF_DATA_H #define _LOGF_DATA_H +#include "musl_features.h" + #define LOGF_TABLE_BITS 4 #define LOGF_POLY_ORDER 4 -extern __attribute__((__visibility__("hidden"))) const struct logf_data { +extern hidden const struct logf_data { struct { double invc, logc; } tab[1 << LOGF_TABLE_BITS]; diff --git a/libs/libglibc-compatibility/musl/musl_features.h b/libs/libglibc-compatibility/musl/musl_features.h new file mode 100644 index 00000000000..b656efcf4d6 --- /dev/null +++ b/libs/libglibc-compatibility/musl/musl_features.h @@ -0,0 +1,8 @@ +#pragma once + +#define weak __attribute__((__weak__)) +#define hidden __attribute__((__visibility__("hidden"))) +#define weak_alias(old, new) \ + extern __typeof(old) new __attribute__((__weak__, __alias__(#old))) + +#define predict_false(x) __builtin_expect(x, 0) diff --git a/libs/libglibc-compatibility/musl/pow.c b/libs/libglibc-compatibility/musl/pow.c new file mode 100644 index 00000000000..694c2ef64d0 --- /dev/null +++ b/libs/libglibc-compatibility/musl/pow.c @@ -0,0 +1,343 @@ +/* + * Double-precision x^y function. + * + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include "libm.h" +#include "exp_data.h" +#include "pow_data.h" + +/* +Worst-case error: 0.54 ULP (~= ulperr_exp + 1024*Ln2*relerr_log*2^53) +relerr_log: 1.3 * 2^-68 (Relative error of log, 1.5 * 2^-68 without fma) +ulperr_exp: 0.509 ULP (ULP error of exp, 0.511 ULP without fma) +*/ + +#define T __pow_log_data.tab +#define A __pow_log_data.poly +#define Ln2hi __pow_log_data.ln2hi +#define Ln2lo __pow_log_data.ln2lo +#define N (1 << POW_LOG_TABLE_BITS) +#define OFF 0x3fe6955500000000 + +/* Top 12 bits of a double (sign and exponent bits). */ +static inline uint32_t top12(double x) +{ + return asuint64(x) >> 52; +} + +/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about + additional 15 bits precision. IX is the bit representation of x, but + normalized in the subnormal range using the sign bit for the exponent. */ +static inline double_t log_inline(uint64_t ix, double_t *tail) +{ + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t z, r, y, invc, logc, logctail, kd, hi, t1, t2, lo, lo1, lo2, p; + uint64_t iz, tmp; + int k, i; + + /* x = 2^k z; where z is in range [OFF,2*OFF) and exact. + The range is split into N subintervals. + The ith subinterval contains z and c is near its center. */ + tmp = ix - OFF; + i = (tmp >> (52 - POW_LOG_TABLE_BITS)) % N; + k = (int64_t)tmp >> 52; /* arithmetic shift */ + iz = ix - (tmp & 0xfffULL << 52); + z = asdouble(iz); + kd = (double_t)k; + + /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). 
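log(c) is read as logc + logctail and k*Ln2 is split into Ln2hi/Ln2lo pieces, so k*Ln2hi + logc is exact by construction of the table; the small corrections are gathered into lo1..lo4 and returned through *tail as roughly 15 extra bits.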
*/ + invc = T[i].invc; + logc = T[i].logc; + logctail = T[i].logctail; + + /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and + |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible. */ +#if __FP_FAST_FMA + r = __builtin_fma(z, invc, -1.0); +#else + /* Split z such that rhi, rlo and rhi*rhi are exact and |rlo| <= |r|. */ + double_t zhi = asdouble((iz + (1ULL << 31)) & (-1ULL << 32)); + double_t zlo = z - zhi; + double_t rhi = zhi * invc - 1.0; + double_t rlo = zlo * invc; + r = rhi + rlo; +#endif + + /* k*Ln2 + log(c) + r. */ + t1 = kd * Ln2hi + logc; + t2 = t1 + r; + lo1 = kd * Ln2lo + logctail; + lo2 = t1 - t2 + r; + + /* Evaluation is optimized assuming superscalar pipelined execution. */ + double_t ar, ar2, ar3, lo3, lo4; + ar = A[0] * r; /* A[0] = -0.5. */ + ar2 = r * ar; + ar3 = r * ar2; + /* k*Ln2 + log(c) + r + A[0]*r*r. */ +#if __FP_FAST_FMA + hi = t2 + ar2; + lo3 = __builtin_fma(ar, r, -ar2); + lo4 = t2 - hi + ar2; +#else + double_t arhi = A[0] * rhi; + double_t arhi2 = rhi * arhi; + hi = t2 + arhi2; + lo3 = rlo * (ar + arhi); + lo4 = t2 - hi + arhi2; +#endif + /* p = log1p(r) - r - A[0]*r*r. */ + p = (ar3 * (A[1] + r * A[2] + + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r * A[6])))); + lo = lo1 + lo2 + lo3 + lo4 + p; + y = hi + lo; + *tail = hi - y + lo; + return y; +} + +#undef N +#undef T +#define N (1 << EXP_TABLE_BITS) +#define InvLn2N __exp_data.invln2N +#define NegLn2hiN __exp_data.negln2hiN +#define NegLn2loN __exp_data.negln2loN +#define Shift __exp_data.shift +#define T __exp_data.tab +#define C2 __exp_data.poly[5 - EXP_POLY_ORDER] +#define C3 __exp_data.poly[6 - EXP_POLY_ORDER] +#define C4 __exp_data.poly[7 - EXP_POLY_ORDER] +#define C5 __exp_data.poly[8 - EXP_POLY_ORDER] +#define C6 __exp_data.poly[9 - EXP_POLY_ORDER] + +/* Handle cases that may overflow or underflow when computing the result that + is scale*(1+TMP) without intermediate rounding. The bit representation of + scale is in SBITS, however it has a computed exponent that may have + overflown into the sign bit so that needs to be adjusted before using it as + a double. (int32_t)KI is the k used in the argument reduction and exponent + adjustment of scale, positive k here means the result may overflow and + negative k means the result may underflow. */ +static inline double specialcase(double_t tmp, uint64_t sbits, uint64_t ki) +{ + double_t scale, y; + + if ((ki & 0x80000000) == 0) { + /* k > 0, the exponent of scale might have overflowed by <= 460. */ + sbits -= 1009ull << 52; + scale = asdouble(sbits); + y = 0x1p1009 * (scale + scale * tmp); + return eval_as_double(y); + } + /* k < 0, need special care in the subnormal range. */ + sbits += 1022ull << 52; + /* Note: sbits is signed scale. */ + scale = asdouble(sbits); + y = scale + scale * tmp; + if (fabs(y) < 1.0) { + /* Round y to the right precision before scaling it into the subnormal + range to avoid double rounding that can cause 0.5+E/2 ulp error where + E is the worst-case ulp error outside the subnormal range. So this + is only useful if the goal is better than 1 ulp worst-case error. */ + double_t hi, lo, one = 1.0; + if (y < 0.0) + one = -1.0; + lo = scale - y + scale * tmp; + hi = one + y; + lo = one - hi + y + lo; + y = eval_as_double(hi + lo) - one; + /* Fix the sign of 0. */ + if (y == 0.0) + y = asdouble(sbits & 0x8000000000000000); + /* The underflow exception needs to be signaled explicitly. 
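fp_barrier stops the constant from being folded away and fp_force_eval makes the subnormal-producing multiply observable, so FE_UNDERFLOW is raised even though the product itself is discarded.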
*/ + fp_force_eval(fp_barrier(0x1p-1022) * 0x1p-1022); + } + y = 0x1p-1022 * y; + return eval_as_double(y); +} + +#define SIGN_BIAS (0x800 << EXP_TABLE_BITS) + +/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|. + The sign_bias argument is SIGN_BIAS or 0 and sets the sign to -1 or 1. */ +static inline double exp_inline(double_t x, double_t xtail, uint32_t sign_bias) +{ + uint32_t abstop; + uint64_t ki, idx, top, sbits; + /* double_t for better performance on targets with FLT_EVAL_METHOD==2. */ + double_t kd, z, r, r2, scale, tail, tmp; + + abstop = top12(x) & 0x7ff; + if (predict_false(abstop - top12(0x1p-54) >= + top12(512.0) - top12(0x1p-54))) { + if (abstop - top12(0x1p-54) >= 0x80000000) { + /* Avoid spurious underflow for tiny x. */ + /* Note: 0 is common input. */ + double_t one = WANT_ROUNDING ? 1.0 + x : 1.0; + return sign_bias ? -one : one; + } + if (abstop >= top12(1024.0)) { + /* Note: inf and nan are already handled. */ + if (asuint64(x) >> 63) + return __math_uflow(sign_bias); + else + return __math_oflow(sign_bias); + } + /* Large x is special cased below. */ + abstop = 0; + } + + /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */ + /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */ + z = InvLn2N * x; +#if TOINT_INTRINSICS + kd = roundtoint(z); + ki = converttoint(z); +#elif EXP_USE_TOINT_NARROW + /* z - kd is in [-0.5-2^-16, 0.5] in all rounding modes. */ + kd = eval_as_double(z + Shift); + ki = asuint64(kd) >> 16; + kd = (double_t)(int32_t)ki; +#else + /* z - kd is in [-1, 1] in non-nearest rounding modes. */ + kd = eval_as_double(z + Shift); + ki = asuint64(kd); + kd -= Shift; +#endif + r = x + kd * NegLn2hiN + kd * NegLn2loN; + /* The code assumes 2^-200 < |xtail| < 2^-8/N. */ + r += xtail; + /* 2^(k/N) ~= scale * (1 + tail). */ + idx = 2 * (ki % N); + top = (ki + sign_bias) << (52 - EXP_TABLE_BITS); + tail = asdouble(T[idx]); + /* This is only a valid scale when -1023*N < k < 1024*N. */ + sbits = T[idx + 1] + top; + /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (tail + exp(r) - 1). */ + /* Evaluation is optimized assuming superscalar pipelined execution. */ + r2 = r * r; + /* Without fma the worst case error is 0.25/N ulp larger. */ + /* Worst case error is less than 0.5+1.11/N+(abs poly error * 2^53) ulp. */ + tmp = tail + r + r2 * (C2 + r * C3) + r2 * r2 * (C4 + r * C5); + if (predict_false(abstop == 0)) + return specialcase(tmp, sbits, ki); + scale = asdouble(sbits); + /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there + is no spurious underflow here even without fma. */ + return eval_as_double(scale + scale * tmp); +} + +/* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is + the bit representation of a non-zero finite floating-point value. */ +static inline int checkint(uint64_t iy) +{ + int e = iy >> 52 & 0x7ff; + if (e < 0x3ff) + return 0; + if (e > 0x3ff + 52) + return 2; + if (iy & ((1ULL << (0x3ff + 52 - e)) - 1)) + return 0; + if (iy & (1ULL << (0x3ff + 52 - e))) + return 1; + return 2; +} + +/* Returns 1 if input is the bit representation of 0, infinity or nan. 
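Multiplying by 2 shifts out the sign bit; for +-0 the subtraction wraps around to UINT64_MAX, and for inf or nan 2*i is at least 2*asuint64(INFINITY), so both extremes satisfy the single unsigned comparison below.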
*/ +static inline int zeroinfnan(uint64_t i) +{ + return 2 * i - 1 >= 2 * asuint64(INFINITY) - 1; +} + +double pow(double x, double y) +{ + uint32_t sign_bias = 0; + uint64_t ix, iy; + uint32_t topx, topy; + + ix = asuint64(x); + iy = asuint64(y); + topx = top12(x); + topy = top12(y); + if (predict_false(topx - 0x001 >= 0x7ff - 0x001 || + (topy & 0x7ff) - 0x3be >= 0x43e - 0x3be)) { + /* Note: if |y| > 1075 * ln2 * 2^53 ~= 0x1.749p62 then pow(x,y) = inf/0 + and if |y| < 2^-54 / 1075 ~= 0x1.e7b6p-65 then pow(x,y) = +-1. */ + /* Special cases: (x < 0x1p-126 or inf or nan) or + (|y| < 0x1p-65 or |y| >= 0x1p63 or nan). */ + if (predict_false(zeroinfnan(iy))) { + if (2 * iy == 0) + return issignaling_inline(x) ? x + y : 1.0; + if (ix == asuint64(1.0)) + return issignaling_inline(y) ? x + y : 1.0; + if (2 * ix > 2 * asuint64(INFINITY) || + 2 * iy > 2 * asuint64(INFINITY)) + return x + y; + if (2 * ix == 2 * asuint64(1.0)) + return 1.0; + if ((2 * ix < 2 * asuint64(1.0)) == !(iy >> 63)) + return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */ + return y * y; + } + if (predict_false(zeroinfnan(ix))) { + double_t x2 = x * x; + if (ix >> 63 && checkint(iy) == 1) + x2 = -x2; + /* Without the barrier some versions of clang hoist the 1/x2 and + thus division by zero exception can be signaled spuriously. */ + return iy >> 63 ? fp_barrier(1 / x2) : x2; + } + /* Here x and y are non-zero finite. */ + if (ix >> 63) { + /* Finite x < 0. */ + int yint = checkint(iy); + if (yint == 0) + return __math_invalid(x); + if (yint == 1) + sign_bias = SIGN_BIAS; + ix &= 0x7fffffffffffffff; + topx &= 0x7ff; + } + if ((topy & 0x7ff) - 0x3be >= 0x43e - 0x3be) { + /* Note: sign_bias == 0 here because y is not odd. */ + if (ix == asuint64(1.0)) + return 1.0; + if ((topy & 0x7ff) < 0x3be) { + /* |y| < 2^-65, x^y ~= 1 + y*log(x). */ + if (WANT_ROUNDING) + return ix > asuint64(1.0) ? 1.0 + y : + 1.0 - y; + else + return 1.0; + } + return (ix > asuint64(1.0)) == (topy < 0x800) ? + __math_oflow(0) : + __math_uflow(0); + } + if (topx == 0) { + /* Normalize subnormal x so exponent becomes negative. */ + ix = asuint64(x * 0x1p52); + ix &= 0x7fffffffffffffff; + ix -= 52ULL << 52; + } + } + + double_t lo; + double_t hi = log_inline(ix, &lo); + double_t ehi, elo; +#if __FP_FAST_FMA + ehi = y * hi; + elo = y * lo + __builtin_fma(y, hi, -ehi); +#else + double_t yhi = asdouble(iy & -1ULL << 27); + double_t ylo = y - yhi; + double_t lhi = asdouble(asuint64(hi) & -1ULL << 27); + double_t llo = hi - lhi + lo; + ehi = yhi * lhi; + elo = ylo * lhi + y * llo; /* |elo| < |ehi| * 2^-25. */ +#endif + return exp_inline(ehi, elo, sign_bias); +} diff --git a/libs/libglibc-compatibility/musl/pow_data.c b/libs/libglibc-compatibility/musl/pow_data.c new file mode 100644 index 00000000000..81e760de196 --- /dev/null +++ b/libs/libglibc-compatibility/musl/pow_data.c @@ -0,0 +1,180 @@ +/* + * Data for the log part of pow. + * + * Copyright (c) 2018, Arm Limited. + * SPDX-License-Identifier: MIT + */ + +#include "pow_data.h" + +#define N (1 << POW_LOG_TABLE_BITS) + +const struct pow_log_data __pow_log_data = { +.ln2hi = 0x1.62e42fefa3800p-1, +.ln2lo = 0x1.ef35793c76730p-45, +.poly = { +// relative error: 0x1.11922ap-70 +// in -0x1.6bp-8 0x1.6bp-8 +// Coefficients are scaled to match the scaling during evaluation. 
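+// For example, the r^3 coefficient of log1p is ~1/3 but is stored times -2 because
+// log_inline in pow.c multiplies it by ar3 = -r^3/2; the factors -2, 4 and -8 undo the
+// powers of ar = -r/2 introduced by the nested evaluation.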
+-0x1p-1, +0x1.555555555556p-2 * -2, +-0x1.0000000000006p-2 * -2, +0x1.999999959554ep-3 * 4, +-0x1.555555529a47ap-3 * 4, +0x1.2495b9b4845e9p-3 * -8, +-0x1.0002b8b263fc3p-3 * -8, +}, +/* Algorithm: + + x = 2^k z + log(x) = k ln2 + log(c) + log(z/c) + log(z/c) = poly(z/c - 1) + +where z is in [0x1.69555p-1; 0x1.69555p0] which is split into N subintervals +and z falls into the ith one, then table entries are computed as + + tab[i].invc = 1/c + tab[i].logc = round(0x1p43*log(c))/0x1p43 + tab[i].logctail = (double)(log(c) - logc) + +where c is chosen near the center of the subinterval such that 1/c has only a +few precision bits so z/c - 1 is exactly representible as double: + + 1/c = center < 1 ? round(N/center)/N : round(2*N/center)/N/2 + +Note: |z/c - 1| < 1/N for the chosen c, |log(c) - logc - logctail| < 0x1p-97, +the last few bits of logc are rounded away so k*ln2hi + logc has no rounding +error and the interval for z is selected such that near x == 1, where log(x) +is tiny, large cancellation error is avoided in logc + poly(z/c - 1). */ +.tab = { +#define A(a, b, c) {a, 0, b, c}, +A(0x1.6a00000000000p+0, -0x1.62c82f2b9c800p-2, 0x1.ab42428375680p-48) +A(0x1.6800000000000p+0, -0x1.5d1bdbf580800p-2, -0x1.ca508d8e0f720p-46) +A(0x1.6600000000000p+0, -0x1.5767717455800p-2, -0x1.362a4d5b6506dp-45) +A(0x1.6400000000000p+0, -0x1.51aad872df800p-2, -0x1.684e49eb067d5p-49) +A(0x1.6200000000000p+0, -0x1.4be5f95777800p-2, -0x1.41b6993293ee0p-47) +A(0x1.6000000000000p+0, -0x1.4618bc21c6000p-2, 0x1.3d82f484c84ccp-46) +A(0x1.5e00000000000p+0, -0x1.404308686a800p-2, 0x1.c42f3ed820b3ap-50) +A(0x1.5c00000000000p+0, -0x1.3a64c55694800p-2, 0x1.0b1c686519460p-45) +A(0x1.5a00000000000p+0, -0x1.347dd9a988000p-2, 0x1.5594dd4c58092p-45) +A(0x1.5800000000000p+0, -0x1.2e8e2bae12000p-2, 0x1.67b1e99b72bd8p-45) +A(0x1.5600000000000p+0, -0x1.2895a13de8800p-2, 0x1.5ca14b6cfb03fp-46) +A(0x1.5600000000000p+0, -0x1.2895a13de8800p-2, 0x1.5ca14b6cfb03fp-46) +A(0x1.5400000000000p+0, -0x1.22941fbcf7800p-2, -0x1.65a242853da76p-46) +A(0x1.5200000000000p+0, -0x1.1c898c1699800p-2, -0x1.fafbc68e75404p-46) +A(0x1.5000000000000p+0, -0x1.1675cababa800p-2, 0x1.f1fc63382a8f0p-46) +A(0x1.4e00000000000p+0, -0x1.1058bf9ae4800p-2, -0x1.6a8c4fd055a66p-45) +A(0x1.4c00000000000p+0, -0x1.0a324e2739000p-2, -0x1.c6bee7ef4030ep-47) +A(0x1.4a00000000000p+0, -0x1.0402594b4d000p-2, -0x1.036b89ef42d7fp-48) +A(0x1.4a00000000000p+0, -0x1.0402594b4d000p-2, -0x1.036b89ef42d7fp-48) +A(0x1.4800000000000p+0, -0x1.fb9186d5e4000p-3, 0x1.d572aab993c87p-47) +A(0x1.4600000000000p+0, -0x1.ef0adcbdc6000p-3, 0x1.b26b79c86af24p-45) +A(0x1.4400000000000p+0, -0x1.e27076e2af000p-3, -0x1.72f4f543fff10p-46) +A(0x1.4200000000000p+0, -0x1.d5c216b4fc000p-3, 0x1.1ba91bbca681bp-45) +A(0x1.4000000000000p+0, -0x1.c8ff7c79aa000p-3, 0x1.7794f689f8434p-45) +A(0x1.4000000000000p+0, -0x1.c8ff7c79aa000p-3, 0x1.7794f689f8434p-45) +A(0x1.3e00000000000p+0, -0x1.bc286742d9000p-3, 0x1.94eb0318bb78fp-46) +A(0x1.3c00000000000p+0, -0x1.af3c94e80c000p-3, 0x1.a4e633fcd9066p-52) +A(0x1.3a00000000000p+0, -0x1.a23bc1fe2b000p-3, -0x1.58c64dc46c1eap-45) +A(0x1.3a00000000000p+0, -0x1.a23bc1fe2b000p-3, -0x1.58c64dc46c1eap-45) +A(0x1.3800000000000p+0, -0x1.9525a9cf45000p-3, -0x1.ad1d904c1d4e3p-45) +A(0x1.3600000000000p+0, -0x1.87fa06520d000p-3, 0x1.bbdbf7fdbfa09p-45) +A(0x1.3400000000000p+0, -0x1.7ab890210e000p-3, 0x1.bdb9072534a58p-45) +A(0x1.3400000000000p+0, -0x1.7ab890210e000p-3, 0x1.bdb9072534a58p-45) +A(0x1.3200000000000p+0, -0x1.6d60fe719d000p-3, -0x1.0e46aa3b2e266p-46) +A(0x1.3000000000000p+0, 
-0x1.5ff3070a79000p-3, -0x1.e9e439f105039p-46) +A(0x1.3000000000000p+0, -0x1.5ff3070a79000p-3, -0x1.e9e439f105039p-46) +A(0x1.2e00000000000p+0, -0x1.526e5e3a1b000p-3, -0x1.0de8b90075b8fp-45) +A(0x1.2c00000000000p+0, -0x1.44d2b6ccb8000p-3, 0x1.70cc16135783cp-46) +A(0x1.2c00000000000p+0, -0x1.44d2b6ccb8000p-3, 0x1.70cc16135783cp-46) +A(0x1.2a00000000000p+0, -0x1.371fc201e9000p-3, 0x1.178864d27543ap-48) +A(0x1.2800000000000p+0, -0x1.29552f81ff000p-3, -0x1.48d301771c408p-45) +A(0x1.2600000000000p+0, -0x1.1b72ad52f6000p-3, -0x1.e80a41811a396p-45) +A(0x1.2600000000000p+0, -0x1.1b72ad52f6000p-3, -0x1.e80a41811a396p-45) +A(0x1.2400000000000p+0, -0x1.0d77e7cd09000p-3, 0x1.a699688e85bf4p-47) +A(0x1.2400000000000p+0, -0x1.0d77e7cd09000p-3, 0x1.a699688e85bf4p-47) +A(0x1.2200000000000p+0, -0x1.fec9131dbe000p-4, -0x1.575545ca333f2p-45) +A(0x1.2000000000000p+0, -0x1.e27076e2b0000p-4, 0x1.a342c2af0003cp-45) +A(0x1.2000000000000p+0, -0x1.e27076e2b0000p-4, 0x1.a342c2af0003cp-45) +A(0x1.1e00000000000p+0, -0x1.c5e548f5bc000p-4, -0x1.d0c57585fbe06p-46) +A(0x1.1c00000000000p+0, -0x1.a926d3a4ae000p-4, 0x1.53935e85baac8p-45) +A(0x1.1c00000000000p+0, -0x1.a926d3a4ae000p-4, 0x1.53935e85baac8p-45) +A(0x1.1a00000000000p+0, -0x1.8c345d631a000p-4, 0x1.37c294d2f5668p-46) +A(0x1.1a00000000000p+0, -0x1.8c345d631a000p-4, 0x1.37c294d2f5668p-46) +A(0x1.1800000000000p+0, -0x1.6f0d28ae56000p-4, -0x1.69737c93373dap-45) +A(0x1.1600000000000p+0, -0x1.51b073f062000p-4, 0x1.f025b61c65e57p-46) +A(0x1.1600000000000p+0, -0x1.51b073f062000p-4, 0x1.f025b61c65e57p-46) +A(0x1.1400000000000p+0, -0x1.341d7961be000p-4, 0x1.c5edaccf913dfp-45) +A(0x1.1400000000000p+0, -0x1.341d7961be000p-4, 0x1.c5edaccf913dfp-45) +A(0x1.1200000000000p+0, -0x1.16536eea38000p-4, 0x1.47c5e768fa309p-46) +A(0x1.1000000000000p+0, -0x1.f0a30c0118000p-5, 0x1.d599e83368e91p-45) +A(0x1.1000000000000p+0, -0x1.f0a30c0118000p-5, 0x1.d599e83368e91p-45) +A(0x1.0e00000000000p+0, -0x1.b42dd71198000p-5, 0x1.c827ae5d6704cp-46) +A(0x1.0e00000000000p+0, -0x1.b42dd71198000p-5, 0x1.c827ae5d6704cp-46) +A(0x1.0c00000000000p+0, -0x1.77458f632c000p-5, -0x1.cfc4634f2a1eep-45) +A(0x1.0c00000000000p+0, -0x1.77458f632c000p-5, -0x1.cfc4634f2a1eep-45) +A(0x1.0a00000000000p+0, -0x1.39e87b9fec000p-5, 0x1.502b7f526feaap-48) +A(0x1.0a00000000000p+0, -0x1.39e87b9fec000p-5, 0x1.502b7f526feaap-48) +A(0x1.0800000000000p+0, -0x1.f829b0e780000p-6, -0x1.980267c7e09e4p-45) +A(0x1.0800000000000p+0, -0x1.f829b0e780000p-6, -0x1.980267c7e09e4p-45) +A(0x1.0600000000000p+0, -0x1.7b91b07d58000p-6, -0x1.88d5493faa639p-45) +A(0x1.0400000000000p+0, -0x1.fc0a8b0fc0000p-7, -0x1.f1e7cf6d3a69cp-50) +A(0x1.0400000000000p+0, -0x1.fc0a8b0fc0000p-7, -0x1.f1e7cf6d3a69cp-50) +A(0x1.0200000000000p+0, -0x1.fe02a6b100000p-8, -0x1.9e23f0dda40e4p-46) +A(0x1.0200000000000p+0, -0x1.fe02a6b100000p-8, -0x1.9e23f0dda40e4p-46) +A(0x1.0000000000000p+0, 0x0.0000000000000p+0, 0x0.0000000000000p+0) +A(0x1.0000000000000p+0, 0x0.0000000000000p+0, 0x0.0000000000000p+0) +A(0x1.fc00000000000p-1, 0x1.0101575890000p-7, -0x1.0c76b999d2be8p-46) +A(0x1.f800000000000p-1, 0x1.0205658938000p-6, -0x1.3dc5b06e2f7d2p-45) +A(0x1.f400000000000p-1, 0x1.8492528c90000p-6, -0x1.aa0ba325a0c34p-45) +A(0x1.f000000000000p-1, 0x1.0415d89e74000p-5, 0x1.111c05cf1d753p-47) +A(0x1.ec00000000000p-1, 0x1.466aed42e0000p-5, -0x1.c167375bdfd28p-45) +A(0x1.e800000000000p-1, 0x1.894aa149fc000p-5, -0x1.97995d05a267dp-46) +A(0x1.e400000000000p-1, 0x1.ccb73cdddc000p-5, -0x1.a68f247d82807p-46) +A(0x1.e200000000000p-1, 0x1.eea31c006c000p-5, -0x1.e113e4fc93b7bp-47) 
+A(0x1.de00000000000p-1, 0x1.1973bd1466000p-4, -0x1.5325d560d9e9bp-45) +A(0x1.da00000000000p-1, 0x1.3bdf5a7d1e000p-4, 0x1.cc85ea5db4ed7p-45) +A(0x1.d600000000000p-1, 0x1.5e95a4d97a000p-4, -0x1.c69063c5d1d1ep-45) +A(0x1.d400000000000p-1, 0x1.700d30aeac000p-4, 0x1.c1e8da99ded32p-49) +A(0x1.d000000000000p-1, 0x1.9335e5d594000p-4, 0x1.3115c3abd47dap-45) +A(0x1.cc00000000000p-1, 0x1.b6ac88dad6000p-4, -0x1.390802bf768e5p-46) +A(0x1.ca00000000000p-1, 0x1.c885801bc4000p-4, 0x1.646d1c65aacd3p-45) +A(0x1.c600000000000p-1, 0x1.ec739830a2000p-4, -0x1.dc068afe645e0p-45) +A(0x1.c400000000000p-1, 0x1.fe89139dbe000p-4, -0x1.534d64fa10afdp-45) +A(0x1.c000000000000p-1, 0x1.1178e8227e000p-3, 0x1.1ef78ce2d07f2p-45) +A(0x1.be00000000000p-1, 0x1.1aa2b7e23f000p-3, 0x1.ca78e44389934p-45) +A(0x1.ba00000000000p-1, 0x1.2d1610c868000p-3, 0x1.39d6ccb81b4a1p-47) +A(0x1.b800000000000p-1, 0x1.365fcb0159000p-3, 0x1.62fa8234b7289p-51) +A(0x1.b400000000000p-1, 0x1.4913d8333b000p-3, 0x1.5837954fdb678p-45) +A(0x1.b200000000000p-1, 0x1.527e5e4a1b000p-3, 0x1.633e8e5697dc7p-45) +A(0x1.ae00000000000p-1, 0x1.6574ebe8c1000p-3, 0x1.9cf8b2c3c2e78p-46) +A(0x1.ac00000000000p-1, 0x1.6f0128b757000p-3, -0x1.5118de59c21e1p-45) +A(0x1.aa00000000000p-1, 0x1.7898d85445000p-3, -0x1.c661070914305p-46) +A(0x1.a600000000000p-1, 0x1.8beafeb390000p-3, -0x1.73d54aae92cd1p-47) +A(0x1.a400000000000p-1, 0x1.95a5adcf70000p-3, 0x1.7f22858a0ff6fp-47) +A(0x1.a000000000000p-1, 0x1.a93ed3c8ae000p-3, -0x1.8724350562169p-45) +A(0x1.9e00000000000p-1, 0x1.b31d8575bd000p-3, -0x1.c358d4eace1aap-47) +A(0x1.9c00000000000p-1, 0x1.bd087383be000p-3, -0x1.d4bc4595412b6p-45) +A(0x1.9a00000000000p-1, 0x1.c6ffbc6f01000p-3, -0x1.1ec72c5962bd2p-48) +A(0x1.9600000000000p-1, 0x1.db13db0d49000p-3, -0x1.aff2af715b035p-45) +A(0x1.9400000000000p-1, 0x1.e530effe71000p-3, 0x1.212276041f430p-51) +A(0x1.9200000000000p-1, 0x1.ef5ade4dd0000p-3, -0x1.a211565bb8e11p-51) +A(0x1.9000000000000p-1, 0x1.f991c6cb3b000p-3, 0x1.bcbecca0cdf30p-46) +A(0x1.8c00000000000p-1, 0x1.07138604d5800p-2, 0x1.89cdb16ed4e91p-48) +A(0x1.8a00000000000p-1, 0x1.0c42d67616000p-2, 0x1.7188b163ceae9p-45) +A(0x1.8800000000000p-1, 0x1.1178e8227e800p-2, -0x1.c210e63a5f01cp-45) +A(0x1.8600000000000p-1, 0x1.16b5ccbacf800p-2, 0x1.b9acdf7a51681p-45) +A(0x1.8400000000000p-1, 0x1.1bf99635a6800p-2, 0x1.ca6ed5147bdb7p-45) +A(0x1.8200000000000p-1, 0x1.214456d0eb800p-2, 0x1.a87deba46baeap-47) +A(0x1.7e00000000000p-1, 0x1.2bef07cdc9000p-2, 0x1.a9cfa4a5004f4p-45) +A(0x1.7c00000000000p-1, 0x1.314f1e1d36000p-2, -0x1.8e27ad3213cb8p-45) +A(0x1.7a00000000000p-1, 0x1.36b6776be1000p-2, 0x1.16ecdb0f177c8p-46) +A(0x1.7800000000000p-1, 0x1.3c25277333000p-2, 0x1.83b54b606bd5cp-46) +A(0x1.7600000000000p-1, 0x1.419b423d5e800p-2, 0x1.8e436ec90e09dp-47) +A(0x1.7400000000000p-1, 0x1.4718dc271c800p-2, -0x1.f27ce0967d675p-45) +A(0x1.7200000000000p-1, 0x1.4c9e09e173000p-2, -0x1.e20891b0ad8a4p-45) +A(0x1.7000000000000p-1, 0x1.522ae0738a000p-2, 0x1.ebe708164c759p-45) +A(0x1.6e00000000000p-1, 0x1.57bf753c8d000p-2, 0x1.fadedee5d40efp-46) +A(0x1.6c00000000000p-1, 0x1.5d5bddf596000p-2, -0x1.a0b2a08a465dcp-47) +}, +}; diff --git a/libs/libglibc-compatibility/musl/pow_data.h b/libs/libglibc-compatibility/musl/pow_data.h new file mode 100644 index 00000000000..4c3d084b220 --- /dev/null +++ b/libs/libglibc-compatibility/musl/pow_data.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2018, Arm Limited. 
+ * SPDX-License-Identifier: MIT + */ +#ifndef _POW_DATA_H +#define _POW_DATA_H + +#include "musl_features.h" + +#define POW_LOG_TABLE_BITS 7 +#define POW_LOG_POLY_ORDER 8 +extern hidden const struct pow_log_data { + double ln2hi; + double ln2lo; + double poly[POW_LOG_POLY_ORDER - 1]; /* First coefficient is 1. */ + /* Note: the pad field is unused, but allows slightly faster indexing. */ + struct { + double invc, pad, logc, logctail; + } tab[1 << POW_LOG_TABLE_BITS]; +} __pow_log_data; + +#endif From 8a579ee3a696d8589d6a0d8a39e3e04ea8b88770 Mon Sep 17 00:00:00 2001 From: Sergei Bocharov Date: Mon, 30 Sep 2019 10:24:02 +0300 Subject: [PATCH 305/309] Fixes after review --- docs/en/query_language/functions/string_functions.md | 4 ++-- docs/ru/query_language/functions/string_functions.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/query_language/functions/string_functions.md b/docs/en/query_language/functions/string_functions.md index 89114703530..02a8e1d64aa 100644 --- a/docs/en/query_language/functions/string_functions.md +++ b/docs/en/query_language/functions/string_functions.md @@ -157,13 +157,13 @@ Returns whether to end with the specified suffix. Returns 1 if the string ends w Returns 1 whether string starts with the specified prefix, otherwise it returns 0. ```sql -SELECT startsWith('string', 'str'); +SELECT startsWith('Spider-Man', 'Spi'); ``` **Returned values** - 1, if the string starts with the specified prefix. -- 0, if the string isn't start with the specified prefix. +- 0, if the string doesn't start with the specified prefix. **Example** diff --git a/docs/ru/query_language/functions/string_functions.md b/docs/ru/query_language/functions/string_functions.md index e8fcd737c61..68bb92add86 100644 --- a/docs/ru/query_language/functions/string_functions.md +++ b/docs/ru/query_language/functions/string_functions.md @@ -129,7 +129,7 @@ SELECT format('{} {}', 'Hello', 'World') Возвращает 1, если строка начинается указанным префиксом, в противном случае 0. ```sql -SELECT startsWith('string', 'str'); +SELECT startsWith('Spider-Man', 'Spi'); ``` **Возвращаемые значения** From 5466e6bb8accdcbd95eae56b8b60f81a5ad464d0 Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Mon, 30 Sep 2019 11:47:01 +0300 Subject: [PATCH 306/309] CLICKHOUSE-4669: some product placement (#7148) * Some product placement * rewrite next paragraph as well --- website/index.html | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/website/index.html b/website/index.html index 5c832745858..fcae470547e 100644 --- a/website/index.html +++ b/website/index.html @@ -427,15 +427,13 @@ clickhouse-client

     For other operating systems the easiest way to get started is using
-    official Docker images of ClickHouse.
-    Alternatively you can build ClickHouse from sources
-    according to the instruction.
+    official Docker images of ClickHouse, though this is not the only option.
+    Alternatively, you can easily get a running ClickHouse instance or cluster at
+    Yandex Managed Service for ClickHouse.

-    After installation proceed to tutorial or full
+    After you are connected to your ClickHouse server, you can proceed to tutorial or full
     documentation.

     Contacts

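For the Docker route mentioned in the page text above, a minimal session might look like the sketch below. It is only an illustration, not part of the patch: the container name is arbitrary, and the `yandex/clickhouse-server` / `yandex/clickhouse-client` image names are the official images as published around this time, so exact tags and flags may have changed since.

```bash
# Start a ClickHouse server container (container name and nofile limit are illustrative).
docker run -d --name ch-server --ulimit nofile=262144:262144 yandex/clickhouse-server

# Connect to it with the native client from a second, throwaway container.
docker run -it --rm --link ch-server:clickhouse-server yandex/clickhouse-client --host clickhouse-server
```

Once connected, the queries from the tutorial and the full documentation can be run as-is.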
    From b641c7e163a075c4e1564379431d9667ab9296b0 Mon Sep 17 00:00:00 2001 From: Sergei Bocharov Date: Mon, 30 Sep 2019 12:17:55 +0300 Subject: [PATCH 307/309] Fix replicate function description --- .../query_language/functions/other_functions.md | 15 +++++++++++---- .../query_language/functions/other_functions.md | 16 +++++++++++----- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/docs/en/query_language/functions/other_functions.md b/docs/en/query_language/functions/other_functions.md index b1471b7dfe4..56084256c10 100644 --- a/docs/en/query_language/functions/other_functions.md +++ b/docs/en/query_language/functions/other_functions.md @@ -653,14 +653,14 @@ The response to the request shows that ClickHouse applied the index in the same Because the index is sparse in ClickHouse, "extra" data ends up in the response when reading a range (in this case, the adjacent dates). Use the `indexHint` function to see it. -## replicate +## replicate {#other_functions-replicate} Creates an array with a single value. Used for internal implementation of [arrayJoin](array_join.md#functions_arrayjoin). ```sql -replicate(x, arr) +SELECT replicate(x, arr); ``` **Parameters:** @@ -668,15 +668,22 @@ replicate(x, arr) - `arr` — Original array. ClickHouse creates a new array of the same length as the original and fills it with the value `x`. - `x` — The value that the resulting array will be filled with. -**Output value** +**Returned value** -- An array filled with the value `x`. +An array filled with the value `x`. + +Type: `Array`. **Example** +Query: + ```sql SELECT replicate(1, ['a', 'b', 'c']) ``` + +Result: + ```text ┌─replicate(1, ['a', 'b', 'c'])─┐ │ [1,1,1] │ diff --git a/docs/ru/query_language/functions/other_functions.md b/docs/ru/query_language/functions/other_functions.md index 3cc56bb1217..3d9cf34331d 100644 --- a/docs/ru/query_language/functions/other_functions.md +++ b/docs/ru/query_language/functions/other_functions.md @@ -638,14 +638,14 @@ ORDER BY k ASC Поскольку индекс в ClickHouse разреженный, то при чтении диапазона в ответ попадают "лишние" данные, в данном случае соседние даты. Функция `indexHint` позволяет их увидеть. -## replicate +## replicate {#other_functions-replicate} Создает массив, заполненный одним значением. Используется для внутренней реализации [arrayJoin](array_join.md#functions_arrayjoin). ```sql -replicate(x, arr) +SELECT replicate(x, arr); ``` **Параметры** @@ -653,16 +653,22 @@ replicate(x, arr) - `arr` — Исходный массив. ClickHouse создаёт новый массив такой же длины как исходный и заполняет его значением `x`. - `x` — Значение, которым будет заполнен результирующий массив. -**Выходное значение** +**Возвращаемое значение** -- Массив, заполненный значением `x`. +Массив, заполненный значением `x`. + +Тип: `Array`. 
**Пример** +Запрос: + ```sql -SELECT replicate(1, ['a', 'b', 'c']) +SELECT replicate(1, ['a', 'b', 'c']); ``` +Ответ: + ```text ┌─replicate(1, ['a', 'b', 'c'])─┐ │ [1,1,1] │ From 6de45d6863c5d6fbbbd3c60f8a179d79e6339044 Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Mon, 30 Sep 2019 12:44:47 +0300 Subject: [PATCH 308/309] Add   to events list --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8c1a600eb7a..e82dd170286 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,8 @@ ClickHouse is an open-source column-oriented database management system that all ## Upcoming Events * [ClickHouse Meetup in Paris](https://www.eventbrite.com/e/clickhouse-paris-meetup-2019-registration-68493270215) on October 3. -* [ClickHouse Meetup in San Francisco](https://www.meetup.com/San-Francisco-Bay-Area-ClickHouse-Meetup/events/264242199/) on October 9. -* [ClickHouse Meetup in Hong Kong](https://www.meetup.com/Hong-Kong-Machine-Learning-Meetup/events/263580542/) on October 17. +* [ClickHouse Meetup in San Francisco](https://www.meetup.com/San-Francisco-Bay-Area-ClickHouse-Meetup/events/264242199/) on October 9. +* [ClickHouse Meetup in Hong Kong](https://www.meetup.com/Hong-Kong-Machine-Learning-Meetup/events/263580542/) on October 17. * [ClickHouse Meetup in Shenzhen](https://www.huodongxing.com/event/3483759917300) on October 20. * [ClickHouse Meetup in Shanghai](https://www.huodongxing.com/event/4483760336000) on October 27. * [ClickHouse Meetup in Tokyo](https://clickhouse.connpass.com/event/147001/) on November 14. From dadc613072143eaea2b8f7e0a09965a9ff8358c0 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Mon, 30 Sep 2019 19:58:32 +0800 Subject: [PATCH 309/309] Get rid of malloc symbols in libcommon (#7134) --- CMakeLists.txt | 22 +++++++++++++++++++--- dbms/CMakeLists.txt | 16 +++++----------- dbms/programs/CMakeLists.txt | 6 +++--- dbms/src/Client/tests/CMakeLists.txt | 2 +- dbms/src/Processors/tests/CMakeLists.txt | 14 +++++++------- libs/libcommon/CMakeLists.txt | 24 ------------------------ libs/libcommon/src/tests/CMakeLists.txt | 18 +++++++++--------- libs/libmysqlxx/src/tests/CMakeLists.txt | 2 +- 8 files changed, 45 insertions(+), 59 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bb7387ddeb0..f45908ff066 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,3 @@ -project(ClickHouse) cmake_minimum_required(VERSION 3.3) foreach(policy @@ -13,6 +12,7 @@ foreach(policy endif() endforeach() +project(ClickHouse) include (cmake/target.cmake) # Ignore export() since we don't use it, @@ -348,7 +348,7 @@ include (libs/libcommon/cmake/find_jemalloc.cmake) include (libs/libcommon/cmake/find_cctz.cmake) include (libs/libmysqlxx/cmake/find_mysqlclient.cmake) -# When testing for memory leaks with Valgrind, dont link tcmalloc or jemalloc. +# When testing for memory leaks with Valgrind, don't link tcmalloc or jemalloc. if (USE_JEMALLOC) message (STATUS "Link jemalloc: ${JEMALLOC_LIBRARIES}") @@ -367,7 +367,7 @@ elseif (USE_TCMALLOC) endif () elseif (SANITIZE) message (STATUS "Will use ${SANITIZE} sanitizer.") -else () +elseif (OS_LINUX) message (WARNING "Non default allocator is disabled. 
This is not recommended for production Linux builds.") endif () @@ -376,6 +376,22 @@ include (cmake/print_flags.cmake) install (EXPORT global DESTINATION cmake) add_subdirectory (contrib EXCLUDE_FROM_ALL) + +macro (add_executable target) + # invoke built-in add_executable + _add_executable (${ARGV}) + get_target_property (type ${target} TYPE) + if (${type} STREQUAL EXECUTABLE) + file (RELATIVE_PATH dir ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) + if (${dir} MATCHES "^dbms") + # Only interpose operator::new/delete for dbms executables (MemoryTracker stuff) + target_link_libraries (${target} PRIVATE clickhouse_new_delete ${MALLOC_LIBRARIES}) + else () + target_link_libraries (${target} PRIVATE ${MALLOC_LIBRARIES}) + endif () + endif() +endmacro() + add_subdirectory (libs) add_subdirectory (utils) add_subdirectory (dbms) diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index 22a3111a70e..4c82bec807f 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -100,6 +100,7 @@ set(dbms_sources) add_headers_and_sources(clickhouse_common_io src/Common) add_headers_and_sources(clickhouse_common_io src/Common/HashTable) add_headers_and_sources(clickhouse_common_io src/IO) +list (REMOVE_ITEM clickhouse_common_io_sources src/Common/new_delete.cpp) if(USE_RDKAFKA) add_headers_and_sources(dbms src/Storages/Kafka) @@ -139,6 +140,9 @@ endif () add_library(clickhouse_common_io ${clickhouse_common_io_headers} ${clickhouse_common_io_sources}) +add_library (clickhouse_new_delete STATIC src/Common/new_delete.cpp) +target_link_libraries (clickhouse_new_delete PRIVATE clickhouse_common_io) + if (OS_FREEBSD) target_compile_definitions (clickhouse_common_io PUBLIC CLOCK_MONOTONIC_COARSE=CLOCK_MONOTONIC_FAST) endif () @@ -419,17 +423,7 @@ endif() if (USE_JEMALLOC) dbms_target_include_directories (SYSTEM BEFORE PRIVATE ${JEMALLOC_INCLUDE_DIR}) # used in Interpreters/AsynchronousMetrics.cpp - target_include_directories (clickhouse_common_io SYSTEM BEFORE PRIVATE ${JEMALLOC_INCLUDE_DIR}) # new_delete.cpp - # common/memory.h - if (MAKE_STATIC_LIBRARIES OR NOT SPLIT_SHARED_LIBRARIES) - # skip if we have bundled build, since jemalloc is static in this case - elseif (${JEMALLOC_LIBRARIES} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$") - # if the library is static we do not need to link with it, - # since in this case it will be in libs/libcommon, - # and we do not want to link with jemalloc multiple times. 
- else() - target_link_libraries(clickhouse_common_io PRIVATE ${JEMALLOC_LIBRARIES}) - endif() + target_include_directories (clickhouse_new_delete SYSTEM BEFORE PRIVATE ${JEMALLOC_INCLUDE_DIR}) endif () dbms_target_include_directories (PUBLIC ${DBMS_INCLUDE_DIR} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src/Formats/include) diff --git a/dbms/programs/CMakeLists.txt b/dbms/programs/CMakeLists.txt index bac3269468e..138321360f3 100644 --- a/dbms/programs/CMakeLists.txt +++ b/dbms/programs/CMakeLists.txt @@ -24,9 +24,9 @@ configure_file (config_tools.h.in ${CMAKE_CURRENT_BINARY_DIR}/config_tools.h) macro(clickhouse_target_link_split_lib target name) if(NOT CLICKHOUSE_ONE_SHARED) - target_link_libraries(${target} PRIVATE clickhouse-${name}-lib ${MALLOC_LIBRARIES}) + target_link_libraries(${target} PRIVATE clickhouse-${name}-lib) else() - target_link_libraries(${target} PRIVATE clickhouse-lib ${MALLOC_LIBRARIES}) + target_link_libraries(${target} PRIVATE clickhouse-lib) endif() endmacro() @@ -111,7 +111,7 @@ if (CLICKHOUSE_SPLIT_BINARY) install(PROGRAMS clickhouse-split-helper DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME clickhouse COMPONENT clickhouse) else () add_executable (clickhouse main.cpp) - target_link_libraries (clickhouse PRIVATE clickhouse_common_io string_utils ${MALLOC_LIBRARIES}) + target_link_libraries (clickhouse PRIVATE clickhouse_common_io string_utils) target_include_directories (clickhouse BEFORE PRIVATE ${COMMON_INCLUDE_DIR}) target_include_directories (clickhouse PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/dbms/src/Client/tests/CMakeLists.txt b/dbms/src/Client/tests/CMakeLists.txt index f4471136a8a..d952c006bb5 100644 --- a/dbms/src/Client/tests/CMakeLists.txt +++ b/dbms/src/Client/tests/CMakeLists.txt @@ -1,2 +1,2 @@ add_executable(test-connect test_connect.cpp) -target_link_libraries (test-connect dbms) +target_link_libraries (test-connect PRIVATE dbms) diff --git a/dbms/src/Processors/tests/CMakeLists.txt b/dbms/src/Processors/tests/CMakeLists.txt index 5f44ec2a8fd..4ddb6c68416 100644 --- a/dbms/src/Processors/tests/CMakeLists.txt +++ b/dbms/src/Processors/tests/CMakeLists.txt @@ -6,10 +6,10 @@ add_executable (processors_test_merge_sorting_transform processors_test_merge_so add_executable (processors_test_expand_pipeline processors_test_expand_pipeline.cpp) add_executable (processors_test_aggregation processors_test_aggregation.cpp) -target_link_libraries (processors_test dbms) -target_link_libraries (processors_test_chain dbms) -target_link_libraries (processors_test_merge dbms) -target_link_libraries (processors_test_expand_pipeline dbms) -target_link_libraries (processors_test_merging_sorted_transform dbms) -target_link_libraries (processors_test_merge_sorting_transform dbms) -target_link_libraries (processors_test_aggregation dbms clickhouse_aggregate_functions) +target_link_libraries (processors_test PRIVATE dbms) +target_link_libraries (processors_test_chain PRIVATE dbms) +target_link_libraries (processors_test_merge PRIVATE dbms) +target_link_libraries (processors_test_expand_pipeline PRIVATE dbms) +target_link_libraries (processors_test_merging_sorted_transform PRIVATE dbms) +target_link_libraries (processors_test_merge_sorting_transform PRIVATE dbms) +target_link_libraries (processors_test_aggregation PRIVATE dbms clickhouse_aggregate_functions) diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index 62c64a9bdb0..357e457b240 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -65,29 +65,6 @@ add_library 
(common ${CONFIG_COMMON}) -# When testing for memory leaks with Valgrind, dont link tcmalloc or jemalloc. - -if (USE_JEMALLOC) - message (STATUS "Link jemalloc: ${JEMALLOC_LIBRARIES}") - set (MALLOC_LIBRARIES ${JEMALLOC_LIBRARIES}) -elseif (USE_TCMALLOC) - if (DEBUG_TCMALLOC AND NOT GPERFTOOLS_TCMALLOC_MINIMAL_DEBUG) - message (FATAL_ERROR "Requested DEBUG_TCMALLOC but debug library is not found. You should install Google Perftools. Example: sudo apt-get install libgoogle-perftools-dev") - endif () - - if (DEBUG_TCMALLOC AND GPERFTOOLS_TCMALLOC_MINIMAL_DEBUG) - message (STATUS "Link libtcmalloc_minimal_debug for testing: ${GPERFTOOLS_TCMALLOC_MINIMAL_DEBUG}") - set (MALLOC_LIBRARIES ${GPERFTOOLS_TCMALLOC_MINIMAL_DEBUG}) - else () - message (STATUS "Link libtcmalloc_minimal: ${GPERFTOOLS_TCMALLOC_MINIMAL}") - set (MALLOC_LIBRARIES ${GPERFTOOLS_TCMALLOC_MINIMAL}) - endif () -elseif (SANITIZE) - message (STATUS "Will use ${SANITIZE} sanitizer.") -elseif (OS_LINUX) - message (WARNING "Non default allocator is disabled. This is not recommended for production Linux builds.") -endif () - if (USE_INTERNAL_MEMCPY) set (MEMCPY_LIBRARIES memcpy) endif () @@ -120,7 +97,6 @@ target_link_libraries (common PUBLIC ${Boost_SYSTEM_LIBRARY} PRIVATE - ${MALLOC_LIBRARIES} ${MEMCPY_LIBRARIES}) if (RT_LIBRARY) diff --git a/libs/libcommon/src/tests/CMakeLists.txt b/libs/libcommon/src/tests/CMakeLists.txt index 15d872ac49d..486914e4ca7 100644 --- a/libs/libcommon/src/tests/CMakeLists.txt +++ b/libs/libcommon/src/tests/CMakeLists.txt @@ -10,20 +10,20 @@ add_executable (realloc-perf allocator.cpp) set(PLATFORM_LIBS ${CMAKE_DL_LIBS}) -target_link_libraries (date_lut_init common ${PLATFORM_LIBS}) -target_link_libraries (date_lut2 common ${PLATFORM_LIBS}) -target_link_libraries (date_lut3 common ${PLATFORM_LIBS}) -target_link_libraries (date_lut4 common ${PLATFORM_LIBS}) -target_link_libraries (date_lut_default_timezone common ${PLATFORM_LIBS}) -target_link_libraries (local_date_time_comparison common) -target_link_libraries (realloc-perf common) +target_link_libraries (date_lut_init PRIVATE common ${PLATFORM_LIBS}) +target_link_libraries (date_lut2 PRIVATE common ${PLATFORM_LIBS}) +target_link_libraries (date_lut3 PRIVATE common ${PLATFORM_LIBS}) +target_link_libraries (date_lut4 PRIVATE common ${PLATFORM_LIBS}) +target_link_libraries (date_lut_default_timezone PRIVATE common ${PLATFORM_LIBS}) +target_link_libraries (local_date_time_comparison PRIVATE common) +target_link_libraries (realloc-perf PRIVATE common) add_check(local_date_time_comparison) if(USE_GTEST) add_executable(unit_tests_libcommon gtest_json_test.cpp gtest_strong_typedef.cpp gtest_find_symbols.cpp) - target_link_libraries(unit_tests_libcommon common ${GTEST_MAIN_LIBRARIES} ${GTEST_LIBRARIES}) + target_link_libraries(unit_tests_libcommon PRIVATE common ${GTEST_MAIN_LIBRARIES} ${GTEST_LIBRARIES}) add_check(unit_tests_libcommon) endif() add_executable (dump_variable dump_variable.cpp) -target_link_libraries (dump_variable clickhouse_common_io) +target_link_libraries (dump_variable PRIVATE clickhouse_common_io) diff --git a/libs/libmysqlxx/src/tests/CMakeLists.txt b/libs/libmysqlxx/src/tests/CMakeLists.txt index d2901513808..ec3fdfaa913 100644 --- a/libs/libmysqlxx/src/tests/CMakeLists.txt +++ b/libs/libmysqlxx/src/tests/CMakeLists.txt @@ -1,2 +1,2 @@ add_executable (mysqlxx_test mysqlxx_test.cpp) -target_link_libraries (mysqlxx_test mysqlxx) +target_link_libraries (mysqlxx_test PRIVATE mysqlxx)
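A quick way to see the effect of this last patch is to check where the allocator and `operator new`/`operator delete` symbols end up after a build. The sketch below is a rough spot check, not part of the patch: the archive paths are guesses based on the target names in the diff and depend on the actual build directory layout.

```bash
# libcommon should no longer pull an allocator in on its own.
nm -C build/libs/libcommon/libcommon.a | grep -i -E 'jemalloc|tcmalloc' || echo "no allocator symbols in libcommon"

# The interposed operator new/delete now lives in clickhouse_new_delete,
# which the overridden add_executable macro links only into dbms executables.
nm -C build/dbms/libclickhouse_new_delete.a | grep -E 'operator (new|delete)' | head
```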