diff --git a/.gitattributes b/.gitattributes
index bcc7d57b904..a23f027122b 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +1,3 @@
 contrib/* linguist-vendored
 *.h linguist-language=C++
+tests/queries/0_stateless/data_json/* binary
diff --git a/docker/test/integration/base/Dockerfile b/docker/test/integration/base/Dockerfile
index 91b26735fe5..eaf0f01e36d 100644
--- a/docker/test/integration/base/Dockerfile
+++ b/docker/test/integration/base/Dockerfile
@@ -60,5 +60,5 @@ clientPort=2181 \n\
 maxClientCnxns=80' > /opt/zookeeper/conf/zoo.cfg

 RUN mkdir /zookeeper && chmod -R 777 /zookeeper
-ENV TZ=Europe/Moscow
+ENV TZ=Etc/UTC
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
diff --git a/docker/test/integration/runner/Dockerfile b/docker/test/integration/runner/Dockerfile
index 22dd2e14456..b5c6a39a965 100644
--- a/docker/test/integration/runner/Dockerfile
+++ b/docker/test/integration/runner/Dockerfile
@@ -40,7 +40,7 @@ RUN apt-get update \
     /tmp/* \
     && apt-get clean

-ENV TZ=Europe/Moscow
+ENV TZ=Etc/UTC
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

 ENV DOCKER_CHANNEL stable
diff --git a/docs/en/sql-reference/functions/ip-address-functions.md b/docs/en/sql-reference/functions/ip-address-functions.md
index 469a66d460f..cf3f92580aa 100644
--- a/docs/en/sql-reference/functions/ip-address-functions.md
+++ b/docs/en/sql-reference/functions/ip-address-functions.md
@@ -13,10 +13,18 @@ Alias: `INET_NTOA`.

 ## IPv4StringToNum(s) {#ipv4stringtonums}

-The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it returns 0.
+The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it throws an exception.

 Alias: `INET_ATON`.

+## IPv4StringToNumOrDefault(s) {#ipv4stringtonumordefaults}
+
+Same as `IPv4StringToNum`, but if the IPv4 address has an invalid format, it returns 0.
+
+## IPv4StringToNumOrNull(s) {#ipv4stringtonumornulls}
+
+Same as `IPv4StringToNum`, but if the IPv4 address has an invalid format, it returns null.
+
 ## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum}

 Similar to IPv4NumToString, but using xxx instead of the last octet.
@@ -123,7 +131,7 @@ LIMIT 10

 ## IPv6StringToNum {#ipv6stringtonums}

-The reverse function of [IPv6NumToString](#ipv6numtostringx). If the IPv6 address has an invalid format, it returns a string of null bytes.
+The reverse function of [IPv6NumToString](#ipv6numtostringx). If the IPv6 address has an invalid format, it throws an exception.

 If the input string contains a valid IPv4 address, returns its IPv6 equivalent.
 HEX can be uppercase or lowercase.
@@ -168,6 +176,14 @@ Result:

 - [cutIPv6](#cutipv6x-bytestocutforipv6-bytestocutforipv4).

+## IPv6StringToNumOrDefault(s) {#ipv6stringtonumordefaults}
+
+Same as `IPv6StringToNum`, but if the IPv6 address has an invalid format, it returns 0.
+
+## IPv6StringToNumOrNull(s) {#ipv6stringtonumornulls}
+
+Same as `IPv6StringToNum`, but if the IPv6 address has an invalid format, it returns null.
+
 ## IPv4ToIPv6(x) {#ipv4toipv6x}

 Takes a `UInt32` number. Interprets it as an IPv4 address in [big endian](https://en.wikipedia.org/wiki/Endianness). Returns a `FixedString(16)` value containing the IPv6 address in binary format. Examples:
@@ -261,6 +277,14 @@ SELECT
 └───────────────────────────────────┴──────────────────────────┘
 ```

+## toIPv4OrDefault(string) {#toipv4ordefaultstring}
+
+Same as `toIPv4`, but if the IPv4 address has an invalid format, it returns 0.
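+
+For example, the following query should return the parsed address for the well-formed string and the default value `0.0.0.0` for the malformed one (a brief usage sketch for illustration):
+
+``` sql
+SELECT toIPv4OrDefault('192.168.0.1') AS valid, toIPv4OrDefault('not an ip') AS invalid;
+```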
+
+## toIPv4OrNull(string) {#toipv4ornullstring}
+
+Same as `toIPv4`, but if the IPv4 address has an invalid format, it returns null.
+
 ## toIPv6 {#toipv6string}

 Converts a string form of IPv6 address to [IPv6](../../sql-reference/data-types/domains/ipv6.md) type. If the IPv6 address has an invalid format, returns an empty value.
@@ -317,6 +341,14 @@ Result:
 └─────────────────────┘
 ```

+## toIPv6OrDefault(string) {#toipv6ordefaultstring}
+
+Same as `toIPv6`, but if the IPv6 address has an invalid format, it returns 0.
+
+## toIPv6OrNull(string) {#toipv6ornullstring}
+
+Same as `toIPv6`, but if the IPv6 address has an invalid format, it returns null.
+
 ## isIPv4String {#isipv4string}

 Determines whether the input string is an IPv4 address or not. If `string` is IPv6 address returns `0`.
diff --git a/docs/en/sql-reference/functions/statistics.md b/docs/en/sql-reference/functions/statistics.md
new file mode 100644
index 00000000000..3f337b05cbc
--- /dev/null
+++ b/docs/en/sql-reference/functions/statistics.md
@@ -0,0 +1,48 @@
+---
+toc_priority: 69
+toc_title: Statistics
+---
+
+# Functions for Working with Statistics {#functions-for-working-with-statistics}
+
+## proportionsZTest {#proportionsztest}
+
+Applies a proportion z-test to samples from two populations (X and Y). The alternative hypothesis is 'two-sided'.
+
+**Syntax**
+
+``` sql
+proportionsZTest(successes_x, successes_y, trials_x, trials_y, significance_level, usevar)
+```
+
+**Arguments**
+
+- `successes_x` — The number of successes for X.
+- `successes_y` — The number of successes for Y.
+- `trials_x` — The number of trials for X.
+- `trials_y` — The number of trials for Y.
+- `significance_level` — Significance level used to compute the confidence interval.
+- `usevar` — It can be `'pooled'` or `'unpooled'`.
+    - `'pooled'` — The variances of the two populations are assumed to be equal.
+    - `'unpooled'` — The assumption of equal variances is dropped.
+
+**Returned value**
+
+- A tuple with the (z-statistic, p-value, confidence-interval-lower, confidence-interval-upper).
+
+Type: [Tuple](../../sql-reference/data-types/tuple.md).
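+
+Because the result is a single tuple, individual statistics can be extracted with tuple element access, for example (a usage sketch for illustration; `tupleElement` uses 1-based indices):
+
+``` sql
+WITH proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled') AS t
+SELECT tupleElement(t, 1) AS z_statistic, tupleElement(t, 2) AS p_value;
+```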
+ +**Example** + +Query: + +``` sql +SELECT proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled'); +``` + +Result: + +``` text +(-0.20656724435948853,0.8363478437079654,-0.09345975390115283,0.07563797172293502) +``` + diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt index 4e0789b5d24..8bf1a5f477c 100644 --- a/docs/tools/requirements.txt +++ b/docs/tools/requirements.txt @@ -1,4 +1,4 @@ -Babel==2.8.0 +Babel==2.9.1 backports-abc==0.5 backports.functools-lru-cache==1.6.1 beautifulsoup4==4.9.1 @@ -10,22 +10,22 @@ cssmin==0.2.0 future==0.18.2 htmlmin==0.1.12 idna==2.10 -Jinja2>=2.11.3 +Jinja2>=3.0.3 jinja2-highlight==0.6.1 jsmin==3.0.0 -livereload==2.6.2 +livereload==2.6.3 Markdown==3.3.2 -MarkupSafe==1.1.1 +MarkupSafe==2.1.0 mkdocs==1.1.2 mkdocs-htmlproofer-plugin==0.0.3 mkdocs-macros-plugin==0.4.20 -nltk==3.5 +nltk==3.7 nose==1.3.7 protobuf==3.14.0 numpy==1.21.2 pymdown-extensions==8.0 python-slugify==4.0.1 -PyYAML==5.4.1 +PyYAML==6.0 repackage==0.7.3 requests==2.25.1 singledispatch==3.4.0.3 @@ -34,5 +34,6 @@ soupsieve==2.0.1 termcolor==1.1.0 tornado==6.1 Unidecode==1.1.1 -urllib3>=1.26.5 -Pygments>=2.7.4 +urllib3>=1.26.8 +Pygments>=2.11.2 + diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index e51f2aff064..f3e7f1775b8 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -787,6 +787,7 @@ void Client::printHelpMessage(const OptionsDescription & options_description) { std::cout << options_description.main_description.value() << "\n"; std::cout << options_description.external_description.value() << "\n"; + std::cout << options_description.hosts_and_ports_description.value() << "\n"; std::cout << "In addition, --param_name=value can be specified for substitution of parameters for parametrized queries.\n"; } diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 8afb9c663a3..26d42a11315 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -304,8 +304,8 @@ void LocalServer::setupUsers() ConfigurationPtr users_config; auto & access_control = global_context->getAccessControl(); - access_control.setPlaintextPasswordSetting(config().getBool("allow_plaintext_password", true)); - access_control.setNoPasswordSetting(config().getBool("allow_no_password", true)); + access_control.setNoPasswordAllowed(config().getBool("allow_no_password", true)); + access_control.setPlaintextPasswordAllowed(config().getBool("allow_plaintext_password", true)); if (config().has("users_config") || config().has("config-file") || fs::exists("config.xml")) { const auto users_config_path = config().getString("users_config", config().getString("config-file", "config.xml")); diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index d372ff8ea65..f278e94c7a2 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1074,9 +1074,10 @@ if (ThreadFuzzer::instance().isEffective()) auto & access_control = global_context->getAccessControl(); if (config().has("custom_settings_prefixes")) access_control.setCustomSettingsPrefixes(config().getString("custom_settings_prefixes")); - ///set the allow_plaintext_and_no_password setting in context. 
- access_control.setPlaintextPasswordSetting(config().getBool("allow_plaintext_password", true)); - access_control.setNoPasswordSetting(config().getBool("allow_no_password", true)); + + access_control.setNoPasswordAllowed(config().getBool("allow_no_password", true)); + access_control.setPlaintextPasswordAllowed(config().getBool("allow_plaintext_password", true)); + /// Initialize access storages. try { diff --git a/programs/server/config.xml b/programs/server/config.xml index d34340ac995..6ca64dc30c5 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -243,7 +243,7 @@ openssl dhparam -out /etc/clickhouse-server/dhparam.pem 4096 Only file format with BEGIN DH PARAMETERS is supported. --> - + none true true @@ -368,7 +368,7 @@ /var/lib/clickhouse/tmp/ - + ` diff --git a/src/Access/AccessControl.cpp b/src/Access/AccessControl.cpp index ef8eccb85fa..91ffd7f04ab 100644 --- a/src/Access/AccessControl.cpp +++ b/src/Access/AccessControl.cpp @@ -173,7 +173,8 @@ void AccessControl::addUsersConfigStorage(const String & storage_name_, const Po auto check_setting_name_function = [this](const std::string_view & setting_name) { checkSettingNameIsAllowed(setting_name); }; auto is_no_password_allowed_function = [this]() -> bool { return isNoPasswordAllowed(); }; auto is_plaintext_password_allowed_function = [this]() -> bool { return isPlaintextPasswordAllowed(); }; - auto new_storage = std::make_shared(storage_name_, check_setting_name_function,is_no_password_allowed_function,is_plaintext_password_allowed_function); + auto new_storage = std::make_shared(storage_name_, check_setting_name_function, + is_no_password_allowed_function, is_plaintext_password_allowed_function); new_storage->setConfig(users_config_); addStorage(new_storage); LOG_DEBUG(getLogger(), "Added {} access storage '{}', path: {}", @@ -209,7 +210,8 @@ void AccessControl::addUsersConfigStorage( auto check_setting_name_function = [this](const std::string_view & setting_name) { checkSettingNameIsAllowed(setting_name); }; auto is_no_password_allowed_function = [this]() -> bool { return isNoPasswordAllowed(); }; auto is_plaintext_password_allowed_function = [this]() -> bool { return isPlaintextPasswordAllowed(); }; - auto new_storage = std::make_shared(storage_name_, check_setting_name_function,is_no_password_allowed_function,is_plaintext_password_allowed_function); + auto new_storage = std::make_shared(storage_name_, check_setting_name_function, + is_no_password_allowed_function, is_plaintext_password_allowed_function); new_storage->load(users_config_path_, include_from_path_, preprocessed_dir_, get_zookeeper_function_); addStorage(new_storage); LOG_DEBUG(getLogger(), "Added {} access storage '{}', path: {}", String(new_storage->getStorageType()), new_storage->getStorageName(), new_storage->getPath()); @@ -411,7 +413,8 @@ UUID AccessControl::authenticate(const Credentials & credentials, const Poco::Ne { try { - return MultipleAccessStorage::authenticate(credentials, address, *external_authenticators,allow_no_password, allow_plaintext_password); + return MultipleAccessStorage::authenticate(credentials, address, *external_authenticators, allow_no_password, + allow_plaintext_password); } catch (...) 
{ @@ -447,26 +450,38 @@ void AccessControl::setCustomSettingsPrefixes(const String & comma_separated_pre setCustomSettingsPrefixes(prefixes); } -void AccessControl::setPlaintextPasswordSetting(bool allow_plaintext_password_) -{ - allow_plaintext_password = allow_plaintext_password_; -} -void AccessControl::setNoPasswordSetting(bool allow_no_password_) -{ - allow_no_password = allow_no_password_; -} - -bool AccessControl::isSettingNameAllowed(const std::string_view & setting_name) const +bool AccessControl::isSettingNameAllowed(const std::string_view setting_name) const { return custom_settings_prefixes->isSettingNameAllowed(setting_name); } -void AccessControl::checkSettingNameIsAllowed(const std::string_view & setting_name) const +void AccessControl::checkSettingNameIsAllowed(const std::string_view setting_name) const { custom_settings_prefixes->checkSettingNameIsAllowed(setting_name); } +void AccessControl::setNoPasswordAllowed(bool allow_no_password_) +{ + allow_no_password = allow_no_password_; +} + +bool AccessControl::isNoPasswordAllowed() const +{ + return allow_no_password; +} + +void AccessControl::setPlaintextPasswordAllowed(bool allow_plaintext_password_) +{ + allow_plaintext_password = allow_plaintext_password_; +} + +bool AccessControl::isPlaintextPasswordAllowed() const +{ + return allow_plaintext_password; +} + + std::shared_ptr AccessControl::getContextAccess( const UUID & user_id, const std::vector & current_roles, @@ -550,15 +565,6 @@ std::vector AccessControl::getAllQuotasUsage() const return quota_cache->getAllQuotasUsage(); } -bool AccessControl::isPlaintextPasswordAllowed() const -{ - return allow_plaintext_password; -} - -bool AccessControl::isNoPasswordAllowed() const -{ - return allow_no_password; -} std::shared_ptr AccessControl::getEnabledSettings( const UUID & user_id, diff --git a/src/Access/AccessControl.h b/src/Access/AccessControl.h index 14f4dae9424..0ac3d9cb0c2 100644 --- a/src/Access/AccessControl.h +++ b/src/Access/AccessControl.h @@ -49,8 +49,6 @@ class AccessControl : public MultipleAccessStorage public: AccessControl(); ~AccessControl() override; - std::atomic_bool allow_plaintext_password; - std::atomic_bool allow_no_password; /// Parses access entities from a configuration loaded from users.xml. /// This function add UsersConfigAccessStorage if it wasn't added before. @@ -113,12 +111,16 @@ public: /// This function also enables custom prefixes to be used. void setCustomSettingsPrefixes(const Strings & prefixes); void setCustomSettingsPrefixes(const String & comma_separated_prefixes); - bool isSettingNameAllowed(const std::string_view & name) const; - void checkSettingNameIsAllowed(const std::string_view & name) const; + bool isSettingNameAllowed(const std::string_view name) const; + void checkSettingNameIsAllowed(const std::string_view name) const; - //sets allow_plaintext_password and allow_no_password setting - void setPlaintextPasswordSetting(const bool allow_plaintext_password_); - void setNoPasswordSetting(const bool allow_no_password_); + /// Allows users without password (by default it's allowed). + void setNoPasswordAllowed(const bool allow_no_password_); + bool isNoPasswordAllowed() const; + + /// Allows users with plaintext password (by default it's allowed). 
+ void setPlaintextPasswordAllowed(const bool allow_plaintext_password_); + bool isPlaintextPasswordAllowed() const; UUID authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address) const; void setExternalAuthenticatorsConfig(const Poco::Util::AbstractConfiguration & config); @@ -153,9 +155,6 @@ public: std::vector getAllQuotasUsage() const; - bool isPlaintextPasswordAllowed() const; - bool isNoPasswordAllowed() const; - std::shared_ptr getEnabledSettings( const UUID & user_id, const SettingsProfileElements & settings_from_user, @@ -177,6 +176,8 @@ private: std::unique_ptr settings_profiles_cache; std::unique_ptr external_authenticators; std::unique_ptr custom_settings_prefixes; + std::atomic_bool allow_plaintext_password = true; + std::atomic_bool allow_no_password = true; }; } diff --git a/src/Access/AccessEntityIO.cpp b/src/Access/AccessEntityIO.cpp index acf2a972b13..9d229bbc43b 100644 --- a/src/Access/AccessEntityIO.cpp +++ b/src/Access/AccessEntityIO.cpp @@ -120,7 +120,7 @@ AccessEntityPtr deserializeAccessEntityImpl(const String & definition) if (res) throw Exception("Two access entities attached in the same file", ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION); res = user = std::make_unique(); - InterpreterCreateUserQuery::updateUserFromQuery(*user, *create_user_query); + InterpreterCreateUserQuery::updateUserFromQuery(*user, *create_user_query, /* allow_no_password = */ true, /* allow_plaintext_password = */ true); } else if (auto * create_role_query = query->as()) { diff --git a/src/Access/IAccessStorage.cpp b/src/Access/IAccessStorage.cpp index 33bef719eff..8c53216c638 100644 --- a/src/Access/IAccessStorage.cpp +++ b/src/Access/IAccessStorage.cpp @@ -441,7 +441,9 @@ void IAccessStorage::notify(const Notifications & notifications) UUID IAccessStorage::authenticate( const Credentials & credentials, const Poco::Net::IPAddress & address, - const ExternalAuthenticators & external_authenticators, bool allow_no_password, bool allow_plaintext_password) const + const ExternalAuthenticators & external_authenticators, + bool allow_no_password, + bool allow_plaintext_password) const { return *authenticateImpl(credentials, address, external_authenticators, /* throw_if_user_not_exists = */ true, allow_no_password, allow_plaintext_password); } @@ -451,7 +453,9 @@ std::optional IAccessStorage::authenticate( const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, - bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const + bool throw_if_user_not_exists, + bool allow_no_password, + bool allow_plaintext_password) const { return authenticateImpl(credentials, address, external_authenticators, throw_if_user_not_exists, allow_no_password, allow_plaintext_password); } @@ -461,7 +465,9 @@ std::optional IAccessStorage::authenticateImpl( const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, - bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const + bool throw_if_user_not_exists, + bool allow_no_password, + bool allow_plaintext_password) const { if (auto id = find(credentials.getUserName())) { @@ -469,8 +475,11 @@ std::optional IAccessStorage::authenticateImpl( { if (!isAddressAllowed(*user, address)) throwAddressNotAllowed(address); - if (isNoPasswordAllowed(*user, allow_no_password) || isPlaintextPasswordAllowed(*user, allow_plaintext_password)) - 
throwPasswordTypeNotAllowed(); + + auto auth_type = user->auth_data.getType(); + if (((auth_type == AuthenticationType::NO_PASSWORD) && !allow_no_password) || + ((auth_type == AuthenticationType::PLAINTEXT_PASSWORD) && !allow_plaintext_password)) + throwAuthenticationTypeNotAllowed(auth_type); if (!areCredentialsValid(*user, credentials, external_authenticators)) throwInvalidCredentials(); @@ -506,15 +515,6 @@ bool IAccessStorage::isAddressAllowed(const User & user, const Poco::Net::IPAddr return user.allowed_client_hosts.contains(address); } -bool IAccessStorage::isPlaintextPasswordAllowed(const User & user, bool allow_plaintext_password) -{ - return !allow_plaintext_password && user.auth_data.getType() == AuthenticationType::PLAINTEXT_PASSWORD; -} - -bool IAccessStorage::isNoPasswordAllowed(const User & user, bool allow_no_password) -{ - return !allow_no_password && user.auth_data.getType() == AuthenticationType::NO_PASSWORD; -} UUID IAccessStorage::generateRandomID() { @@ -610,11 +610,12 @@ void IAccessStorage::throwAddressNotAllowed(const Poco::Net::IPAddress & address throw Exception("Connections from " + address.toString() + " are not allowed", ErrorCodes::IP_ADDRESS_NOT_ALLOWED); } -void IAccessStorage::throwPasswordTypeNotAllowed() +void IAccessStorage::throwAuthenticationTypeNotAllowed(AuthenticationType auth_type) { throw Exception( - "Authentication denied for users configured with AuthType PLAINTEXT_PASSWORD and NO_PASSWORD. Please check with Clickhouse admin to allow allow PLAINTEXT_PASSWORD and NO_PASSWORD through server configuration ", - ErrorCodes::AUTHENTICATION_FAILED); + ErrorCodes::AUTHENTICATION_FAILED, + "Authentication type {} is not allowed, check the setting allow_{} in the server configuration", + toString(auth_type), AuthenticationTypeInfo::get(auth_type).name); } void IAccessStorage::throwInvalidCredentials() { diff --git a/src/Access/IAccessStorage.h b/src/Access/IAccessStorage.h index 3069e41b285..428a0e8f052 100644 --- a/src/Access/IAccessStorage.h +++ b/src/Access/IAccessStorage.h @@ -18,6 +18,7 @@ namespace DB struct User; class Credentials; class ExternalAuthenticators; +enum class AuthenticationType; /// Contains entities, i.e. instances of classes derived from IAccessEntity. /// The implementations of this class MUST be thread-safe. @@ -148,7 +149,7 @@ public: /// Finds a user, check the provided credentials and returns the ID of the user if they are valid. /// Throws an exception if no such user or credentials are invalid. 
- UUID authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool allow_no_password=true, bool allow_plaintext_password=true) const; + UUID authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool allow_no_password, bool allow_plaintext_password) const; std::optional authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const; protected: @@ -164,8 +165,6 @@ protected: virtual std::optional authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const; virtual bool areCredentialsValid(const User & user, const Credentials & credentials, const ExternalAuthenticators & external_authenticators) const; virtual bool isAddressAllowed(const User & user, const Poco::Net::IPAddress & address) const; - static bool isPlaintextPasswordAllowed(const User & user, bool allow_plaintext_password) ; - static bool isNoPasswordAllowed(const User & user, bool allow_no_password); static UUID generateRandomID(); Poco::Logger * getLogger() const; static String formatEntityTypeWithName(AccessEntityType type, const String & name) { return AccessEntityTypeInfo::get(type).formatEntityNameWithType(name); } @@ -181,7 +180,7 @@ protected: [[noreturn]] void throwReadonlyCannotRemove(AccessEntityType type, const String & name) const; [[noreturn]] static void throwAddressNotAllowed(const Poco::Net::IPAddress & address); [[noreturn]] static void throwInvalidCredentials(); - [[noreturn]] static void throwPasswordTypeNotAllowed(); + [[noreturn]] static void throwAuthenticationTypeNotAllowed(AuthenticationType auth_type); using Notification = std::tuple; using Notifications = std::vector; static void notify(const Notifications & notifications); diff --git a/src/Access/LDAPAccessStorage.cpp b/src/Access/LDAPAccessStorage.cpp index dd1c50343f2..4cf42a5017c 100644 --- a/src/Access/LDAPAccessStorage.cpp +++ b/src/Access/LDAPAccessStorage.cpp @@ -481,7 +481,9 @@ std::optional LDAPAccessStorage::authenticateImpl( const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, - bool throw_if_user_not_exists,bool allow_no_password __attribute__((unused)), bool allow_plaintext_password __attribute__((unused))) const + bool throw_if_user_not_exists, + bool /* allow_no_password */, + bool /* allow_plaintext_password */) const { std::scoped_lock lock(mutex); auto id = memory_storage.find(credentials.getUserName()); diff --git a/src/Access/MultipleAccessStorage.cpp b/src/Access/MultipleAccessStorage.cpp index c988a4d374a..359214eac9f 100644 --- a/src/Access/MultipleAccessStorage.cpp +++ b/src/Access/MultipleAccessStorage.cpp @@ -449,14 +449,20 @@ void MultipleAccessStorage::updateSubscriptionsToNestedStorages(std::unique_lock } -std::optional MultipleAccessStorage::authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists,bool allow_no_password, bool allow_plaintext_password) const +std::optional +MultipleAccessStorage::authenticateImpl(const Credentials 
& credentials, const Poco::Net::IPAddress & address, + const ExternalAuthenticators & external_authenticators, + bool throw_if_user_not_exists, + bool allow_no_password, bool allow_plaintext_password) const { auto storages = getStoragesInternal(); for (size_t i = 0; i != storages->size(); ++i) { const auto & storage = (*storages)[i]; bool is_last_storage = (i == storages->size() - 1); - auto id = storage->authenticate(credentials, address, external_authenticators, (throw_if_user_not_exists && is_last_storage), allow_no_password, allow_plaintext_password); + auto id = storage->authenticate(credentials, address, external_authenticators, + (throw_if_user_not_exists && is_last_storage), + allow_no_password, allow_plaintext_password); if (id) { std::lock_guard lock{mutex}; diff --git a/src/Access/UsersConfigAccessStorage.cpp b/src/Access/UsersConfigAccessStorage.cpp index b2bdebfcf6c..fe8e6d1d6c0 100644 --- a/src/Access/UsersConfigAccessStorage.cpp +++ b/src/Access/UsersConfigAccessStorage.cpp @@ -28,8 +28,6 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int UNKNOWN_ADDRESS_PATTERN_TYPE; extern const int NOT_IMPLEMENTED; - extern const int ILLEGAL_TYPE_OF_ARGUMENT; - } namespace @@ -50,7 +48,7 @@ namespace UUID generateID(const IAccessEntity & entity) { return generateID(entity.getType(), entity.getName()); } - UserPtr parseUser(const Poco::Util::AbstractConfiguration & config, const String & user_name) + UserPtr parseUser(const Poco::Util::AbstractConfiguration & config, const String & user_name, bool allow_no_password, bool allow_plaintext_password) { auto user = std::make_shared(); user->setName(user_name); @@ -130,6 +128,15 @@ namespace user->auth_data.setSSLCertificateCommonNames(std::move(common_names)); } + auto auth_type = user->auth_data.getType(); + if (((auth_type == AuthenticationType::NO_PASSWORD) && !allow_no_password) || + ((auth_type == AuthenticationType::PLAINTEXT_PASSWORD) && !allow_plaintext_password)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Authentication type {} is not allowed, check the setting allow_{} in the server configuration", + toString(auth_type), AuthenticationTypeInfo::get(auth_type).name); + } + const auto profile_name_config = user_config + ".profile"; if (config.has(profile_name_config)) { @@ -225,24 +232,18 @@ namespace } - std::vector parseUsers(const Poco::Util::AbstractConfiguration & config, Fn auto && is_no_password_allowed_function, Fn auto && is_plaintext_password_allowed_function) + std::vector parseUsers(const Poco::Util::AbstractConfiguration & config, bool allow_no_password, bool allow_plaintext_password) { Poco::Util::AbstractConfiguration::Keys user_names; config.keys("users", user_names); std::vector users; users.reserve(user_names.size()); - bool allow_plaintext_password = is_plaintext_password_allowed_function(); - bool allow_no_password = is_no_password_allowed_function(); for (const auto & user_name : user_names) { try { - String user_config = "users." + user_name; - if ((config.has(user_config + ".password") && !allow_plaintext_password) || (config.has(user_config + ".no_password") && !allow_no_password)) - throw Exception("Incorrect User configuration. User is not allowed to configure PLAINTEXT_PASSWORD or NO_PASSWORD. 
Please configure User with authtype SHA256_PASSWORD_HASH, SHA256_PASSWORD, DOUBLE_SHA1_PASSWORD OR enable setting allow_plaintext_and_no_password in server configuration to configure user with plaintext and no password Auth_Type" - " Though it is not recommended to use plaintext_password and No_password for user authentication.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - users.push_back(parseUser(config, user_name)); + users.push_back(parseUser(config, user_name, allow_no_password, allow_plaintext_password)); } catch (Exception & e) { @@ -562,8 +563,10 @@ void UsersConfigAccessStorage::parseFromConfig(const Poco::Util::AbstractConfigu { try { + bool no_password_allowed = is_no_password_allowed_function(); + bool plaintext_password_allowed = is_plaintext_password_allowed_function(); std::vector> all_entities; - for (const auto & entity : parseUsers(config,is_no_password_allowed_function, is_plaintext_password_allowed_function)) + for (const auto & entity : parseUsers(config, no_password_allowed, plaintext_password_allowed)) all_entities.emplace_back(generateID(*entity), entity); for (const auto & entity : parseQuotas(config)) all_entities.emplace_back(generateID(*entity), entity); diff --git a/src/AggregateFunctions/AggregateFunctionFactory.h b/src/AggregateFunctions/AggregateFunctionFactory.h index ef5740733df..e5263a54d79 100644 --- a/src/AggregateFunctions/AggregateFunctionFactory.h +++ b/src/AggregateFunctions/AggregateFunctionFactory.h @@ -38,7 +38,8 @@ struct AggregateFunctionWithProperties AggregateFunctionWithProperties(const AggregateFunctionWithProperties &) = default; AggregateFunctionWithProperties & operator = (const AggregateFunctionWithProperties &) = default; - template > * = nullptr> + template + requires (!std::is_same_v) AggregateFunctionWithProperties(Creator creator_, AggregateFunctionProperties properties_ = {}) /// NOLINT : creator(std::forward(creator_)), properties(std::move(properties_)) { diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 22fe1f2ffff..b24181625d3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -569,6 +569,14 @@ if (ENABLE_TESTS) clickhouse_common_zookeeper string_utils) + if (TARGET ch_contrib::simdjson) + target_link_libraries(unit_tests_dbms PRIVATE ch_contrib::simdjson) + endif() + + if(TARGET ch_contrib::rapidjson) + target_include_directories(unit_tests_dbms PRIVATE ch_contrib::rapidjson) + endif() + if (TARGET ch_contrib::yaml_cpp) target_link_libraries(unit_tests_dbms PRIVATE ch_contrib::yaml_cpp) endif() diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index c575cd37a5f..4f1c1f4539e 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1092,10 +1092,11 @@ void ClientBase::sendData(Block & sample, const ColumnsDescription & columns_des try { + auto metadata = storage->getInMemoryMetadataPtr(); sendDataFromPipe( storage->read( sample.getNames(), - storage->getInMemoryMetadataPtr(), + storage->getStorageSnapshot(metadata), query_info, global_context, {}, diff --git a/src/Columns/ColumnAggregateFunction.cpp b/src/Columns/ColumnAggregateFunction.cpp index f27e103b304..59d56c6e437 100644 --- a/src/Columns/ColumnAggregateFunction.cpp +++ b/src/Columns/ColumnAggregateFunction.cpp @@ -297,7 +297,7 @@ ColumnPtr ColumnAggregateFunction::filter(const Filter & filter, ssize_t result_ { size_t size = data.size(); if (size != filter.size()) - throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + throw 
Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filter.size(), size); if (size == 0) return cloneEmpty(); diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index be5d9065281..24da9644335 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -608,7 +608,7 @@ ColumnPtr ColumnArray::filterString(const Filter & filt, ssize_t result_size_hin { size_t col_size = getOffsets().size(); if (col_size != filt.size()) - throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), col_size); if (0 == col_size) return ColumnArray::create(data); @@ -676,7 +676,7 @@ ColumnPtr ColumnArray::filterGeneric(const Filter & filt, ssize_t result_size_hi { size_t size = getOffsets().size(); if (size != filt.size()) - throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size); if (size == 0) return ColumnArray::create(data); @@ -1189,4 +1189,12 @@ void ColumnArray::gather(ColumnGathererStream & gatherer) gatherer.gather(*this); } +size_t ColumnArray::getNumberOfDimensions() const +{ + const auto * nested_array = checkAndGetColumn(*data); + if (!nested_array) + return 1; + return 1 + nested_array->getNumberOfDimensions(); /// Every modern C++ compiler optimizes tail recursion. +} + } diff --git a/src/Columns/ColumnArray.h b/src/Columns/ColumnArray.h index cc80d1300ce..3f41ae9cd8a 100644 --- a/src/Columns/ColumnArray.h +++ b/src/Columns/ColumnArray.h @@ -169,6 +169,8 @@ public: bool isCollationSupported() const override { return getData().isCollationSupported(); } + size_t getNumberOfDimensions() const; + private: WrappedPtr data; WrappedPtr offsets; diff --git a/src/Columns/ColumnDecimal.cpp b/src/Columns/ColumnDecimal.cpp index f9feb8f10b9..4290a7a4cb1 100644 --- a/src/Columns/ColumnDecimal.cpp +++ b/src/Columns/ColumnDecimal.cpp @@ -266,7 +266,7 @@ ColumnPtr ColumnDecimal::filter(const IColumn::Filter & filt, ssize_t result_ { size_t size = data.size(); if (size != filt.size()) - throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size); auto res = this->create(0, scale); Container & res_data = res->getData(); diff --git a/src/Columns/ColumnFixedString.cpp b/src/Columns/ColumnFixedString.cpp index d0a735a5580..de6324ca7ce 100644 --- a/src/Columns/ColumnFixedString.cpp +++ b/src/Columns/ColumnFixedString.cpp @@ -207,7 +207,7 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result { size_t col_size = size(); if (col_size != filt.size()) - throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), col_size); auto res = ColumnFixedString::create(n); diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h index 41ad099818e..52e5e43fa48 100644 --- a/src/Columns/ColumnNullable.h +++ b/src/Columns/ColumnNullable.h @@ -144,15 +144,15 @@ public: double 
getRatioOfDefaultRows(double sample_ratio) const override { - return null_map->getRatioOfDefaultRows(sample_ratio); + return getRatioOfDefaultRowsImpl(sample_ratio); } void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override { - null_map->getIndicesOfNonDefaultRows(indices, from, limit); + getIndicesOfNonDefaultRowsImpl(indices, from, limit); } - ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override; + ColumnPtr createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override; bool isNullable() const override { return true; } bool isFixedAndContiguous() const override { return false; } diff --git a/src/Columns/ColumnObject.cpp b/src/Columns/ColumnObject.cpp new file mode 100644 index 00000000000..bfa8ffe6358 --- /dev/null +++ b/src/Columns/ColumnObject.cpp @@ -0,0 +1,780 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int ILLEGAL_COLUMN; + extern const int DUPLICATE_COLUMN; + extern const int NUMBER_OF_DIMENSIONS_MISMATHED; + extern const int NOT_IMPLEMENTED; + extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; +} + +namespace +{ + +/// Recreates column with default scalar values and keeps sizes of arrays. +ColumnPtr recreateColumnWithDefaultValues( + const ColumnPtr & column, const DataTypePtr & scalar_type, size_t num_dimensions) +{ + const auto * column_array = checkAndGetColumn(column.get()); + if (column_array && num_dimensions) + { + return ColumnArray::create( + recreateColumnWithDefaultValues( + column_array->getDataPtr(), scalar_type, num_dimensions - 1), + IColumn::mutate(column_array->getOffsetsPtr())); + } + + return createArrayOfType(scalar_type, num_dimensions)->createColumn()->cloneResized(column->size()); +} + +/// Replaces NULL fields to given field or empty array. +class FieldVisitorReplaceNull : public StaticVisitor +{ +public: + explicit FieldVisitorReplaceNull( + const Field & replacement_, size_t num_dimensions_) + : replacement(replacement_) + , num_dimensions(num_dimensions_) + { + } + + Field operator()(const Null &) const + { + return num_dimensions + ? createEmptyArrayField(num_dimensions) + : replacement; + } + + Field operator()(const Array & x) const + { + assert(num_dimensions > 0); + const size_t size = x.size(); + Array res(size); + for (size_t i = 0; i < size; ++i) + res[i] = applyVisitor(FieldVisitorReplaceNull(replacement, num_dimensions - 1), x[i]); + return res; + } + + template + Field operator()(const T & x) const { return x; } + +private: + const Field & replacement; + size_t num_dimensions; +}; + +/// Calculates number of dimensions in array field. +/// Returns 0 for scalar fields. +class FieldVisitorToNumberOfDimensions : public StaticVisitor +{ +public: + size_t operator()(const Array & x) const + { + const size_t size = x.size(); + std::optional dimensions; + + for (size_t i = 0; i < size; ++i) + { + /// Do not count Nulls, because they will be replaced by default + /// values with proper number of dimensions. 
+ if (x[i].isNull()) + continue; + + size_t current_dimensions = applyVisitor(*this, x[i]); + if (!dimensions) + dimensions = current_dimensions; + else if (current_dimensions != *dimensions) + throw Exception(ErrorCodes::NUMBER_OF_DIMENSIONS_MISMATHED, + "Number of dimensions mismatched among array elements"); + } + + return 1 + dimensions.value_or(0); + } + + template + size_t operator()(const T &) const { return 0; } +}; + +/// Visitor that allows to get type of scalar field +/// or least common type of scalars in array. +/// More optimized version of FieldToDataType. +class FieldVisitorToScalarType : public StaticVisitor<> +{ +public: + using FieldType = Field::Types::Which; + + void operator()(const Array & x) + { + size_t size = x.size(); + for (size_t i = 0; i < size; ++i) + applyVisitor(*this, x[i]); + } + + void operator()(const UInt64 & x) + { + field_types.insert(FieldType::UInt64); + if (x <= std::numeric_limits::max()) + type_indexes.insert(TypeIndex::UInt8); + else if (x <= std::numeric_limits::max()) + type_indexes.insert(TypeIndex::UInt16); + else if (x <= std::numeric_limits::max()) + type_indexes.insert(TypeIndex::UInt32); + else + type_indexes.insert(TypeIndex::UInt64); + } + + void operator()(const Int64 & x) + { + field_types.insert(FieldType::Int64); + if (x <= std::numeric_limits::max() && x >= std::numeric_limits::min()) + type_indexes.insert(TypeIndex::Int8); + else if (x <= std::numeric_limits::max() && x >= std::numeric_limits::min()) + type_indexes.insert(TypeIndex::Int16); + else if (x <= std::numeric_limits::max() && x >= std::numeric_limits::min()) + type_indexes.insert(TypeIndex::Int32); + else + type_indexes.insert(TypeIndex::Int64); + } + + void operator()(const Null &) + { + have_nulls = true; + } + + template + void operator()(const T &) + { + field_types.insert(Field::TypeToEnum>::value); + type_indexes.insert(TypeToTypeIndex>); + } + + DataTypePtr getScalarType() const { return getLeastSupertype(type_indexes, true); } + bool haveNulls() const { return have_nulls; } + bool needConvertField() const { return field_types.size() > 1; } + +private: + TypeIndexSet type_indexes; + std::unordered_set field_types; + bool have_nulls = false; +}; + +} + +FieldInfo getFieldInfo(const Field & field) +{ + FieldVisitorToScalarType to_scalar_type_visitor; + applyVisitor(to_scalar_type_visitor, field); + + return + { + to_scalar_type_visitor.getScalarType(), + to_scalar_type_visitor.haveNulls(), + to_scalar_type_visitor.needConvertField(), + applyVisitor(FieldVisitorToNumberOfDimensions(), field), + }; +} + +ColumnObject::Subcolumn::Subcolumn(MutableColumnPtr && data_, bool is_nullable_) + : least_common_type(getDataTypeByColumn(*data_)) + , is_nullable(is_nullable_) +{ + data.push_back(std::move(data_)); +} + +ColumnObject::Subcolumn::Subcolumn( + size_t size_, bool is_nullable_) + : least_common_type(std::make_shared()) + , is_nullable(is_nullable_) + , num_of_defaults_in_prefix(size_) +{ +} + +size_t ColumnObject::Subcolumn::Subcolumn::size() const +{ + size_t res = num_of_defaults_in_prefix; + for (const auto & part : data) + res += part->size(); + return res; +} + +size_t ColumnObject::Subcolumn::Subcolumn::byteSize() const +{ + size_t res = 0; + for (const auto & part : data) + res += part->byteSize(); + return res; +} + +size_t ColumnObject::Subcolumn::Subcolumn::allocatedBytes() const +{ + size_t res = 0; + for (const auto & part : data) + res += part->allocatedBytes(); + return res; +} + +void ColumnObject::Subcolumn::checkTypes() const +{ + DataTypes 
prefix_types; + prefix_types.reserve(data.size()); + for (size_t i = 0; i < data.size(); ++i) + { + auto current_type = getDataTypeByColumn(*data[i]); + prefix_types.push_back(current_type); + auto prefix_common_type = getLeastSupertype(prefix_types); + if (!prefix_common_type->equals(*current_type)) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Data type {} of column at position {} cannot represent all columns from i-th prefix", + current_type->getName(), i); + } +} + +void ColumnObject::Subcolumn::insert(Field field) +{ + auto info = getFieldInfo(field); + insert(std::move(field), std::move(info)); +} + +void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) +{ + auto base_type = info.scalar_type; + + if (isNothing(base_type) && info.num_dimensions == 0) + { + insertDefault(); + return; + } + + auto column_dim = getNumberOfDimensions(*least_common_type); + auto value_dim = info.num_dimensions; + + if (isNothing(least_common_type)) + column_dim = value_dim; + + if (field.isNull()) + value_dim = column_dim; + + if (value_dim != column_dim) + throw Exception(ErrorCodes::NUMBER_OF_DIMENSIONS_MISMATHED, + "Dimension of types mismatched between inserted value and column. " + "Dimension of value: {}. Dimension of column: {}", + value_dim, column_dim); + + if (is_nullable) + base_type = makeNullable(base_type); + + if (!is_nullable && info.have_nulls) + field = applyVisitor(FieldVisitorReplaceNull(base_type->getDefault(), value_dim), std::move(field)); + + auto value_type = createArrayOfType(base_type, value_dim); + bool type_changed = false; + + if (data.empty()) + { + data.push_back(value_type->createColumn()); + least_common_type = value_type; + } + else if (!least_common_type->equals(*value_type)) + { + value_type = getLeastSupertype(DataTypes{value_type, least_common_type}, true); + type_changed = true; + if (!least_common_type->equals(*value_type)) + { + data.push_back(value_type->createColumn()); + least_common_type = value_type; + } + } + + if (type_changed || info.need_convert) + field = convertFieldToTypeOrThrow(field, *value_type); + + data.back()->insert(field); +} + +void ColumnObject::Subcolumn::insertRangeFrom(const Subcolumn & src, size_t start, size_t length) +{ + assert(src.isFinalized()); + + const auto & src_column = src.data.back(); + const auto & src_type = src.least_common_type; + + if (data.empty()) + { + least_common_type = src_type; + data.push_back(src_type->createColumn()); + data.back()->insertRangeFrom(*src_column, start, length); + } + else if (least_common_type->equals(*src_type)) + { + data.back()->insertRangeFrom(*src_column, start, length); + } + else + { + auto new_least_common_type = getLeastSupertype(DataTypes{least_common_type, src_type}, true); + auto casted_column = castColumn({src_column, src_type, ""}, new_least_common_type); + + if (!least_common_type->equals(*new_least_common_type)) + { + least_common_type = new_least_common_type; + data.push_back(least_common_type->createColumn()); + } + + data.back()->insertRangeFrom(*casted_column, start, length); + } +} + +void ColumnObject::Subcolumn::finalize() +{ + if (isFinalized() || data.empty()) + return; + + const auto & to_type = least_common_type; + auto result_column = to_type->createColumn(); + + if (num_of_defaults_in_prefix) + result_column->insertManyDefaults(num_of_defaults_in_prefix); + + for (auto & part : data) + { + auto from_type = getDataTypeByColumn(*part); + size_t part_size = part->size(); + + if (!from_type->equals(*to_type)) + { + auto offsets = ColumnUInt64::create(); + 
auto & offsets_data = offsets->getData(); + + /// We need to convert only non-default values and then recreate column + /// with default value of new type, because default values (which represents misses in data) + /// may be inconsistent between types (e.g "0" in UInt64 and empty string in String). + + part->getIndicesOfNonDefaultRows(offsets_data, 0, part_size); + + if (offsets->size() == part_size) + { + part = castColumn({part, from_type, ""}, to_type); + } + else + { + auto values = part->index(*offsets, offsets->size()); + values = castColumn({values, from_type, ""}, to_type); + part = values->createWithOffsets(offsets_data, to_type->getDefault(), part_size, /*shift=*/ 0); + } + } + + result_column->insertRangeFrom(*part, 0, part_size); + } + + data = { std::move(result_column) }; + num_of_defaults_in_prefix = 0; +} + +void ColumnObject::Subcolumn::insertDefault() +{ + if (data.empty()) + ++num_of_defaults_in_prefix; + else + data.back()->insertDefault(); +} + +void ColumnObject::Subcolumn::insertManyDefaults(size_t length) +{ + if (data.empty()) + num_of_defaults_in_prefix += length; + else + data.back()->insertManyDefaults(length); +} + +void ColumnObject::Subcolumn::popBack(size_t n) +{ + assert(n <= size()); + + size_t num_removed = 0; + for (auto it = data.rbegin(); it != data.rend(); ++it) + { + if (n == 0) + break; + + auto & column = *it; + if (n < column->size()) + { + column->popBack(n); + n = 0; + } + else + { + ++num_removed; + n -= column->size(); + } + } + + data.resize(data.size() - num_removed); + num_of_defaults_in_prefix -= n; +} + +Field ColumnObject::Subcolumn::getLastField() const +{ + if (data.empty()) + return Field(); + + const auto & last_part = data.back(); + assert(!last_part->empty()); + return (*last_part)[last_part->size() - 1]; +} + +ColumnObject::Subcolumn ColumnObject::Subcolumn::recreateWithDefaultValues(const FieldInfo & field_info) const +{ + auto scalar_type = field_info.scalar_type; + if (is_nullable) + scalar_type = makeNullable(scalar_type); + + Subcolumn new_subcolumn; + new_subcolumn.least_common_type = createArrayOfType(scalar_type, field_info.num_dimensions); + new_subcolumn.is_nullable = is_nullable; + new_subcolumn.num_of_defaults_in_prefix = num_of_defaults_in_prefix; + new_subcolumn.data.reserve(data.size()); + + for (const auto & part : data) + new_subcolumn.data.push_back(recreateColumnWithDefaultValues( + part, scalar_type, field_info.num_dimensions)); + + return new_subcolumn; +} + +IColumn & ColumnObject::Subcolumn::getFinalizedColumn() +{ + assert(isFinalized()); + return *data[0]; +} + +const IColumn & ColumnObject::Subcolumn::getFinalizedColumn() const +{ + assert(isFinalized()); + return *data[0]; +} + +const ColumnPtr & ColumnObject::Subcolumn::getFinalizedColumnPtr() const +{ + assert(isFinalized()); + return data[0]; +} + +ColumnObject::ColumnObject(bool is_nullable_) + : is_nullable(is_nullable_) + , num_rows(0) +{ +} + +ColumnObject::ColumnObject(SubcolumnsTree && subcolumns_, bool is_nullable_) + : is_nullable(is_nullable_) + , subcolumns(std::move(subcolumns_)) + , num_rows(subcolumns.empty() ? 0 : (*subcolumns.begin())->data.size()) + +{ + checkConsistency(); +} + +void ColumnObject::checkConsistency() const +{ + if (subcolumns.empty()) + return; + + for (const auto & leaf : subcolumns) + { + if (num_rows != leaf->data.size()) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Sizes of subcolumns are inconsistent in ColumnObject." 
+ " Subcolumn '{}' has {} rows, but expected size is {}", + leaf->path.getPath(), leaf->data.size(), num_rows); + } + } +} + +size_t ColumnObject::size() const +{ +#ifndef NDEBUG + checkConsistency(); +#endif + return num_rows; +} + +MutableColumnPtr ColumnObject::cloneResized(size_t new_size) const +{ + /// cloneResized with new_size == 0 is used for cloneEmpty(). + if (new_size != 0) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "ColumnObject doesn't support resize to non-zero length"); + + return ColumnObject::create(is_nullable); +} + +size_t ColumnObject::byteSize() const +{ + size_t res = 0; + for (const auto & entry : subcolumns) + res += entry->data.byteSize(); + return res; +} + +size_t ColumnObject::allocatedBytes() const +{ + size_t res = 0; + for (const auto & entry : subcolumns) + res += entry->data.allocatedBytes(); + return res; +} + +void ColumnObject::forEachSubcolumn(ColumnCallback callback) +{ + if (!isFinalized()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot iterate over non-finalized ColumnObject"); + + for (auto & entry : subcolumns) + callback(entry->data.data.back()); +} + +void ColumnObject::insert(const Field & field) +{ + const auto & object = field.get(); + + HashSet inserted; + size_t old_size = size(); + for (const auto & [key_str, value] : object) + { + PathInData key(key_str); + inserted.insert(key_str); + if (!hasSubcolumn(key)) + addSubcolumn(key, old_size); + + auto & subcolumn = getSubcolumn(key); + subcolumn.insert(value); + } + + for (auto & entry : subcolumns) + if (!inserted.has(entry->path.getPath())) + entry->data.insertDefault(); + + ++num_rows; +} + +void ColumnObject::insertDefault() +{ + for (auto & entry : subcolumns) + entry->data.insertDefault(); + + ++num_rows; +} + +Field ColumnObject::operator[](size_t n) const +{ + if (!isFinalized()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get Field from non-finalized ColumnObject"); + + Object object; + for (const auto & entry : subcolumns) + object[entry->path.getPath()] = (*entry->data.data.back())[n]; + + return object; +} + +void ColumnObject::get(size_t n, Field & res) const +{ + if (!isFinalized()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get Field from non-finalized ColumnObject"); + + auto & object = res.get(); + for (const auto & entry : subcolumns) + { + auto it = object.try_emplace(entry->path.getPath()).first; + entry->data.data.back()->get(n, it->second); + } +} + +void ColumnObject::insertRangeFrom(const IColumn & src, size_t start, size_t length) +{ + const auto & src_object = assert_cast(src); + + for (auto & entry : subcolumns) + { + if (src_object.hasSubcolumn(entry->path)) + entry->data.insertRangeFrom(src_object.getSubcolumn(entry->path), start, length); + else + entry->data.insertManyDefaults(length); + } + + num_rows += length; + finalize(); +} + +ColumnPtr ColumnObject::replicate(const Offsets & offsets) const +{ + if (!isFinalized()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot replicate non-finalized ColumnObject"); + + auto res_column = ColumnObject::create(is_nullable); + for (const auto & entry : subcolumns) + { + auto replicated_data = entry->data.data.back()->replicate(offsets)->assumeMutable(); + res_column->addSubcolumn(entry->path, std::move(replicated_data)); + } + + return res_column; +} + +void ColumnObject::popBack(size_t length) +{ + for (auto & entry : subcolumns) + entry->data.popBack(length); + + num_rows -= length; +} + +const ColumnObject::Subcolumn & ColumnObject::getSubcolumn(const PathInData & key) const +{ + if 
(const auto * node = subcolumns.findLeaf(key)) + return node->data; + + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in ColumnObject", key.getPath()); +} + +ColumnObject::Subcolumn & ColumnObject::getSubcolumn(const PathInData & key) +{ + if (const auto * node = subcolumns.findLeaf(key)) + return const_cast(node)->data; + + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in ColumnObject", key.getPath()); +} + +bool ColumnObject::hasSubcolumn(const PathInData & key) const +{ + return subcolumns.findLeaf(key) != nullptr; +} + +void ColumnObject::addSubcolumn(const PathInData & key, MutableColumnPtr && subcolumn) +{ + size_t new_size = subcolumn->size(); + bool inserted = subcolumns.add(key, Subcolumn(std::move(subcolumn), is_nullable)); + + if (!inserted) + throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Subcolumn '{}' already exists", key.getPath()); + + if (num_rows == 0) + num_rows = new_size; + else if (new_size != num_rows) + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, + "Size of subcolumn {} ({}) is inconsistent with column size ({})", + key.getPath(), new_size, num_rows); +} + +void ColumnObject::addSubcolumn(const PathInData & key, size_t new_size) +{ + bool inserted = subcolumns.add(key, Subcolumn(new_size, is_nullable)); + if (!inserted) + throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Subcolumn '{}' already exists", key.getPath()); + + if (num_rows == 0) + num_rows = new_size; + else if (new_size != num_rows) + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, + "Required size of subcolumn {} ({}) is inconsistent with column size ({})", + key.getPath(), new_size, num_rows); +} + +void ColumnObject::addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size) +{ + if (!key.hasNested()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Cannot add Nested subcolumn, because path doesn't contain Nested"); + + bool inserted = false; + /// We find node that represents the same Nested type as @key. + const auto * nested_node = subcolumns.findBestMatch(key); + + if (nested_node) + { + /// Find any leaf of Nested subcolumn. + const auto * leaf = subcolumns.findLeaf(nested_node, [&](const auto &) { return true; }); + assert(leaf); + + /// Recreate subcolumn with default values and the same sizes of arrays. + auto new_subcolumn = leaf->data.recreateWithDefaultValues(field_info); + + /// It's possible that we have already inserted value from current row + /// to this subcolumn. So, adjust size to expected. + if (new_subcolumn.size() > new_size) + new_subcolumn.popBack(new_subcolumn.size() - new_size); + + assert(new_subcolumn.size() == new_size); + inserted = subcolumns.add(key, new_subcolumn); + } + else + { + /// If node was not found just add subcolumn with empty arrays. 
+ inserted = subcolumns.add(key, Subcolumn(new_size, is_nullable)); + } + + if (!inserted) + throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Subcolumn '{}' already exists", key.getPath()); + + if (num_rows == 0) + num_rows = new_size; +} + +PathsInData ColumnObject::getKeys() const +{ + PathsInData keys; + keys.reserve(subcolumns.size()); + for (const auto & entry : subcolumns) + keys.emplace_back(entry->path); + return keys; +} + +bool ColumnObject::isFinalized() const +{ + return std::all_of(subcolumns.begin(), subcolumns.end(), + [](const auto & entry) { return entry->data.isFinalized(); }); +} + +void ColumnObject::finalize() +{ + size_t old_size = size(); + SubcolumnsTree new_subcolumns; + for (auto && entry : subcolumns) + { + const auto & least_common_type = entry->data.getLeastCommonType(); + + /// Do not add subcolumns that consist only of NULLs. + if (isNothing(getBaseTypeOfArray(least_common_type))) + continue; + + entry->data.finalize(); + new_subcolumns.add(entry->path, entry->data); + } + + /// If all subcolumns were skipped, add a dummy subcolumn, + /// because Tuple type must have at least one element. + if (new_subcolumns.empty()) + new_subcolumns.add(PathInData{COLUMN_NAME_DUMMY}, Subcolumn{ColumnUInt8::create(old_size, 0), is_nullable}); + + std::swap(subcolumns, new_subcolumns); + checkObjectHasNoAmbiguosPaths(getKeys()); +} + +} diff --git a/src/Columns/ColumnObject.h b/src/Columns/ColumnObject.h new file mode 100644 index 00000000000..06d946f2ea8 --- /dev/null +++ b/src/Columns/ColumnObject.h @@ -0,0 +1,219 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +/// Info that represents a scalar or array field in a decomposed view. +/// It allows recreating a field with a different number +/// of dimensions or nullability. +struct FieldInfo +{ + /// The common type of all scalars in the field. + DataTypePtr scalar_type; + + /// Do we have a NULL scalar in the field. + bool have_nulls; + + /// If true then we have scalars with different types in the array and + /// we need to convert scalars to the common type. + bool need_convert; + + /// Number of dimensions in the array. 0 if the field is scalar. + size_t num_dimensions; +}; + +FieldInfo getFieldInfo(const Field & field); + +/** A column that represents an object with a dynamic set of subcolumns. + * Subcolumns are identified by paths in the document and are stored in + * a trie-like structure. ColumnObject is not suitable for writing into tables + * and it should be converted to a Tuple with a fixed set of subcolumns before that. + */ +class ColumnObject final : public COWHelper +{ +public: + /** Class that represents one subcolumn. + * It stores values in several parts of the column + * and keeps the current common type of all parts. + * We add a new column part with a new type, when we insert a field + * which can't be converted to the current common type. + * After insertion of all values the subcolumn should be finalized + * for writing and other operations.
+ */ + class Subcolumn + { + public: + Subcolumn() = default; + Subcolumn(size_t size_, bool is_nullable_); + Subcolumn(MutableColumnPtr && data_, bool is_nullable_); + + size_t size() const; + size_t byteSize() const; + size_t allocatedBytes() const; + + bool isFinalized() const { return data.size() == 1 && num_of_defaults_in_prefix == 0; } + const DataTypePtr & getLeastCommonType() const { return least_common_type; } + + /// Checks the consistency of column's parts stored in @data. + void checkTypes() const; + + /// Inserts a field whose scalars can be arbitrary, but the number of + /// dimensions must be consistent with the current common type. + void insert(Field field); + void insert(Field field, FieldInfo info); + + void insertDefault(); + void insertManyDefaults(size_t length); + void insertRangeFrom(const Subcolumn & src, size_t start, size_t length); + void popBack(size_t n); + + /// Converts all parts of the column to the common type and + /// creates a single column that stores all values. + void finalize(); + + /// Returns the last inserted field. + Field getLastField() const; + + /// Recreates the subcolumn with default scalar values, keeping the sizes of arrays. + /// Used to create columns of type Nested with consistent array sizes. + Subcolumn recreateWithDefaultValues(const FieldInfo & field_info) const; + + /// Returns the single column if the subcolumn is finalized. + /// Otherwise the behaviour is undefined. + IColumn & getFinalizedColumn(); + const IColumn & getFinalizedColumn() const; + const ColumnPtr & getFinalizedColumnPtr() const; + + friend class ColumnObject; + + private: + /// Current least common type of all values inserted to this subcolumn. + DataTypePtr least_common_type; + + /// If true, the common type of the subcolumn is Nullable + /// and default values are NULLs. + bool is_nullable = false; + + /// Parts of the column. Parts should be in increasing order in terms of subtypes/supertypes: + /// the least common type of the i-th prefix is the type of the i-th part, + /// which is a supertype of the types of parts 0 to i-1. + std::vector<WrappedPtr> data; + + /// Until we insert any non-default field, we don't know the final + /// least common type, so we only count the number of defaults in the prefix, + /// which will later be converted to defaults of the final common type. + size_t num_of_defaults_in_prefix = 0; + }; + + using SubcolumnsTree = SubcolumnsTree<Subcolumn>; + +private: + /// If true then all subcolumns are nullable. + const bool is_nullable; + + SubcolumnsTree subcolumns; + size_t num_rows; + +public: + static constexpr auto COLUMN_NAME_DUMMY = "_dummy"; + + explicit ColumnObject(bool is_nullable_); + ColumnObject(SubcolumnsTree && subcolumns_, bool is_nullable_); + + /// Checks that all subcolumns have consistent sizes. + void checkConsistency() const; + + bool hasSubcolumn(const PathInData & key) const; + + const Subcolumn & getSubcolumn(const PathInData & key) const; + Subcolumn & getSubcolumn(const PathInData & key); + + void incrementNumRows() { ++num_rows; } + + /// Adds a subcolumn from an existing IColumn. + void addSubcolumn(const PathInData & key, MutableColumnPtr && subcolumn); + + /// Adds a subcolumn of a specific size with default values. + void addSubcolumn(const PathInData & key, size_t new_size); + + /// Adds a subcolumn of type Nested of a specific size with default values. + /// It takes care of the consistency of sizes of Nested arrays.
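The num_of_defaults_in_prefix bookkeeping is easiest to see in isolation. The following self-contained toy model is an editorial sketch, not code from this patch; strings stand in for typed column parts. It shows why defaults inserted before the first real value can only be counted and must be materialized once the common type becomes known.

```cpp
#include <cstddef>
#include <string>
#include <vector>

/// Toy model of the "defaults in prefix" idea: defaults inserted before
/// the first real value are only counted, because their final type is
/// still unknown; they are materialized once a typed value arrives.
struct ToySubcolumn
{
    std::size_t defaults_in_prefix = 0;
    bool has_type = false;
    std::vector<std::string> values; /// stands in for typed column parts

    void insertDefault()
    {
        if (!has_type)
            ++defaults_in_prefix; /// type unknown yet, just count
        else
            values.emplace_back(); /// default of the known type
    }

    void insert(std::string value)
    {
        if (!has_type)
        {
            has_type = true;
            values.assign(defaults_in_prefix, std::string{}); /// materialize the prefix
            defaults_in_prefix = 0;
        }
        values.push_back(std::move(value));
    }
};
```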
+ void addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size); + + const SubcolumnsTree & getSubcolumns() const { return subcolumns; } + SubcolumnsTree & getSubcolumns() { return subcolumns; } + PathsInData getKeys() const; + + /// Finalizes all subcolumns. + void finalize(); + bool isFinalized() const; + + /// Part of interface + + const char * getFamilyName() const override { return "Object"; } + TypeIndex getDataType() const override { return TypeIndex::Object; } + + size_t size() const override; + MutableColumnPtr cloneResized(size_t new_size) const override; + size_t byteSize() const override; + size_t allocatedBytes() const override; + void forEachSubcolumn(ColumnCallback callback) override; + void insert(const Field & field) override; + void insertDefault() override; + void insertRangeFrom(const IColumn & src, size_t start, size_t length) override; + ColumnPtr replicate(const Offsets & offsets) const override; + void popBack(size_t length) override; + Field operator[](size_t n) const override; + void get(size_t n, Field & res) const override; + + /// All other methods throw exception. + + ColumnPtr decompress() const override { throwMustBeConcrete(); } + StringRef getDataAt(size_t) const override { throwMustBeConcrete(); } + bool isDefaultAt(size_t) const override { throwMustBeConcrete(); } + void insertData(const char *, size_t) override { throwMustBeConcrete(); } + StringRef serializeValueIntoArena(size_t, Arena &, char const *&) const override { throwMustBeConcrete(); } + const char * deserializeAndInsertFromArena(const char *) override { throwMustBeConcrete(); } + const char * skipSerializedInArena(const char *) const override { throwMustBeConcrete(); } + void updateHashWithValue(size_t, SipHash &) const override { throwMustBeConcrete(); } + void updateWeakHash32(WeakHash32 &) const override { throwMustBeConcrete(); } + void updateHashFast(SipHash &) const override { throwMustBeConcrete(); } + ColumnPtr filter(const Filter &, ssize_t) const override { throwMustBeConcrete(); } + void expand(const Filter &, bool) override { throwMustBeConcrete(); } + ColumnPtr permute(const Permutation &, size_t) const override { throwMustBeConcrete(); } + ColumnPtr index(const IColumn &, size_t) const override { throwMustBeConcrete(); } + int compareAt(size_t, size_t, const IColumn &, int) const override { throwMustBeConcrete(); } + void compareColumn(const IColumn &, size_t, PaddedPODArray *, PaddedPODArray &, int, int) const override { throwMustBeConcrete(); } + bool hasEqualValues() const override { throwMustBeConcrete(); } + void getPermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &) const override { throwMustBeConcrete(); } + void updatePermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &, EqualRanges &) const override { throwMustBeConcrete(); } + MutableColumns scatter(ColumnIndex, const Selector &) const override { throwMustBeConcrete(); } + void gather(ColumnGathererStream &) override { throwMustBeConcrete(); } + void getExtremes(Field &, Field &) const override { throwMustBeConcrete(); } + size_t byteSizeAt(size_t) const override { throwMustBeConcrete(); } + double getRatioOfDefaultRows(double) const override { throwMustBeConcrete(); } + void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeConcrete(); } + +private: + [[noreturn]] static void throwMustBeConcrete() + { + throw Exception("ColumnObject must be converted to ColumnTuple before use", 
ErrorCodes::LOGICAL_ERROR); + } +}; + +} diff --git a/src/Columns/ColumnSparse.cpp b/src/Columns/ColumnSparse.cpp index 79ec06d7882..d2cc8223a91 100644 --- a/src/Columns/ColumnSparse.cpp +++ b/src/Columns/ColumnSparse.cpp @@ -288,7 +288,7 @@ void ColumnSparse::popBack(size_t n) ColumnPtr ColumnSparse::filter(const Filter & filt, ssize_t) const { if (_size != filt.size()) - throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), _size); if (offsets->empty()) { diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index 7431637ff0e..dded5ff6c99 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -381,7 +381,7 @@ ColumnPtr ColumnVector::filter(const IColumn::Filter & filt, ssize_t result_s { size_t size = data.size(); if (size != filt.size()) - throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size); auto res = this->create(); Container & res_data = res->getData(); @@ -450,7 +450,7 @@ void ColumnVector::applyZeroMap(const IColumn::Filter & filt, bool inverted) { size_t size = data.size(); if (size != filt.size()) - throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size); const UInt8 * filt_pos = filt.data(); const UInt8 * filt_end = filt_pos + size; diff --git a/src/Columns/ColumnsCommon.cpp b/src/Columns/ColumnsCommon.cpp index 701b888fb25..0a9201f7543 100644 --- a/src/Columns/ColumnsCommon.cpp +++ b/src/Columns/ColumnsCommon.cpp @@ -192,7 +192,7 @@ namespace { const size_t size = src_offsets.size(); if (size != filt.size()) - throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH); + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size); ResultOffsetsBuilder result_offsets_builder(res_offsets); diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h index a5b2bbedc8a..50afc90f481 100644 --- a/src/Common/DateLUTImpl.h +++ b/src/Common/DateLUTImpl.h @@ -883,8 +883,8 @@ public: return toDayNum(years_lut[year - DATE_LUT_MIN_YEAR]); } - template || std::is_same_v>> + template + requires std::is_same_v || std::is_same_v inline auto toStartOfQuarterInterval(Date d, UInt64 quarters) const { if (quarters == 1) @@ -892,8 +892,8 @@ public: return toStartOfMonthInterval(d, quarters * 3); } - template || std::is_same_v>> + template + requires std::is_same_v || std::is_same_v inline auto toStartOfMonthInterval(Date d, UInt64 months) const { if (months == 1) @@ -906,8 +906,8 @@ public: return toDayNum(years_months_lut[month_total_index / months * months]); } - template || std::is_same_v>> + template + requires std::is_same_v || std::is_same_v inline auto toStartOfWeekInterval(Date d, UInt64 weeks) const { if (weeks == 1) @@ -920,8 +920,8 @@ public: return ExtendedDayNum(4 + (d - 4) / days * days); } - template || std::is_same_v>> + template + requires std::is_same_v || std::is_same_v inline Time toStartOfDayInterval(Date d, UInt64 days) const { if (days == 1) @@ -1219,10 
+1219,8 @@ public: /// If resulting month has less deys than source month, then saturation can happen. /// Example: 31 Aug + 1 month = 30 Sep. - template < - typename DateTime, - typename - = std::enable_if_t || std::is_same_v || std::is_same_v>> + template + requires std::is_same_v || std::is_same_v || std::is_same_v inline Time NO_SANITIZE_UNDEFINED addMonths(DateTime t, Int64 delta) const { const auto result_day = addMonthsIndex(t, delta); @@ -1247,8 +1245,8 @@ public: return res; } - template || std::is_same_v>> + template + requires std::is_same_v || std::is_same_v inline auto NO_SANITIZE_UNDEFINED addMonths(Date d, Int64 delta) const { if constexpr (std::is_same_v) @@ -1280,10 +1278,8 @@ public: } /// Saturation can occur if 29 Feb is mapped to non-leap year. - template < - typename DateTime, - typename - = std::enable_if_t || std::is_same_v || std::is_same_v>> + template + requires std::is_same_v || std::is_same_v || std::is_same_v inline Time addYears(DateTime t, Int64 delta) const { auto result_day = addYearsIndex(t, delta); @@ -1308,8 +1304,8 @@ public: return res; } - template || std::is_same_v>> + template + requires std::is_same_v || std::is_same_v inline auto addYears(Date d, Int64 delta) const { if constexpr (std::is_same_v) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 09167ea4849..3ed13ac04ea 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -613,6 +613,7 @@ M(642, CANNOT_PACK_ARCHIVE) \ M(643, CANNOT_UNPACK_ARCHIVE) \ M(644, REMOTE_FS_OBJECT_CACHE_ERROR) \ + M(645, NUMBER_OF_DIMENSIONS_MISMATHED) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Common/Exception.h b/src/Common/Exception.h index fc4f6d1295d..0bf89e7a447 100644 --- a/src/Common/Exception.h +++ b/src/Common/Exception.h @@ -205,7 +205,8 @@ void rethrowFirstException(const Exceptions & exceptions); template -std::enable_if_t, T> exception_cast(std::exception_ptr e) +requires std::is_pointer_v +T exception_cast(std::exception_ptr e) { try { diff --git a/src/Common/FieldVisitorConvertToNumber.h b/src/Common/FieldVisitorConvertToNumber.h index 7bbb7f0708a..92da0f89844 100644 --- a/src/Common/FieldVisitorConvertToNumber.h +++ b/src/Common/FieldVisitorConvertToNumber.h @@ -46,6 +46,11 @@ public: throw Exception("Cannot convert Map to " + demangle(typeid(T).name()), ErrorCodes::CANNOT_CONVERT_TYPE); } + T operator() (const Object &) const + { + throw Exception("Cannot convert Object to " + demangle(typeid(T).name()), ErrorCodes::CANNOT_CONVERT_TYPE); + } + T operator() (const UInt64 & x) const { return T(x); } T operator() (const Int64 & x) const { return T(x); } T operator() (const Int128 & x) const { return T(x); } @@ -113,7 +118,8 @@ public: throw Exception("Cannot convert AggregateFunctionStateData to " + demangle(typeid(T).name()), ErrorCodes::CANNOT_CONVERT_TYPE); } - template > > + template + requires is_big_int_v T operator() (const U & x) const { if constexpr (is_decimal) diff --git a/src/Common/FieldVisitorDump.cpp b/src/Common/FieldVisitorDump.cpp index 6c869e05fd4..fc3d56c3503 100644 --- a/src/Common/FieldVisitorDump.cpp +++ b/src/Common/FieldVisitorDump.cpp @@ -95,6 +95,23 @@ String FieldVisitorDump::operator() (const Map & x) const return wb.str(); } +String FieldVisitorDump::operator() (const Object & x) const +{ + WriteBufferFromOwnString wb; + + wb << "Object_("; + for (auto it = x.begin(); it != x.end(); ++it) + { + if (it != x.begin()) + wb << ", "; + wb << "(" << it->first << ", " << applyVisitor(*this, it->second) << ")"; 
+ } + wb << ')'; + + return wb.str(); + +} + String FieldVisitorDump::operator() (const AggregateFunctionStateData & x) const { WriteBufferFromOwnString wb; diff --git a/src/Common/FieldVisitorDump.h b/src/Common/FieldVisitorDump.h index 0b1b311999e..dc67ccf7da3 100644 --- a/src/Common/FieldVisitorDump.h +++ b/src/Common/FieldVisitorDump.h @@ -22,6 +22,7 @@ public: String operator() (const Array & x) const; String operator() (const Tuple & x) const; String operator() (const Map & x) const; + String operator() (const Object & x) const; String operator() (const DecimalField & x) const; String operator() (const DecimalField & x) const; String operator() (const DecimalField & x) const; diff --git a/src/Common/FieldVisitorHash.cpp b/src/Common/FieldVisitorHash.cpp index 09b8b7908f3..b6750fdcd03 100644 --- a/src/Common/FieldVisitorHash.cpp +++ b/src/Common/FieldVisitorHash.cpp @@ -94,6 +94,19 @@ void FieldVisitorHash::operator() (const Array & x) const applyVisitor(*this, elem); } +void FieldVisitorHash::operator() (const Object & x) const +{ + UInt8 type = Field::Types::Object; + hash.update(type); + hash.update(x.size()); + + for (const auto & [key, value]: x) + { + hash.update(key); + applyVisitor(*this, value); + } +} + void FieldVisitorHash::operator() (const DecimalField & x) const { UInt8 type = Field::Types::Decimal32; diff --git a/src/Common/FieldVisitorHash.h b/src/Common/FieldVisitorHash.h index 82e831b961e..e574b0456eb 100644 --- a/src/Common/FieldVisitorHash.h +++ b/src/Common/FieldVisitorHash.h @@ -28,6 +28,7 @@ public: void operator() (const Array & x) const; void operator() (const Tuple & x) const; void operator() (const Map & x) const; + void operator() (const Object & x) const; void operator() (const DecimalField & x) const; void operator() (const DecimalField & x) const; void operator() (const DecimalField & x) const; diff --git a/src/Common/FieldVisitorSum.cpp b/src/Common/FieldVisitorSum.cpp index c3d7f4f8462..a8cb694d930 100644 --- a/src/Common/FieldVisitorSum.cpp +++ b/src/Common/FieldVisitorSum.cpp @@ -26,6 +26,7 @@ bool FieldVisitorSum::operator() (String &) const { throw Exception("Cannot sum bool FieldVisitorSum::operator() (Array &) const { throw Exception("Cannot sum Arrays", ErrorCodes::LOGICAL_ERROR); } bool FieldVisitorSum::operator() (Tuple &) const { throw Exception("Cannot sum Tuples", ErrorCodes::LOGICAL_ERROR); } bool FieldVisitorSum::operator() (Map &) const { throw Exception("Cannot sum Maps", ErrorCodes::LOGICAL_ERROR); } +bool FieldVisitorSum::operator() (Object &) const { throw Exception("Cannot sum Objects", ErrorCodes::LOGICAL_ERROR); } bool FieldVisitorSum::operator() (UUID &) const { throw Exception("Cannot sum UUIDs", ErrorCodes::LOGICAL_ERROR); } bool FieldVisitorSum::operator() (AggregateFunctionStateData &) const diff --git a/src/Common/FieldVisitorSum.h b/src/Common/FieldVisitorSum.h index 3e868e46f71..247ef5b43da 100644 --- a/src/Common/FieldVisitorSum.h +++ b/src/Common/FieldVisitorSum.h @@ -25,6 +25,7 @@ public: bool operator() (Array &) const; bool operator() (Tuple &) const; bool operator() (Map &) const; + bool operator() (Object &) const; bool operator() (UUID &) const; bool operator() (AggregateFunctionStateData &) const; bool operator() (bool &) const; @@ -36,7 +37,8 @@ public: return x.getValue() != T(0); } - template > > + template + requires is_big_int_v bool operator() (T & x) const { x += rhs.reinterpret(); diff --git a/src/Common/FieldVisitorToString.cpp b/src/Common/FieldVisitorToString.cpp index 6cc83f32a52..7d93cfba78f 100644 
--- a/src/Common/FieldVisitorToString.cpp +++ b/src/Common/FieldVisitorToString.cpp @@ -126,5 +126,24 @@ String FieldVisitorToString::operator() (const Map & x) const return wb.str(); } +String FieldVisitorToString::operator() (const Object & x) const +{ + WriteBufferFromOwnString wb; + + wb << '{'; + for (auto it = x.begin(); it != x.end(); ++it) + { + if (it != x.begin()) + wb << ", "; + + writeDoubleQuoted(it->first, wb); + wb << ": " << applyVisitor(*this, it->second); + } + wb << '}'; + + return wb.str(); + +} + } diff --git a/src/Common/FieldVisitorToString.h b/src/Common/FieldVisitorToString.h index 991f7b4b2d7..324a4aa73d5 100644 --- a/src/Common/FieldVisitorToString.h +++ b/src/Common/FieldVisitorToString.h @@ -22,6 +22,7 @@ public: String operator() (const Array & x) const; String operator() (const Tuple & x) const; String operator() (const Map & x) const; + String operator() (const Object & x) const; String operator() (const DecimalField & x) const; String operator() (const DecimalField & x) const; String operator() (const DecimalField & x) const; diff --git a/src/Common/FieldVisitorWriteBinary.cpp b/src/Common/FieldVisitorWriteBinary.cpp index fc17b58b334..edabd26fd3a 100644 --- a/src/Common/FieldVisitorWriteBinary.cpp +++ b/src/Common/FieldVisitorWriteBinary.cpp @@ -66,6 +66,20 @@ void FieldVisitorWriteBinary::operator() (const Map & x, WriteBuffer & buf) cons } } +void FieldVisitorWriteBinary::operator() (const Object & x, WriteBuffer & buf) const +{ + const size_t size = x.size(); + writeBinary(size, buf); + + for (const auto & [key, value] : x) + { + const UInt8 type = value.getType(); + writeBinary(type, buf); + writeBinary(key, buf); + Field::dispatch([&buf] (const auto & val) { FieldVisitorWriteBinary()(val, buf); }, value); + } +} + void FieldVisitorWriteBinary::operator()(const bool & x, WriteBuffer & buf) const { writeBinary(UInt8(x), buf); diff --git a/src/Common/FieldVisitorWriteBinary.h b/src/Common/FieldVisitorWriteBinary.h index 155cf0e1050..ff2740383f7 100644 --- a/src/Common/FieldVisitorWriteBinary.h +++ b/src/Common/FieldVisitorWriteBinary.h @@ -21,6 +21,7 @@ public: void operator() (const Array & x, WriteBuffer & buf) const; void operator() (const Tuple & x, WriteBuffer & buf) const; void operator() (const Map & x, WriteBuffer & buf) const; + void operator() (const Object & x, WriteBuffer & buf) const; void operator() (const DecimalField & x, WriteBuffer & buf) const; void operator() (const DecimalField & x, WriteBuffer & buf) const; void operator() (const DecimalField & x, WriteBuffer & buf) const; diff --git a/src/Common/FileSegment.cpp b/src/Common/FileSegment.cpp index fc39b6f1c57..fb61a5bfc01 100644 --- a/src/Common/FileSegment.cpp +++ b/src/Common/FileSegment.cpp @@ -46,7 +46,16 @@ FileSegment::State FileSegment::state() const size_t FileSegment::getDownloadOffset() const { std::lock_guard segment_lock(mutex); - return range().left + downloaded_size; + return range().left + getDownloadedSize(segment_lock); +} + +size_t FileSegment::getDownloadedSize(std::lock_guard & /* segment_lock */) const +{ + if (download_state == State::DOWNLOADED) + return downloaded_size; + + std::lock_guard download_lock(download_mutex); + return downloaded_size; } String FileSegment::getCallerId() @@ -174,7 +183,12 @@ void FileSegment::write(const char * from, size_t size) try { cache_writer->write(from, size); + + std::lock_guard download_lock(download_mutex); + cache_writer->next(); + + downloaded_size += size; } catch (...) 
{ @@ -189,9 +203,6 @@ void FileSegment::write(const char * from, size_t size) throw; } - - std::lock_guard segment_lock(mutex); - downloaded_size += size; } FileSegment::State FileSegment::wait() @@ -225,15 +236,15 @@ bool FileSegment::reserve(size_t size) { std::lock_guard segment_lock(mutex); + auto caller_id = getCallerId(); + if (downloader_id != caller_id) + throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Space can be reserved only by downloader (current: {}, expected: {})", caller_id, downloader_id); + if (downloaded_size + size > range().size()) throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Attempt to reserve too much space ({}) for file segment with range: {} (downloaded size: {})", size, range().toString(), downloaded_size); - auto caller_id = getCallerId(); - if (downloader_id != caller_id) - throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Space can be reserved only by downloader (current: {}, expected: {})", caller_id, downloader_id); - assert(reserved_size >= downloaded_size); } @@ -323,7 +334,7 @@ void FileSegment::complete() if (download_state == State::SKIP_CACHE || detached) return; - if (downloaded_size == range().size() && download_state != State::DOWNLOADED) + if (download_state != State::DOWNLOADED && getDownloadedSize(segment_lock) == range().size()) setDownloaded(segment_lock); if (download_state == State::DOWNLOADING || download_state == State::EMPTY) @@ -350,10 +361,11 @@ void FileSegment::completeImpl(bool allow_non_strict_checking) if (!download_can_continue) { - if (!downloaded_size) + size_t current_downloaded_size = getDownloadedSize(segment_lock); + if (current_downloaded_size == 0) { download_state = State::SKIP_CACHE; - LOG_TEST(log, "Remove cell {} (downloaded: {})", range().toString(), downloaded_size); + LOG_TEST(log, "Remove cell {} (nothing downloaded)", range().toString()); cache->remove(key(), offset(), cache_lock, segment_lock); detached = true; @@ -366,7 +378,7 @@ void FileSegment::completeImpl(bool allow_non_strict_checking) * in FileSegmentsHolder represent a contiguous range, so we can resize * it only when nobody needs it. */ - LOG_TEST(log, "Resize cell {} to downloaded: {}", range().toString(), downloaded_size); + LOG_TEST(log, "Resize cell {} to downloaded: {}", range().toString(), current_downloaded_size); cache->reduceSizeToDownloaded(key(), offset(), cache_lock, segment_lock); detached = true; @@ -397,7 +409,7 @@ String FileSegment::getInfoForLog() const WriteBufferFromOwnString info; info << "File segment: " << range().toString() << ", "; info << "state: " << download_state << ", "; - info << "downloaded size: " << downloaded_size << ", "; + info << "downloaded size: " << getDownloadedSize(segment_lock) << ", "; info << "downloader id: " << downloader_id << ", "; info << "caller id: " << getCallerId(); diff --git a/src/Common/FileSegment.h b/src/Common/FileSegment.h index 753a30035fb..ff854adf089 100644 --- a/src/Common/FileSegment.h +++ b/src/Common/FileSegment.h @@ -129,6 +129,7 @@ private: void setDownloaded(std::lock_guard & segment_lock); static String getCallerIdImpl(bool allow_non_strict_checking = false); void resetDownloaderImpl(std::lock_guard & segment_lock); + size_t getDownloadedSize(std::lock_guard & segment_lock) const; const Range segment_range; @@ -144,6 +145,14 @@ private: mutable std::mutex mutex; std::condition_variable cv; + /// Protects downloaded_size together with the actual write to the filesystem.
+ /// downloaded_size is not protected by download_mutex in methods that + /// can never run in parallel with FileSegment::write(), + /// because downloaded_size is updated only in FileSegment::write(). + /// Such methods are identified by the isDownloader() check at their start, + /// i.e. they are executed strictly by the same thread, sequentially. + mutable std::mutex download_mutex; + Key file_key; IFileCache * cache; diff --git a/src/Common/FrequencyHolder.h b/src/Common/FrequencyHolder.h index a98ae0452d3..88da9d81309 100644 --- a/src/Common/FrequencyHolder.h +++ b/src/Common/FrequencyHolder.h @@ -1,5 +1,11 @@ #pragma once +#include +#include + +#include +#include + #include #include #include @@ -10,11 +16,6 @@ #include #include -#include -#include - -#include -#include namespace DB { @@ -34,7 +35,6 @@ namespace ErrorCodes class FrequencyHolder { - public: struct Language { @@ -52,6 +52,7 @@ public: using Map = HashMap; using Container = std::vector; + using EncodingMap = HashMap; using EncodingContainer = std::vector; @@ -61,6 +62,30 @@ public: return instance; } + const Map & getEmotionalDict() const + { + return emotional_dict; + } + + const EncodingContainer & getEncodingsFrequency() const + { + return encodings_freq; + } + + const Container & getProgrammingFrequency() const + { + return programming_freq; + } + +private: + + FrequencyHolder() + { + loadEmotionalDict(); + loadEncodingsFrequency(); + loadProgrammingFrequency(); + } + void loadEncodingsFrequency() { Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency"); @@ -119,7 +144,6 @@ public: LOG_TRACE(log, "Charset frequencies were added, charsets count: {}", encodings_freq.size()); } - void loadEmotionalDict() { Poco::Logger * log = &Poco::Logger::get("EmotionalDict"); @@ -158,7 +182,6 @@ public: LOG_TRACE(log, "Emotional dictionary was added.
Word count: {}", std::to_string(count)); } - void loadProgrammingFrequency() { Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency"); @@ -211,42 +234,10 @@ public: LOG_TRACE(log, "Programming language frequencies were added"); } - const Map & getEmotionalDict() - { - std::lock_guard lock(mutex); - if (emotional_dict.empty()) - loadEmotionalDict(); - - return emotional_dict; - } - - - const EncodingContainer & getEncodingsFrequency() - { - std::lock_guard lock(mutex); - if (encodings_freq.empty()) - loadEncodingsFrequency(); - - return encodings_freq; - } - - const Container & getProgrammingFrequency() - { - std::lock_guard lock(mutex); - if (programming_freq.empty()) - loadProgrammingFrequency(); - - return programming_freq; - } - - -private: Arena string_pool; Map emotional_dict; Container programming_freq; EncodingContainer encodings_freq; - - std::mutex mutex; }; } diff --git a/src/Common/IntervalTree.h b/src/Common/IntervalTree.h index b8334653754..e11edde53a9 100644 --- a/src/Common/IntervalTree.h +++ b/src/Common/IntervalTree.h @@ -130,6 +130,7 @@ public: IntervalTree() { nodes.resize(1); } template , bool> = true> + requires std::is_same_v ALWAYS_INLINE bool emplace(Interval interval) { assert(!tree_is_built); diff --git a/src/Common/JSONBuilder.h b/src/Common/JSONBuilder.h index fd4c66b4a9e..9a218fcf08b 100644 --- a/src/Common/JSONBuilder.h +++ b/src/Common/JSONBuilder.h @@ -76,7 +76,8 @@ public: void add(const char * value) { add(std::make_unique(value)); } void add(bool value) { add(std::make_unique(std::move(value))); } - template , bool> = true> + template + requires std::is_arithmetic_v void add(T value) { add(std::make_unique>(value)); } void format(const FormatSettings & settings, FormatContext & context) override; @@ -100,7 +101,8 @@ public: void add(std::string key, std::string_view value) { add(std::move(key), std::make_unique(value)); } void add(std::string key, bool value) { add(std::move(key), std::make_unique(std::move(value))); } - template , bool> = true> + template + requires std::is_arithmetic_v void add(std::string key, T value) { add(std::move(key), std::make_unique>(value)); } void format(const FormatSettings & settings, FormatContext & context) override; diff --git a/src/Functions/DummyJSONParser.h b/src/Common/JSONParsers/DummyJSONParser.h similarity index 100% rename from src/Functions/DummyJSONParser.h rename to src/Common/JSONParsers/DummyJSONParser.h diff --git a/src/Functions/RapidJSONParser.h b/src/Common/JSONParsers/RapidJSONParser.h similarity index 100% rename from src/Functions/RapidJSONParser.h rename to src/Common/JSONParsers/RapidJSONParser.h diff --git a/src/Functions/SimdJSONParser.h b/src/Common/JSONParsers/SimdJSONParser.h similarity index 100% rename from src/Functions/SimdJSONParser.h rename to src/Common/JSONParsers/SimdJSONParser.h diff --git a/src/Common/StringSearcher.h b/src/Common/StringSearcher.h index 5cdbbc22e12..b556ace75a7 100644 --- a/src/Common/StringSearcher.h +++ b/src/Common/StringSearcher.h @@ -82,7 +82,8 @@ private: #endif public: - template > + template + requires (sizeof(CharT) == 1) StringSearcher(const CharT * needle_, const size_t needle_size_) : needle{reinterpret_cast(needle_)}, needle_size{needle_size_} { @@ -191,7 +192,8 @@ public: #endif } - template > + template + requires (sizeof(CharT) == 1) ALWAYS_INLINE bool compareTrivial(const CharT * haystack_pos, const CharT * const haystack_end, const uint8_t * needle_pos) const { while (haystack_pos < haystack_end && needle_pos < needle_end) @@ -217,7 +219,8
@@ public: return needle_pos == needle_end; } - template > + template + requires (sizeof(CharT) == 1) ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * haystack_end, const CharT * pos) const { @@ -262,7 +265,8 @@ public: /** Returns haystack_end if not found. */ - template > + template + requires (sizeof(CharT) == 1) const CharT * search(const CharT * haystack, const CharT * const haystack_end) const { if (0 == needle_size) @@ -338,7 +342,8 @@ public: return haystack_end; } - template > + template + requires (sizeof(CharT) == 1) const CharT * search(const CharT * haystack, const size_t haystack_size) const { return search(haystack, haystack + haystack_size); @@ -367,7 +372,8 @@ private: #endif public: - template > + template + requires (sizeof(CharT) == 1) StringSearcher(const CharT * needle_, const size_t needle_size) : needle{reinterpret_cast(needle_)}, needle_end{needle + needle_size} { @@ -399,7 +405,8 @@ public: #endif } - template > + template + requires (sizeof(CharT) == 1) ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * /*haystack_end*/, const CharT * pos) const { #ifdef __SSE4_1__ @@ -453,7 +460,8 @@ public: return false; } - template > + template + requires (sizeof(CharT) == 1) const CharT * search(const CharT * haystack, const CharT * const haystack_end) const { if (needle == needle_end) @@ -540,7 +548,8 @@ public: return haystack_end; } - template > + template + requires (sizeof(CharT) == 1) const CharT * search(const CharT * haystack, const size_t haystack_size) const { return search(haystack, haystack + haystack_size); @@ -568,7 +577,8 @@ private: #endif public: - template > + template + requires (sizeof(CharT) == 1) StringSearcher(const CharT * needle_, const size_t needle_size) : needle{reinterpret_cast(needle_)}, needle_end{needle + needle_size} { @@ -596,7 +606,8 @@ public: #endif } - template > + template + requires (sizeof(CharT) == 1) ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * /*haystack_end*/, const CharT * pos) const { #ifdef __SSE4_1__ @@ -642,7 +653,8 @@ public: return false; } - template > + template + requires (sizeof(CharT) == 1) const CharT * search(const CharT * haystack, const CharT * const haystack_end) const { if (needle == needle_end) @@ -722,7 +734,8 @@ public: return haystack_end; } - template > + template + requires (sizeof(CharT) == 1) const CharT * search(const CharT * haystack, const size_t haystack_size) const { return search(haystack, haystack + haystack_size); @@ -740,7 +753,8 @@ class TokenSearcher : public StringSearcherBase size_t needle_size; public: - template > + template + requires (sizeof(CharT) == 1) TokenSearcher(const CharT * needle_, const size_t needle_size_) : searcher{needle_, needle_size_}, needle_size(needle_size_) @@ -752,7 +766,8 @@ public: } - template > + template + requires (sizeof(CharT) == 1) ALWAYS_INLINE bool compare(const CharT * haystack, const CharT * haystack_end, const CharT * pos) const { // use searcher only if pos is in the beginning of token and pos + searcher.needle_size is end of token. 
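The comparison above relies on a boundary rule: a needle match counts only when it is flanked by token separators or by the bounds of the haystack. A simplified standalone sketch of that rule follows; the separator predicate here is an assumption for illustration, and the real isTokenSeparator may classify characters differently.

```cpp
#include <cctype>
#include <cstddef>

/// Simplified token-boundary check: a match at `pos` of length `len`
/// counts as a whole token only if both neighbours are separators
/// or the bounds of the string.
static bool is_separator(unsigned char c)
{
    return std::isalnum(c) == 0; /// assumed predicate; the real rule may differ
}

static bool is_whole_token(const char * begin, const char * end,
                           const char * pos, std::size_t len)
{
    return (pos == begin || is_separator(static_cast<unsigned char>(pos[-1])))
        && (pos + len == end || is_separator(static_cast<unsigned char>(pos[len])));
}
```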
@@ -762,7 +777,8 @@ public: return false; } - template > + template + requires (sizeof(CharT) == 1) const CharT * search(const CharT * haystack, const CharT * const haystack_end) const { // use searcher.search(), then verify that returned value is a token @@ -781,13 +797,15 @@ public: return haystack_end; } - template > + template + requires (sizeof(CharT) == 1) const CharT * search(const CharT * haystack, const size_t haystack_size) const { return search(haystack, haystack + haystack_size); } - template > + template + requires (sizeof(CharT) == 1) ALWAYS_INLINE bool isToken(const CharT * haystack, const CharT * const haystack_end, const CharT* p) const { return (p == haystack || isTokenSeparator(*(p - 1))) @@ -819,11 +837,13 @@ struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase { const char * const needle; - template > + template + requires (sizeof(CharT) == 1) LibCASCIICaseSensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */) : needle(reinterpret_cast(needle_)) {} - template > + template + requires (sizeof(CharT) == 1) const CharT * search(const CharT * haystack, const CharT * const haystack_end) const { const auto * res = strstr(reinterpret_cast(haystack), reinterpret_cast(needle)); @@ -832,7 +852,8 @@ struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase return reinterpret_cast(res); } - template > + template + requires (sizeof(CharT) == 1) const CharT * search(const CharT * haystack, const size_t haystack_size) const { return search(haystack, haystack + haystack_size); @@ -843,11 +864,13 @@ struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase { const char * const needle; - template > + template + requires (sizeof(CharT) == 1) LibCASCIICaseInsensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */) : needle(reinterpret_cast(needle_)) {} - template > + template + requires (sizeof(CharT) == 1) const CharT * search(const CharT * haystack, const CharT * const haystack_end) const { const auto * res = strcasestr(reinterpret_cast(haystack), reinterpret_cast(needle)); @@ -856,7 +879,8 @@ struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase return reinterpret_cast(res); } - template > + template + requires (sizeof(CharT) == 1) const CharT * search(const CharT * haystack, const size_t haystack_size) const { return search(haystack, haystack + haystack_size); diff --git a/src/Common/ThreadProfileEvents.cpp b/src/Common/ThreadProfileEvents.cpp index 90a9d6ede23..e28e5d3e85e 100644 --- a/src/Common/ThreadProfileEvents.cpp +++ b/src/Common/ThreadProfileEvents.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -21,6 +20,8 @@ #include #include +#include + #include @@ -247,9 +248,9 @@ static_assert(sizeof(raw_events_info) / sizeof(raw_events_info[0]) == NUMBER_OF_ #undef CACHE_EVENT // A map of event name -> event index, to parse event list in settings. 
-static std::unordered_map populateEventMap() +static std::unordered_map populateEventMap() { - std::unordered_map name_to_index; + std::unordered_map name_to_index; name_to_index.reserve(NUMBER_OF_RAW_EVENTS); for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i) @@ -455,10 +456,10 @@ std::vector PerfEventsCounters::eventIndicesFromString(const std::string return result; } + std::vector event_names; + boost::split(event_names, events_list, [](char c) { return c == ','; }); - std::istringstream iss(events_list); // STYLE_CHECK_ALLOW_STD_STRING_STREAM - std::string event_name; - while (std::getline(iss, event_name, ',')) + for (auto & event_name : event_names) { // Allow spaces at the beginning of the token, so that you can write 'a, b'. event_name.erase(0, event_name.find_first_not_of(' ')); diff --git a/src/Common/UTF8Helpers.h b/src/Common/UTF8Helpers.h index 75cfd8fe070..a940ddc0234 100644 --- a/src/Common/UTF8Helpers.h +++ b/src/Common/UTF8Helpers.h @@ -75,7 +75,8 @@ inline size_t countCodePoints(const UInt8 * data, size_t size) } -template > +template +requires (sizeof(CharT) == 1) size_t convertCodePointToUTF8(int code_point, CharT * out_bytes, size_t out_length) { static const Poco::UTF8Encoding utf8; @@ -84,7 +85,8 @@ size_t convertCodePointToUTF8(int code_point, CharT * out_bytes, size_t out_leng return res; } -template > +template +requires (sizeof(CharT) == 1) std::optional convertUTF8ToCodePoint(const CharT * in_bytes, size_t in_length) { static const Poco::UTF8Encoding utf8; diff --git a/src/Common/config.h.in b/src/Common/config.h.in index edade4ce2be..d8d308c59bd 100644 --- a/src/Common/config.h.in +++ b/src/Common/config.h.in @@ -13,6 +13,9 @@ #cmakedefine01 USE_CASSANDRA #cmakedefine01 USE_SENTRY #cmakedefine01 USE_GRPC +#cmakedefine01 USE_SIMDJSON +#cmakedefine01 USE_RAPIDJSON + #cmakedefine01 USE_DATASKETCHES #cmakedefine01 USE_YAML_CPP #cmakedefine01 CLICKHOUSE_SPLIT_BINARY diff --git a/src/Common/typeid_cast.h b/src/Common/typeid_cast.h index b656c6548ea..465de425532 100644 --- a/src/Common/typeid_cast.h +++ b/src/Common/typeid_cast.h @@ -25,7 +25,8 @@ namespace DB * In the rest, behaves like a dynamic_cast. 
*/ template -std::enable_if_t, To> typeid_cast(From & from) +requires std::is_reference_v +To typeid_cast(From & from) { try { @@ -43,7 +44,8 @@ std::enable_if_t, To> typeid_cast(From & from) template -std::enable_if_t, To> typeid_cast(From * from) +requires std::is_pointer_v +To typeid_cast(From * from) { try { @@ -60,7 +62,8 @@ std::enable_if_t, To> typeid_cast(From * from) template -std::enable_if_t, To> typeid_cast(const std::shared_ptr & from) +requires is_shared_ptr_v +To typeid_cast(const std::shared_ptr & from) { try { diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index f7d6761124f..5c93d6719fa 100644 --- a/src/Core/Block.cpp +++ b/src/Core/Block.cpp @@ -726,18 +726,6 @@ void convertToFullIfSparse(Block & block) column.column = recursiveRemoveSparse(column.column); } -ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & column) -{ - auto current_column = block.getByName(column.getNameInStorage()).column; - current_column = current_column->decompress(); - - if (column.isSubcolumn()) - return column.getTypeInStorage()->getSubcolumn(column.getSubcolumnName(), current_column); - - return current_column; -} - - Block materializeBlock(const Block & block) { if (!block) diff --git a/src/Core/Block.h b/src/Core/Block.h index d7b02f44f13..66e16b70f47 100644 --- a/src/Core/Block.h +++ b/src/Core/Block.h @@ -196,10 +196,6 @@ void getBlocksDifference(const Block & lhs, const Block & rhs, std::string & out void convertToFullIfSparse(Block & block); -/// Helps in-memory storages to extract columns from block. -/// Properly handles cases, when column is a subcolumn and when it is compressed. -ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & column); - /// Converts columns-constants to full columns ("materializes" them). 
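The requires-clause migration that runs through this patch (StringSearcher, IntervalTree, JSONBuilder, MultiEnum, DecimalComparison, typeid_cast) does not change behaviour; the constraint simply moves out of the template parameter list or return type. Below is a before/after sketch modeled on the pointer overload of typeid_cast; the enable_if text of the "before" form is elided in the hunk above, so its exact shape is an assumption.

```cpp
#include <type_traits>

// Before (assumed shape): the return type doubles as the SFINAE gate.
template <typename To, typename From>
std::enable_if_t<std::is_pointer_v<To>, To> typeid_cast_before(From * from);

// After: the constraint is stated with a requires-clause and the
// return type is written plainly, as in typeid_cast.h above.
template <typename To, typename From>
requires std::is_pointer_v<To>
To typeid_cast_after(From * from);
```

On a mismatch the pointer form returns nullptr and the reference form throws, so call sites typically branch on the pointer result; that contract is untouched by this rewrite.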
Block materializeBlock(const Block & block); void materializeBlockInplace(Block & block); diff --git a/src/Core/DecimalComparison.h b/src/Core/DecimalComparison.h index b8498b6c84b..530722a2519 100644 --- a/src/Core/DecimalComparison.h +++ b/src/Core/DecimalComparison.h @@ -115,8 +115,8 @@ private: } template - static std::enable_if_t && is_decimal, Shift> - getScales(const DataTypePtr & left_type, const DataTypePtr & right_type) + requires is_decimal && is_decimal + static Shift getScales(const DataTypePtr & left_type, const DataTypePtr & right_type) { const DataTypeDecimalBase * decimal0 = checkDecimalBase(*left_type); const DataTypeDecimalBase * decimal1 = checkDecimalBase(*right_type); @@ -137,8 +137,8 @@ private: } template - static std::enable_if_t && !is_decimal, Shift> - getScales(const DataTypePtr & left_type, const DataTypePtr &) + requires is_decimal && (!is_decimal) + static Shift getScales(const DataTypePtr & left_type, const DataTypePtr &) { Shift shift; const DataTypeDecimalBase * decimal0 = checkDecimalBase(*left_type); @@ -148,8 +148,8 @@ private: } template - static std::enable_if_t && is_decimal, Shift> - getScales(const DataTypePtr &, const DataTypePtr & right_type) + requires (!is_decimal) && is_decimal + static Shift getScales(const DataTypePtr &, const DataTypePtr & right_type) { Shift shift; const DataTypeDecimalBase * decimal1 = checkDecimalBase(*right_type); diff --git a/src/Core/Field.cpp b/src/Core/Field.cpp index 70a1458c9f0..2f37d2ea951 100644 --- a/src/Core/Field.cpp +++ b/src/Core/Field.cpp @@ -99,6 +99,12 @@ inline Field getBinaryValue(UInt8 type, ReadBuffer & buf) readBinary(value, buf); return value; } + case Field::Types::Object: + { + Object value; + readBinary(value, buf); + return value; + } case Field::Types::AggregateFunctionState: { AggregateFunctionStateData value; @@ -208,6 +214,40 @@ void writeText(const Map & x, WriteBuffer & buf) writeFieldText(Field(x), buf); } +void readBinary(Object & x, ReadBuffer & buf) +{ + size_t size; + readBinary(size, buf); + + for (size_t index = 0; index < size; ++index) + { + UInt8 type; + String key; + readBinary(type, buf); + readBinary(key, buf); + x[key] = getBinaryValue(type, buf); + } +} + +void writeBinary(const Object & x, WriteBuffer & buf) +{ + const size_t size = x.size(); + writeBinary(size, buf); + + for (const auto & [key, value] : x) + { + const UInt8 type = value.getType(); + writeBinary(type, buf); + writeBinary(key, buf); + Field::dispatch([&buf] (const auto & val) { FieldVisitorWriteBinary()(val, buf); }, value); + } +} + +void writeText(const Object & x, WriteBuffer & buf) +{ + writeFieldText(Field(x), buf); +} + template void readQuoted(DecimalField & x, ReadBuffer & buf) { diff --git a/src/Core/Field.h b/src/Core/Field.h index 9b830771c5f..2f4e648d379 100644 --- a/src/Core/Field.h +++ b/src/Core/Field.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -49,10 +50,22 @@ DEFINE_FIELD_VECTOR(Array); DEFINE_FIELD_VECTOR(Tuple); /// An array with the following structure: [(key1, value1), (key2, value2), ...] -DEFINE_FIELD_VECTOR(Map); +DEFINE_FIELD_VECTOR(Map); /// TODO: use map instead of vector. #undef DEFINE_FIELD_VECTOR +using FieldMap = std::map, AllocatorWithMemoryTracking>>; + +#define DEFINE_FIELD_MAP(X) \ +struct X : public FieldMap \ +{ \ + using FieldMap::FieldMap; \ +} + +DEFINE_FIELD_MAP(Object); + +#undef DEFINE_FIELD_MAP + struct AggregateFunctionStateData { String name; /// Name with arguments. 
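The readBinary/writeBinary pair for Object above defines a simple wire format: the entry count, then for each entry a type tag byte, the key, and the value serialized recursively through Field::dispatch. A round-trip sketch follows (editorial; the buffer helpers are the usual ClickHouse ones and their use here is assumed):

```cpp
#include <Core/Field.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteBufferFromString.h>
#include <cassert>

using namespace DB;

int main()
{
    Object obj;
    obj["id"] = UInt64(1);
    obj["name"] = String("alice");

    /// Writes the size, then a (type tag, key, value) triple per entry.
    WriteBufferFromOwnString out;
    writeBinary(obj, out);

    /// Reading the same bytes back restores an equal map.
    ReadBufferFromString in(out.str());
    Object restored;
    readBinary(restored, in);
    assert(restored == obj);
}
```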
@@ -219,6 +232,7 @@ template <> struct NearestFieldTypeImpl { using Type = String; }; template <> struct NearestFieldTypeImpl { using Type = Array; }; template <> struct NearestFieldTypeImpl { using Type = Tuple; }; template <> struct NearestFieldTypeImpl { using Type = Map; }; +template <> struct NearestFieldTypeImpl { using Type = Object; }; template <> struct NearestFieldTypeImpl { using Type = UInt64; }; template <> struct NearestFieldTypeImpl { using Type = Null; }; @@ -283,6 +297,7 @@ public: Map = 26, UUID = 27, Bool = 28, + Object = 29, }; }; @@ -472,6 +487,7 @@ public: case Types::Array: return get() < rhs.get(); case Types::Tuple: return get() < rhs.get(); case Types::Map: return get() < rhs.get(); + case Types::Object: return get() < rhs.get(); case Types::Decimal32: return get>() < rhs.get>(); case Types::Decimal64: return get>() < rhs.get>(); case Types::Decimal128: return get>() < rhs.get>(); @@ -510,6 +526,7 @@ public: case Types::Array: return get() <= rhs.get(); case Types::Tuple: return get() <= rhs.get(); case Types::Map: return get() <= rhs.get(); + case Types::Object: return get() <= rhs.get(); case Types::Decimal32: return get>() <= rhs.get>(); case Types::Decimal64: return get>() <= rhs.get>(); case Types::Decimal128: return get>() <= rhs.get>(); @@ -548,6 +565,7 @@ public: case Types::Array: return get() == rhs.get(); case Types::Tuple: return get() == rhs.get(); case Types::Map: return get() == rhs.get(); + case Types::Object: return get() == rhs.get(); case Types::UInt128: return get() == rhs.get(); case Types::UInt256: return get() == rhs.get(); case Types::Int128: return get() == rhs.get(); @@ -597,6 +615,7 @@ public: bool value = bool(field.template get()); return f(value); } + case Types::Object: return f(field.template get()); case Types::Decimal32: return f(field.template get>()); case Types::Decimal64: return f(field.template get>()); case Types::Decimal128: return f(field.template get>()); @@ -713,6 +732,9 @@ private: case Types::Map: destroy(); break; + case Types::Object: + destroy(); + break; case Types::AggregateFunctionState: destroy(); break; @@ -737,26 +759,27 @@ private: using Row = std::vector; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Null; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::UInt64; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::UInt128; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::UInt256; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Int64; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Int128; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Int256; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::UUID; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Float64; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::String; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Array; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Tuple; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Map; }; -template <> struct Field::TypeToEnum>{ static const Types::Which value = Types::Decimal32; }; -template <> struct Field::TypeToEnum>{ static const Types::Which value = Types::Decimal64; }; 
-template <> struct Field::TypeToEnum>{ static const Types::Which value = Types::Decimal128; }; -template <> struct Field::TypeToEnum>{ static const Types::Which value = Types::Decimal256; }; -template <> struct Field::TypeToEnum>{ static const Types::Which value = Types::Decimal64; }; -template <> struct Field::TypeToEnum{ static const Types::Which value = Types::AggregateFunctionState; }; -template <> struct Field::TypeToEnum{ static const Types::Which value = Types::Bool; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Null; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::UInt64; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::UInt128; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::UInt256; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Int64; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Int128; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Int256; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::UUID; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Float64; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::String; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Array; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Tuple; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Map; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Object; }; +template <> struct Field::TypeToEnum>{ static constexpr Types::Which value = Types::Decimal32; }; +template <> struct Field::TypeToEnum>{ static constexpr Types::Which value = Types::Decimal64; }; +template <> struct Field::TypeToEnum>{ static constexpr Types::Which value = Types::Decimal128; }; +template <> struct Field::TypeToEnum>{ static constexpr Types::Which value = Types::Decimal256; }; +template <> struct Field::TypeToEnum>{ static constexpr Types::Which value = Types::Decimal64; }; +template <> struct Field::TypeToEnum{ static constexpr Types::Which value = Types::AggregateFunctionState; }; +template <> struct Field::TypeToEnum{ static constexpr Types::Which value = Types::Bool; }; template <> struct Field::EnumToType { using Type = Null; }; template <> struct Field::EnumToType { using Type = UInt64; }; @@ -771,6 +794,7 @@ template <> struct Field::EnumToType { using Type = Strin template <> struct Field::EnumToType { using Type = Array; }; template <> struct Field::EnumToType { using Type = Tuple; }; template <> struct Field::EnumToType { using Type = Map; }; +template <> struct Field::EnumToType { using Type = Object; }; template <> struct Field::EnumToType { using Type = DecimalField; }; template <> struct Field::EnumToType { using Type = DecimalField; }; template <> struct Field::EnumToType { using Type = DecimalField; }; @@ -931,34 +955,39 @@ class WriteBuffer; /// It is assumed that all elements of the array have the same type. 
void readBinary(Array & x, ReadBuffer & buf); - [[noreturn]] inline void readText(Array &, ReadBuffer &) { throw Exception("Cannot read Array.", ErrorCodes::NOT_IMPLEMENTED); } [[noreturn]] inline void readQuoted(Array &, ReadBuffer &) { throw Exception("Cannot read Array.", ErrorCodes::NOT_IMPLEMENTED); } /// It is assumed that all elements of the array have the same type. /// Also write size and type into buf. UInt64 and Int64 are written in variadic size form void writeBinary(const Array & x, WriteBuffer & buf); - void writeText(const Array & x, WriteBuffer & buf); - [[noreturn]] inline void writeQuoted(const Array &, WriteBuffer &) { throw Exception("Cannot write Array quoted.", ErrorCodes::NOT_IMPLEMENTED); } void readBinary(Tuple & x, ReadBuffer & buf); - [[noreturn]] inline void readText(Tuple &, ReadBuffer &) { throw Exception("Cannot read Tuple.", ErrorCodes::NOT_IMPLEMENTED); } [[noreturn]] inline void readQuoted(Tuple &, ReadBuffer &) { throw Exception("Cannot read Tuple.", ErrorCodes::NOT_IMPLEMENTED); } void writeBinary(const Tuple & x, WriteBuffer & buf); - void writeText(const Tuple & x, WriteBuffer & buf); +[[noreturn]] inline void writeQuoted(const Tuple &, WriteBuffer &) { throw Exception("Cannot write Tuple quoted.", ErrorCodes::NOT_IMPLEMENTED); } void readBinary(Map & x, ReadBuffer & buf); [[noreturn]] inline void readText(Map &, ReadBuffer &) { throw Exception("Cannot read Map.", ErrorCodes::NOT_IMPLEMENTED); } [[noreturn]] inline void readQuoted(Map &, ReadBuffer &) { throw Exception("Cannot read Map.", ErrorCodes::NOT_IMPLEMENTED); } + void writeBinary(const Map & x, WriteBuffer & buf); void writeText(const Map & x, WriteBuffer & buf); [[noreturn]] inline void writeQuoted(const Map &, WriteBuffer &) { throw Exception("Cannot write Map quoted.", ErrorCodes::NOT_IMPLEMENTED); } +void readBinary(Object & x, ReadBuffer & buf); +[[noreturn]] inline void readText(Object &, ReadBuffer &) { throw Exception("Cannot read Object.", ErrorCodes::NOT_IMPLEMENTED); } +[[noreturn]] inline void readQuoted(Object &, ReadBuffer &) { throw Exception("Cannot read Object.", ErrorCodes::NOT_IMPLEMENTED); } + +void writeBinary(const Object & x, WriteBuffer & buf); +void writeText(const Object & x, WriteBuffer & buf); +[[noreturn]] inline void writeQuoted(const Object &, WriteBuffer &) { throw Exception("Cannot write Object quoted.", ErrorCodes::NOT_IMPLEMENTED); } + __attribute__ ((noreturn)) inline void writeText(const AggregateFunctionStateData &, WriteBuffer &) { // This probably doesn't make any sense, but we have to have it for @@ -977,8 +1006,6 @@ void readQuoted(DecimalField & x, ReadBuffer & buf); void writeFieldText(const Field & x, WriteBuffer & buf); -[[noreturn]] inline void writeQuoted(const Tuple &, WriteBuffer &) { throw Exception("Cannot write Tuple quoted.", ErrorCodes::NOT_IMPLEMENTED); } - String toString(const Field & x); } diff --git a/src/Core/MultiEnum.h b/src/Core/MultiEnum.h index 40cf166ccf7..1c013cc0dc1 100644 --- a/src/Core/MultiEnum.h +++ b/src/Core/MultiEnum.h @@ -53,7 +53,8 @@ struct MultiEnum return bitset; } - template >> + template + requires std::is_convertible_v void setValue(ValueType new_value) { // Can't set value from any enum, to avoid confusion @@ -66,7 +67,8 @@ return bitset == other.bitset; } - template >> + template + requires std::is_convertible_v bool operator==(ValueType other) const { // Shouldn't be comparable with any enum to avoid confusion @@ -80,13 +82,15 @@ return !(*this == other); } -  template >> +
template + requires std::is_convertible_v friend bool operator==(ValueType left, MultiEnum right) { return right.operator==(left); } - template >::type> + template + requires (!std::is_same_v) friend bool operator!=(L left, MultiEnum right) { return !(right.operator==(left)); diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 3403719d1dd..8ad29d59605 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -473,6 +473,7 @@ class IColumn; M(Bool, allow_experimental_geo_types, false, "Allow geo data types such as Point, Ring, Polygon, MultiPolygon", 0) \ M(Bool, data_type_default_nullable, false, "Data types without NULL or NOT NULL will make Nullable", 0) \ M(Bool, cast_keep_nullable, false, "CAST operator keep Nullable for result data type", 0) \ + M(Bool, cast_ipv4_ipv6_default_on_conversion_error, false, "CAST operator into IPv4, CAST operator into IPv6 type, toIPv4, toIPv6 functions will return a default value instead of throwing an exception on conversion error.", 0) \ M(Bool, alter_partition_verbose_result, false, "Output information about affected parts. Currently works only for FREEZE and ATTACH commands.", 0) \ M(Bool, allow_experimental_database_materialized_mysql, false, "Allow to create database with Engine=MaterializedMySQL(...).", 0) \ M(Bool, allow_experimental_database_materialized_postgresql, false, "Allow to create database with Engine=MaterializedPostgreSQL(...).", 0) \ @@ -492,6 +493,7 @@ class IColumn; M(Bool, force_optimize_projection, false, "If projection optimization is enabled, SELECT queries need to use projection", 0) \ M(Bool, async_socket_for_remote, true, "Asynchronously read from socket executing remote query", 0) \ M(Bool, insert_null_as_default, true, "Insert DEFAULT values instead of NULL in INSERT SELECT (UNION ALL)", 0) \ + M(Bool, describe_extend_object_types, false, "Deduce concrete type of columns of type Object in DESCRIBE query", 0) \ M(Bool, describe_include_subcolumns, false, "If true, subcolumns of all table columns will be included into result of DESCRIBE query", 0) \ \ M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) functions to the countIf() function when logically equivalent", 0) \ @@ -508,6 +510,7 @@ class IColumn; M(UInt64, database_replicated_initial_query_timeout_sec, 300, "How long initial DDL query should wait for Replicated database to process previous DDL queue entries", 0) \ M(UInt64, max_distributed_depth, 5, "Maximum distributed query depth", 0) \ M(Bool, database_replicated_always_detach_permanently, false, "Execute DETACH TABLE as DETACH TABLE PERMANENTLY if database engine is Replicated", 0) \ + M(Bool, database_replicated_allow_only_replicated_engine, false, "Allow creating only Replicated tables in a database with engine Replicated", 0) \ M(DistributedDDLOutputMode, distributed_ddl_output_mode, DistributedDDLOutputMode::THROW, "Format of distributed DDL query result", 0) \ M(UInt64, distributed_ddl_entry_format_version, 1, "Version of DDL entry to write into ZooKeeper", 0) \ \ @@ -567,6 +570,7 @@ class IColumn; /** Experimental functions */ \ M(Bool, allow_experimental_funnel_functions, false, "Enable experimental functions for funnel analysis.", 0) \ M(Bool, allow_experimental_nlp_functions, false, "Enable experimental functions for natural language processing.", 0) \ + M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ M(String, insert_deduplication_token, "", "If not empty, used for duplicate detection instead of data digest", 0) \ // End of
COMMON_SETTINGS // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS. diff --git a/src/Core/Types.h b/src/Core/Types.h index 9d3ff15d29c..92546d7d07a 100644 --- a/src/Core/Types.h +++ b/src/Core/Types.h @@ -87,6 +87,7 @@ enum class TypeIndex AggregateFunction, LowCardinality, Map, + Object, }; #if !defined(__clang__) #pragma GCC diagnostic pop diff --git a/src/Core/config_core.h.in b/src/Core/config_core.h.in index 5d37f8cf361..3fc2503aaa5 100644 --- a/src/Core/config_core.h.in +++ b/src/Core/config_core.h.in @@ -15,6 +15,8 @@ #cmakedefine01 USE_NURAFT #cmakedefine01 USE_NLP #cmakedefine01 USE_KRB5 +#cmakedefine01 USE_SIMDJSON +#cmakedefine01 USE_RAPIDJSON #cmakedefine01 USE_FILELOG #cmakedefine01 USE_ODBC #cmakedefine01 USE_REPLXX diff --git a/src/Core/iostream_debug_helpers.h b/src/Core/iostream_debug_helpers.h index 8aafe0b6c9c..e40bf74583e 100644 --- a/src/Core/iostream_debug_helpers.h +++ b/src/Core/iostream_debug_helpers.h @@ -7,7 +7,8 @@ namespace DB // Use template to disable implicit casting for certain overloaded types such as Field, which leads // to overload resolution ambiguity. class Field; -template >> +template +requires std::is_same_v std::ostream & operator<<(std::ostream & stream, const T & what); struct NameAndTypePair; diff --git a/src/DataTypes/CMakeLists.txt b/src/DataTypes/CMakeLists.txt index a6176efc7f3..4a60d6c54cf 100644 --- a/src/DataTypes/CMakeLists.txt +++ b/src/DataTypes/CMakeLists.txt @@ -1,3 +1,5 @@ +add_subdirectory (Serializations) + if (ENABLE_EXAMPLES) - add_subdirectory(examples) + add_subdirectory (examples) endif () diff --git a/src/DataTypes/DataTypeFactory.cpp b/src/DataTypes/DataTypeFactory.cpp index 582b42accd9..ce501f4333d 100644 --- a/src/DataTypes/DataTypeFactory.cpp +++ b/src/DataTypes/DataTypeFactory.cpp @@ -213,6 +213,7 @@ DataTypeFactory::DataTypeFactory() registerDataTypeDomainSimpleAggregateFunction(*this); registerDataTypeDomainGeo(*this); registerDataTypeMap(*this); + registerDataTypeObject(*this); } DataTypeFactory & DataTypeFactory::instance() diff --git a/src/DataTypes/DataTypeFactory.h b/src/DataTypes/DataTypeFactory.h index e7b638b6d7b..704d8926bf0 100644 --- a/src/DataTypes/DataTypeFactory.h +++ b/src/DataTypes/DataTypeFactory.h @@ -87,5 +87,6 @@ void registerDataTypeDomainIPv4AndIPv6(DataTypeFactory & factory); void registerDataTypeDomainBool(DataTypeFactory & factory); void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory); void registerDataTypeDomainGeo(DataTypeFactory & factory); +void registerDataTypeObject(DataTypeFactory & factory); } diff --git a/src/DataTypes/DataTypeObject.cpp b/src/DataTypes/DataTypeObject.cpp new file mode 100644 index 00000000000..9203c6764ea --- /dev/null +++ b/src/DataTypes/DataTypeObject.cpp @@ -0,0 +1,83 @@ +#include +#include +#include + +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int UNEXPECTED_AST_STRUCTURE; +} + +DataTypeObject::DataTypeObject(const String & schema_format_, bool is_nullable_) + : schema_format(Poco::toLower(schema_format_)) + , is_nullable(is_nullable_) + , default_serialization(getObjectSerialization(schema_format)) +{ +} + +bool DataTypeObject::equals(const IDataType & rhs) const +{ + if (const auto * object = typeid_cast(&rhs)) + return schema_format == object->schema_format && is_nullable == object->is_nullable; + return false; +} + +SerializationPtr 
DataTypeObject::doGetDefaultSerialization() const +{ + return default_serialization; +} + +String DataTypeObject::doGetName() const +{ + WriteBufferFromOwnString out; + if (is_nullable) + out << "Object(Nullable(" << quote << schema_format << "))"; + else + out << "Object(" << quote << schema_format << ")"; + return out.str(); +} + +static DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Object data type family must have one argument - name of schema format"); + + ASTPtr schema_argument = arguments->children[0]; + bool is_nullable = false; + + if (const auto * func = schema_argument->as()) + { + if (func->name != "Nullable" || func->arguments->children.size() != 1) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, + "Expected 'Nullable()' as parameter for type Object", func->name); + + schema_argument = func->arguments->children[0]; + is_nullable = true; + } + + const auto * literal = schema_argument->as(); + if (!literal || literal->value.getType() != Field::Types::String) + throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE, + "Object data type family must have a const string as its schema name parameter"); + + return std::make_shared(literal->value.get(), is_nullable); +} + +void registerDataTypeObject(DataTypeFactory & factory) +{ + factory.registerDataType("Object", create); + factory.registerSimpleDataType("JSON", + [] { return std::make_shared("JSON", false); }, + DataTypeFactory::CaseInsensitive); +} + +} diff --git a/src/DataTypes/DataTypeObject.h b/src/DataTypes/DataTypeObject.h new file mode 100644 index 00000000000..b4b31f0b8ea --- /dev/null +++ b/src/DataTypes/DataTypeObject.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +class DataTypeObject : public IDataType +{ +private: + String schema_format; + bool is_nullable; + SerializationPtr default_serialization; + +public: + DataTypeObject(const String & schema_format_, bool is_nullable_); + + const char * getFamilyName() const override { return "Object"; } + String doGetName() const override; + TypeIndex getTypeId() const override { return TypeIndex::Object; } + + MutableColumnPtr createColumn() const override { return ColumnObject::create(is_nullable); } + + Field getDefault() const override + { + throw Exception("Method getDefault() is not implemented for data type " + getName(), ErrorCodes::NOT_IMPLEMENTED); + } + + bool haveSubtypes() const override { return false; } + bool equals(const IDataType & rhs) const override; + bool isParametric() const override { return true; } + + SerializationPtr doGetDefaultSerialization() const override; + + bool hasNullableSubcolumns() const { return is_nullable; } +}; + +} diff --git a/src/DataTypes/FieldToDataType.cpp b/src/DataTypes/FieldToDataType.cpp index 8ca5ffac7c5..283d1b1e41a 100644 --- a/src/DataTypes/FieldToDataType.cpp +++ b/src/DataTypes/FieldToDataType.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -108,12 +109,11 @@ DataTypePtr FieldToDataType::operator() (const Array & x) const element_types.reserve(x.size()); for (const Field & elem : x) - element_types.emplace_back(applyVisitor(FieldToDataType(), elem)); + element_types.emplace_back(applyVisitor(FieldToDataType(allow_convertion_to_string), elem)); - return std::make_shared(getLeastSupertype(element_types)); + return 
std::make_shared(getLeastSupertype(element_types, allow_convertion_to_string)); } - DataTypePtr FieldToDataType::operator() (const Tuple & tuple) const { if (tuple.empty()) @@ -123,7 +123,7 @@ DataTypePtr FieldToDataType::operator() (const Tuple & tuple) const element_types.reserve(tuple.size()); for (const auto & element : tuple) - element_types.push_back(applyVisitor(FieldToDataType(), element)); + element_types.push_back(applyVisitor(FieldToDataType(allow_convertion_to_string), element)); return std::make_shared(element_types); } @@ -139,11 +139,19 @@ DataTypePtr FieldToDataType::operator() (const Map & map) const { const auto & tuple = elem.safeGet(); assert(tuple.size() == 2); - key_types.push_back(applyVisitor(FieldToDataType(), tuple[0])); - value_types.push_back(applyVisitor(FieldToDataType(), tuple[1])); + key_types.push_back(applyVisitor(FieldToDataType(allow_convertion_to_string), tuple[0])); + value_types.push_back(applyVisitor(FieldToDataType(allow_convertion_to_string), tuple[1])); } - return std::make_shared(getLeastSupertype(key_types), getLeastSupertype(value_types)); + return std::make_shared( + getLeastSupertype(key_types, allow_convertion_to_string), + getLeastSupertype(value_types, allow_convertion_to_string)); +} + +DataTypePtr FieldToDataType::operator() (const Object &) const +{ + /// TODO: Do we need different parameters for type Object? + return std::make_shared("json", false); } DataTypePtr FieldToDataType::operator() (const AggregateFunctionStateData & x) const diff --git a/src/DataTypes/FieldToDataType.h b/src/DataTypes/FieldToDataType.h index 72575c070f5..1922ac8b746 100644 --- a/src/DataTypes/FieldToDataType.h +++ b/src/DataTypes/FieldToDataType.h @@ -20,26 +20,34 @@ using DataTypePtr = std::shared_ptr; class FieldToDataType : public StaticVisitor { public: + FieldToDataType(bool allow_convertion_to_string_ = false) + : allow_convertion_to_string(allow_convertion_to_string_) + { + } + DataTypePtr operator() (const Null & x) const; DataTypePtr operator() (const UInt64 & x) const; DataTypePtr operator() (const UInt128 & x) const; - DataTypePtr operator() (const UInt256 & x) const; DataTypePtr operator() (const Int64 & x) const; DataTypePtr operator() (const Int128 & x) const; - DataTypePtr operator() (const Int256 & x) const; DataTypePtr operator() (const UUID & x) const; DataTypePtr operator() (const Float64 & x) const; DataTypePtr operator() (const String & x) const; DataTypePtr operator() (const Array & x) const; DataTypePtr operator() (const Tuple & tuple) const; DataTypePtr operator() (const Map & map) const; + DataTypePtr operator() (const Object & map) const; DataTypePtr operator() (const DecimalField & x) const; DataTypePtr operator() (const DecimalField & x) const; DataTypePtr operator() (const DecimalField & x) const; DataTypePtr operator() (const DecimalField & x) const; DataTypePtr operator() (const AggregateFunctionStateData & x) const; + DataTypePtr operator() (const UInt256 & x) const; + DataTypePtr operator() (const Int256 & x) const; DataTypePtr operator() (const bool & x) const; + +private: + bool allow_convertion_to_string; }; } - diff --git a/src/DataTypes/IDataType.cpp b/src/DataTypes/IDataType.cpp index edc9e4159f4..0976233c031 100644 --- a/src/DataTypes/IDataType.cpp +++ b/src/DataTypes/IDataType.cpp @@ -126,19 +126,25 @@ DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const { SubstreamData data = { getDefaultSerialization(), getPtr(), 
nullptr, nullptr }; - return getForSubcolumn(subcolumn_name, data, &SubstreamData::type); + return getForSubcolumn(subcolumn_name, data, &SubstreamData::type, true); } -SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const +ColumnPtr IDataType::tryGetSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const { - SubstreamData data = { serialization, nullptr, nullptr, nullptr }; - return getForSubcolumn(subcolumn_name, data, &SubstreamData::serialization); + SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr }; + return getForSubcolumn(subcolumn_name, data, &SubstreamData::column, false); } ColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const { SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr }; - return getForSubcolumn(subcolumn_name, data, &SubstreamData::column); + return getForSubcolumn(subcolumn_name, data, &SubstreamData::column, true); +} + +SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const +{ + SubstreamData data = { serialization, nullptr, nullptr, nullptr }; + return getForSubcolumn(subcolumn_name, data, &SubstreamData::serialization, true); } Names IDataType::getSubcolumnNames() const diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 36e1ce8ddd5..fc9e50dc55b 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -82,9 +82,11 @@ public: DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const; DataTypePtr getSubcolumnType(const String & subcolumn_name) const; - SerializationPtr getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const; + ColumnPtr tryGetSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const; ColumnPtr getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const; + SerializationPtr getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const; + using SubstreamData = ISerialization::SubstreamData; using SubstreamPath = ISerialization::SubstreamPath; @@ -309,7 +311,7 @@ private: const String & subcolumn_name, const SubstreamData & data, Ptr SubstreamData::*member, - bool throw_if_null = true) const; + bool throw_if_null) const; }; @@ -373,11 +375,13 @@ struct WhichDataType constexpr bool isMap() const {return idx == TypeIndex::Map; } constexpr bool isSet() const { return idx == TypeIndex::Set; } constexpr bool isInterval() const { return idx == TypeIndex::Interval; } + constexpr bool isObject() const { return idx == TypeIndex::Object; } constexpr bool isNothing() const { return idx == TypeIndex::Nothing; } constexpr bool isNullable() const { return idx == TypeIndex::Nullable; } constexpr bool isFunction() const { return idx == TypeIndex::Function; } constexpr bool isAggregateFunction() const { return idx == TypeIndex::AggregateFunction; } + constexpr bool isSimple() const { return isInt() || isUInt() || isFloat() || isString(); } constexpr bool isLowCarnality() const { return idx == TypeIndex::LowCardinality; } }; @@ -399,10 +403,16 @@ inline bool isEnum(const DataTypePtr & data_type) { return WhichDataType(data_ty inline bool isDecimal(const DataTypePtr & data_type) { return WhichDataType(data_type).isDecimal(); } inline bool isTuple(const DataTypePtr & data_type) { return WhichDataType(data_type).isTuple(); } inline bool 
isArray(const DataTypePtr & data_type) { return WhichDataType(data_type).isArray(); } -inline bool isMap(const DataTypePtr & data_type) { return WhichDataType(data_type).isMap(); } +inline bool isMap(const DataTypePtr & data_type) {return WhichDataType(data_type).isMap(); } inline bool isNothing(const DataTypePtr & data_type) { return WhichDataType(data_type).isNothing(); } inline bool isUUID(const DataTypePtr & data_type) { return WhichDataType(data_type).isUUID(); } +template +inline bool isObject(const T & data_type) +{ + return WhichDataType(data_type).isObject(); +} + template inline bool isUInt8(const T & data_type) { diff --git a/src/DataTypes/NestedUtils.cpp b/src/DataTypes/NestedUtils.cpp index b35a0713519..df504bc34a8 100644 --- a/src/DataTypes/NestedUtils.cpp +++ b/src/DataTypes/NestedUtils.cpp @@ -30,6 +30,12 @@ namespace Nested std::string concatenateName(const std::string & nested_table_name, const std::string & nested_field_name) { + if (nested_table_name.empty()) + return nested_field_name; + + if (nested_field_name.empty()) + return nested_table_name; + return nested_table_name + "." + nested_field_name; } diff --git a/src/DataTypes/ObjectUtils.cpp b/src/DataTypes/ObjectUtils.cpp new file mode 100644 index 00000000000..9004a5296e0 --- /dev/null +++ b/src/DataTypes/ObjectUtils.cpp @@ -0,0 +1,703 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int TYPE_MISMATCH; + extern const int LOGICAL_ERROR; + extern const int DUPLICATE_COLUMN; +} + +size_t getNumberOfDimensions(const IDataType & type) +{ + if (const auto * type_array = typeid_cast(&type)) + return type_array->getNumberOfDimensions(); + return 0; +} + +size_t getNumberOfDimensions(const IColumn & column) +{ + if (const auto * column_array = checkAndGetColumn(column)) + return column_array->getNumberOfDimensions(); + return 0; +} + +DataTypePtr getBaseTypeOfArray(const DataTypePtr & type) +{ + /// Get raw pointers to avoid extra copying of type pointers. + const DataTypeArray * last_array = nullptr; + const auto * current_type = type.get(); + while (const auto * type_array = typeid_cast(current_type)) + { + current_type = type_array->getNestedType().get(); + last_array = type_array; + } + + return last_array ? last_array->getNestedType() : type; +} + +ColumnPtr getBaseColumnOfArray(const ColumnPtr & column) +{ + /// Get raw pointers to avoid extra copying of column pointers. + const ColumnArray * last_array = nullptr; + const auto * current_column = column.get(); + while (const auto * column_array = checkAndGetColumn(current_column)) + { + current_column = &column_array->getData(); + last_array = column_array; + } + + return last_array ? 
last_array->getDataPtr() : column; +} + +DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions) +{ + for (size_t i = 0; i < num_dimensions; ++i) + type = std::make_shared(std::move(type)); + return type; +} + +ColumnPtr createArrayOfColumn(ColumnPtr column, size_t num_dimensions) +{ + for (size_t i = 0; i < num_dimensions; ++i) + column = ColumnArray::create(column); + return column; +} + +Array createEmptyArrayField(size_t num_dimensions) +{ + if (num_dimensions == 0) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create array field with 0 dimensions"); + + Array array; + Array * current_array = &array; + for (size_t i = 1; i < num_dimensions; ++i) + { + current_array->push_back(Array()); + current_array = &current_array->back().get(); + } + + return array; +} + +DataTypePtr getDataTypeByColumn(const IColumn & column) +{ + auto idx = column.getDataType(); + if (WhichDataType(idx).isSimple()) + return DataTypeFactory::instance().get(String(magic_enum::enum_name(idx))); + + if (const auto * column_array = checkAndGetColumn(&column)) + return std::make_shared(getDataTypeByColumn(column_array->getData())); + + if (const auto * column_nullable = checkAndGetColumn(&column)) + return makeNullable(getDataTypeByColumn(column_nullable->getNestedColumn())); + + /// TODO: add more types. + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get data type of column {}", column.getFamilyName()); +} + +template +static auto extractVector(const std::vector & vec) +{ + static_assert(I < std::tuple_size_v); + std::vector> res; + res.reserve(vec.size()); + for (const auto & elem : vec) + res.emplace_back(std::get(elem)); + return res; +} + +void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, const NamesAndTypesList & extended_storage_columns) +{ + std::unordered_map storage_columns_map; + for (const auto & [name, type] : extended_storage_columns) + storage_columns_map[name] = type; + + for (auto & name_type : columns_list) + { + if (!isObject(name_type.type)) + continue; + + auto & column = block.getByName(name_type.name); + if (!isObject(column.type)) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Type for column '{}' mismatch in columns list and in block. In list: {}, in block: {}", + name_type.name, name_type.type->getName(), column.type->getName()); + + const auto & column_object = assert_cast(*column.column); + const auto & subcolumns = column_object.getSubcolumns(); + + if (!column_object.isFinalized()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Cannot convert to tuple column '{}' from type {}. Column should be finalized first", + name_type.name, name_type.type->getName()); + + PathsInData tuple_paths; + DataTypes tuple_types; + Columns tuple_columns; + + for (const auto & entry : subcolumns) + { + tuple_paths.emplace_back(entry->path); + tuple_types.emplace_back(entry->data.getLeastCommonType()); + tuple_columns.emplace_back(entry->data.getFinalizedColumnPtr()); + } + + auto it = storage_columns_map.find(name_type.name); + if (it == storage_columns_map.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' not found in storage", name_type.name); + + std::tie(column.column, column.type) = unflattenTuple(tuple_paths, tuple_types, tuple_columns); + name_type.type = column.type; + + /// Check that constructed Tuple type and type in storage are compatible.
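To make the reshaping above concrete: each finalized subcolumn contributes one (path, type) pair, and unflattenTuple regroups the pairs by path prefix into a named Tuple. A standalone sketch of that grouping using only std types (not the patch's API; all names here are illustrative):

```cpp
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Illustrative stand-ins for PathsInData / DataTypes: each Object subcolumn
// contributes one (path, type-name) pair; unflattening groups them by the
// first path component into a nested "tuple" description.
int main()
{
    std::vector<std::pair<std::string, std::string>> subcolumns = {
        {"user.id", "Int64"}, {"user.name", "String"}, {"tags", "Array(String)"}};

    std::map<std::string, std::vector<std::string>> grouped;
    for (const auto & [path, type] : subcolumns)
    {
        auto dot = path.find('.');
        if (dot == std::string::npos)
            grouped[path].push_back(type);
        else
            grouped[path.substr(0, dot)].push_back(path.substr(dot + 1) + " " + type);
    }

    for (const auto & [key, elems] : grouped)  // prints the reassembled layout
    {
        std::cout << key << ": ";
        for (const auto & e : elems)
            std::cout << e << "; ";
        std::cout << "\n";
    }
}
```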
+ getLeastCommonTypeForObject({column.type, it->second}, true); + } +} + +static bool isPrefix(const PathInData::Parts & prefix, const PathInData::Parts & parts) +{ + if (prefix.size() > parts.size()) + return false; + + for (size_t i = 0; i < prefix.size(); ++i) + if (prefix[i].key != parts[i].key) + return false; + return true; +} + +void checkObjectHasNoAmbiguosPaths(const PathsInData & paths) +{ + size_t size = paths.size(); + for (size_t i = 0; i < size; ++i) + { + for (size_t j = 0; j < i; ++j) + { + if (isPrefix(paths[i].getParts(), paths[j].getParts()) + || isPrefix(paths[j].getParts(), paths[i].getParts())) + throw Exception(ErrorCodes::DUPLICATE_COLUMN, + "Data in Object has ambiguous paths: '{}' and '{}'", + paths[i].getPath(), paths[j].getPath()); + } + } +} + +DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool check_ambiguos_paths) +{ + if (types.empty()) + return nullptr; + + bool all_equal = true; + for (size_t i = 1; i < types.size(); ++i) + { + if (!types[i]->equals(*types[0])) + { + all_equal = false; + break; + } + } + + if (all_equal) + return types[0]; + + /// Types of subcolumns by path from all tuples. + std::unordered_map subcolumns_types; + + /// First we flatten tuples, then get common type for paths + /// and finally unflatten paths and create new tuple type. + for (const auto & type : types) + { + const auto * type_tuple = typeid_cast(type.get()); + if (!type_tuple) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Least common type for object can be deduced only from tuples, but {} given", type->getName()); + + auto [tuple_paths, tuple_types] = flattenTuple(type); + assert(tuple_paths.size() == tuple_types.size()); + + for (size_t i = 0; i < tuple_paths.size(); ++i) + subcolumns_types[tuple_paths[i]].push_back(tuple_types[i]); + } + + PathsInData tuple_paths; + DataTypes tuple_types; + + /// Get the least common type for all paths. 
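The per-path unification step below can be illustrated outside ClickHouse: collect the types seen for each path, then fold them with a supertype function. A hedged sketch with a toy leastSupertype (the real getLeastSupertype is far more complete; the rules here are invented for the example):

```cpp
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Toy stand-in for getLeastSupertype: numeric types widen, anything else
// falls back to String (mirroring allow_conversion_to_string = true).
static std::string leastSupertype(const std::string & a, const std::string & b)
{
    if (a == b)
        return a;
    if ((a == "Int64" && b == "Float64") || (a == "Float64" && b == "Int64"))
        return "Float64";
    return "String";
}

int main()
{
    // Types observed for the same path across several tuples.
    std::map<std::string, std::vector<std::string>> subcolumns_types = {
        {"k1", {"Int64", "Float64"}},
        {"k2", {"Int64", "String"}}};

    for (const auto & [path, types] : subcolumns_types)
    {
        std::string common = types.front();
        for (size_t i = 1; i < types.size(); ++i)
            common = leastSupertype(common, types[i]);
        std::cout << path << " -> " << common << "\n";  // k1 -> Float64, k2 -> String
    }
}
```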
+ for (const auto & [key, subtypes] : subcolumns_types) + { + assert(!subtypes.empty()); + if (key.getPath() == ColumnObject::COLUMN_NAME_DUMMY) + continue; + + size_t first_dim = getNumberOfDimensions(*subtypes[0]); + for (size_t i = 1; i < subtypes.size(); ++i) + if (first_dim != getNumberOfDimensions(*subtypes[i])) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Uncompatible types of subcolumn '{}': {} and {}", + key.getPath(), subtypes[0]->getName(), subtypes[i]->getName()); + + tuple_paths.emplace_back(key); + tuple_types.emplace_back(getLeastSupertype(subtypes, /*allow_conversion_to_string=*/ true)); + } + + if (tuple_paths.empty()) + { + tuple_paths.emplace_back(ColumnObject::COLUMN_NAME_DUMMY); + tuple_types.emplace_back(std::make_shared()); + } + + if (check_ambiguos_paths) + checkObjectHasNoAmbiguosPaths(tuple_paths); + + return unflattenTuple(tuple_paths, tuple_types); +} + +NameSet getNamesOfObjectColumns(const NamesAndTypesList & columns_list) +{ + NameSet res; + for (const auto & [name, type] : columns_list) + if (isObject(type)) + res.insert(name); + + return res; +} + +bool hasObjectColumns(const ColumnsDescription & columns) +{ + return std::any_of(columns.begin(), columns.end(), [](const auto & column) { return isObject(column.type); }); +} + +void extendObjectColumns(NamesAndTypesList & columns_list, const ColumnsDescription & object_columns, bool with_subcolumns) +{ + NamesAndTypesList subcolumns_list; + for (auto & column : columns_list) + { + auto object_column = object_columns.tryGetColumn(GetColumnsOptions::All, column.name); + if (object_column) + { + column.type = object_column->type; + + if (with_subcolumns) + subcolumns_list.splice(subcolumns_list.end(), object_columns.getSubcolumns(column.name)); + } + } + + columns_list.splice(columns_list.end(), std::move(subcolumns_list)); +} + +void updateObjectColumns(ColumnsDescription & object_columns, const NamesAndTypesList & new_columns) +{ + for (const auto & new_column : new_columns) + { + auto object_column = object_columns.tryGetColumn(GetColumnsOptions::All, new_column.name); + if (object_column && !object_column->type->equals(*new_column.type)) + { + object_columns.modify(new_column.name, [&](auto & column) + { + column.type = getLeastCommonTypeForObject({object_column->type, new_column.type}); + }); + } + } +} + +namespace +{ + +void flattenTupleImpl( + PathInDataBuilder & builder, + DataTypePtr type, + std::vector & new_paths, + DataTypes & new_types) +{ + if (const auto * type_tuple = typeid_cast(type.get())) + { + const auto & tuple_names = type_tuple->getElementNames(); + const auto & tuple_types = type_tuple->getElements(); + + for (size_t i = 0; i < tuple_names.size(); ++i) + { + builder.append(tuple_names[i], false); + flattenTupleImpl(builder, tuple_types[i], new_paths, new_types); + builder.popBack(); + } + } + else if (const auto * type_array = typeid_cast(type.get())) + { + PathInDataBuilder element_builder; + std::vector element_paths; + DataTypes element_types; + + flattenTupleImpl(element_builder, type_array->getNestedType(), element_paths, element_types); + assert(element_paths.size() == element_types.size()); + + for (size_t i = 0; i < element_paths.size(); ++i) + { + builder.append(element_paths[i], true); + new_paths.emplace_back(builder.getParts()); + new_types.emplace_back(std::make_shared(element_types[i])); + builder.popBack(element_paths[i].size()); + } + } + else + { + new_paths.emplace_back(builder.getParts()); + new_types.emplace_back(type); + } +} + +/// @offsets_columns are used as 
stack of array offsets and allows to recreate Array columns. +void flattenTupleImpl(const ColumnPtr & column, Columns & new_columns, Columns & offsets_columns) +{ + if (const auto * column_tuple = checkAndGetColumn(column.get())) + { + const auto & subcolumns = column_tuple->getColumns(); + for (const auto & subcolumn : subcolumns) + flattenTupleImpl(subcolumn, new_columns, offsets_columns); + } + else if (const auto * column_array = checkAndGetColumn(column.get())) + { + offsets_columns.push_back(column_array->getOffsetsPtr()); + flattenTupleImpl(column_array->getDataPtr(), new_columns, offsets_columns); + offsets_columns.pop_back(); + } + else + { + if (!offsets_columns.empty()) + { + auto new_column = ColumnArray::create(column, offsets_columns.back()); + for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it) + new_column = ColumnArray::create(new_column, *it); + + new_columns.push_back(std::move(new_column)); + } + else + { + new_columns.push_back(column); + } + } +} + +DataTypePtr reduceNumberOfDimensions(DataTypePtr type, size_t dimensions_to_reduce) +{ + while (dimensions_to_reduce--) + { + const auto * type_array = typeid_cast(type.get()); + if (!type_array) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Not enough dimensions to reduce"); + + type = type_array->getNestedType(); + } + + return type; +} + +ColumnPtr reduceNumberOfDimensions(ColumnPtr column, size_t dimensions_to_reduce) +{ + while (dimensions_to_reduce--) + { + const auto * column_array = typeid_cast(column.get()); + if (!column_array) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Not enough dimensions to reduce"); + + column = column_array->getDataPtr(); + } + + return column; +} + +/// We save intermediate column, type and number of array +/// dimensions for each intermediate node in path in subcolumns tree. +struct ColumnWithTypeAndDimensions +{ + ColumnPtr column; + DataTypePtr type; + size_t array_dimensions; +}; + +using SubcolumnsTreeWithColumns = SubcolumnsTree; +using Node = SubcolumnsTreeWithColumns::Node; + +/// Creates data type and column from tree of subcolumns. +ColumnWithTypeAndDimensions createTypeFromNode(const Node * node) +{ + auto collect_tuple_elemets = [](const auto & children) + { + std::vector> tuple_elements; + tuple_elements.reserve(children.size()); + for (const auto & [name, child] : children) + { + auto column = createTypeFromNode(child.get()); + tuple_elements.emplace_back(name, std::move(column)); + } + + /// Sort to always create the same type for the same set of subcolumns. + std::sort(tuple_elements.begin(), tuple_elements.end(), + [](const auto & lhs, const auto & rhs) { return std::get<0>(lhs) < std::get<0>(rhs); }); + + auto tuple_names = extractVector<0>(tuple_elements); + auto tuple_columns = extractVector<1>(tuple_elements); + + return std::make_tuple(std::move(tuple_names), std::move(tuple_columns)); + }; + + if (node->kind == Node::SCALAR) + { + return node->data; + } + else if (node->kind == Node::NESTED) + { + auto [tuple_names, tuple_columns] = collect_tuple_elemets(node->children); + + Columns offsets_columns; + offsets_columns.reserve(tuple_columns[0].array_dimensions + 1); + + /// If we have a Nested node and child node with anonymous array levels + /// we need to push a Nested type through all array levels. + /// Example: { "k1": [[{"k2": 1, "k3": 2}] } should be parsed as + /// `k1 Array(Nested(k2 Int, k3 Int))` and k1 is marked as Nested + /// and `k2` and `k3` has anonymous_array_level = 1 in that case. 
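A toy illustration of that comment (not the patch's code): the Nested type is rebuilt first, then wrapped in one Array() per anonymous array level carried by the children:

```cpp
#include <iostream>
#include <string>

// Toy version of the "push Nested through array levels" step above:
// one dimension comes from the Nested key itself, the rest from the
// anonymous_array_level of its children.
static std::string wrapInArrays(std::string type, size_t levels)
{
    for (size_t i = 0; i < levels; ++i)
        type = "Array(" + type + ")";
    return type;
}

int main()
{
    // {"k1": [[{"k2": 1, "k3": 2}]]}: k1 is Nested, k2/k3 have
    // anonymous_array_level = 1, so the result gains one extra Array().
    std::cout << "k1 " << wrapInArrays("Nested(k2 Int64, k3 Int64)", 1) << "\n";
    // prints: k1 Array(Nested(k2 Int64, k3 Int64))
}
```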
+ + const auto & current_array = assert_cast(*node->data.column); + offsets_columns.push_back(current_array.getOffsetsPtr()); + + auto first_column = tuple_columns[0].column; + for (size_t i = 0; i < tuple_columns[0].array_dimensions; ++i) + { + const auto & column_array = assert_cast(*first_column); + offsets_columns.push_back(column_array.getOffsetsPtr()); + first_column = column_array.getDataPtr(); + } + + size_t num_elements = tuple_columns.size(); + Columns tuple_elements_columns(num_elements); + DataTypes tuple_elements_types(num_elements); + + /// Reduce extra array dimensions to get columns and types of Nested elements. + for (size_t i = 0; i < num_elements; ++i) + { + assert(tuple_columns[i].array_dimensions == tuple_columns[0].array_dimensions); + tuple_elements_columns[i] = reduceNumberOfDimensions(tuple_columns[i].column, tuple_columns[i].array_dimensions); + tuple_elements_types[i] = reduceNumberOfDimensions(tuple_columns[i].type, tuple_columns[i].array_dimensions); + } + + auto result_column = ColumnArray::create(ColumnTuple::create(tuple_elements_columns), offsets_columns.back()); + auto result_type = createNested(tuple_elements_types, tuple_names); + + /// Recreate result Array type and Array column. + for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it) + { + result_column = ColumnArray::create(result_column, *it); + result_type = std::make_shared(result_type); + } + + return {result_column, result_type, tuple_columns[0].array_dimensions}; + } + else + { + auto [tuple_names, tuple_columns] = collect_tuple_elemets(node->children); + + size_t num_elements = tuple_columns.size(); + Columns tuple_elements_columns(num_elements); + DataTypes tuple_elements_types(num_elements); + + for (size_t i = 0; i < tuple_columns.size(); ++i) + { + assert(tuple_columns[i].array_dimensions == tuple_columns[0].array_dimensions); + tuple_elements_columns[i] = tuple_columns[i].column; + tuple_elements_types[i] = tuple_columns[i].type; + } + + auto result_column = ColumnTuple::create(tuple_elements_columns); + auto result_type = std::make_shared(tuple_elements_types, tuple_names); + + return {result_column, result_type, tuple_columns[0].array_dimensions}; + } +} + +} + +std::pair flattenTuple(const DataTypePtr & type) +{ + std::vector new_path_parts; + DataTypes new_types; + PathInDataBuilder builder; + + flattenTupleImpl(builder, type, new_path_parts, new_types); + + PathsInData new_paths(new_path_parts.begin(), new_path_parts.end()); + return {new_paths, new_types}; +} + +ColumnPtr flattenTuple(const ColumnPtr & column) +{ + Columns new_columns; + Columns offsets_columns; + + flattenTupleImpl(column, new_columns, offsets_columns); + return ColumnTuple::create(new_columns); +} + +DataTypePtr unflattenTuple(const PathsInData & paths, const DataTypes & tuple_types) +{ + assert(paths.size() == tuple_types.size()); + Columns tuple_columns; + tuple_columns.reserve(tuple_types.size()); + for (const auto & type : tuple_types) + tuple_columns.emplace_back(type->createColumn()); + + return unflattenTuple(paths, tuple_types, tuple_columns).second; +} + +std::pair unflattenTuple( + const PathsInData & paths, + const DataTypes & tuple_types, + const Columns & tuple_columns) +{ + assert(paths.size() == tuple_types.size()); + assert(paths.size() == tuple_columns.size()); + + /// We add all paths to the subcolumn tree and then create a type from it. + /// The tree stores column, type and number of array dimensions + /// for each intermediate node. 
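The tree in question is essentially a trie keyed by path components. A minimal standalone sketch of such a tree (illustrative only; the real SubcolumnsTree node also stores kind, column, type and array dimensions):

```cpp
#include <iostream>
#include <map>
#include <memory>
#include <sstream>
#include <string>

// Toy subcolumns tree: every path component becomes a node, so shared
// prefixes ("user.id", "user.name") end up under one intermediate node,
// the way unflattenTuple groups paths before building the Tuple type.
struct Node
{
    std::map<std::string, std::unique_ptr<Node>> children;
};

static void add(Node & root, const std::string & path)
{
    Node * current = &root;
    std::istringstream parts(path);
    for (std::string part; std::getline(parts, part, '.');)
    {
        auto & child = current->children[part];
        if (!child)
            child = std::make_unique<Node>();
        current = child.get();
    }
}

static void print(const Node & node, size_t indent)
{
    for (const auto & [name, child] : node.children)
    {
        std::cout << std::string(indent, ' ') << name << "\n";
        print(*child, indent + 2);
    }
}

int main()
{
    Node root;
    for (const std::string path : {"user.id", "user.name", "tags"})
        add(root, path);
    print(root, 0);  // user{id,name} share one intermediate node; tags is a leaf
}
```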
+ SubcolumnsTreeWithColumns tree; + + for (size_t i = 0; i < paths.size(); ++i) + { + auto column = tuple_columns[i]; + auto type = tuple_types[i]; + + const auto & parts = paths[i].getParts(); + size_t num_parts = parts.size(); + + size_t pos = 0; + tree.add(paths[i], [&](Node::Kind kind, bool exists) -> std::shared_ptr + { + if (pos >= num_parts) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Not enough name parts for path {}. Expected at least {}, got {}", + paths[i].getPath(), pos + 1, num_parts); + + size_t array_dimensions = kind == Node::NESTED ? 1 : parts[pos].anonymous_array_level; + ColumnWithTypeAndDimensions current_column{column, type, array_dimensions}; + + /// Get type and column for next node. + if (array_dimensions) + { + type = reduceNumberOfDimensions(type, array_dimensions); + column = reduceNumberOfDimensions(column, array_dimensions); + } + + ++pos; + if (exists) + return nullptr; + + return kind == Node::SCALAR + ? std::make_shared(kind, current_column, paths[i]) + : std::make_shared(kind, current_column); + }); + } + + auto [column, type, _] = createTypeFromNode(tree.getRoot()); + return std::make_pair(std::move(column), std::move(type)); +} + +static void addConstantToWithClause(const ASTPtr & query, const String & column_name, const DataTypePtr & data_type) +{ + auto & select = query->as(); + if (!select.with()) + select.setExpression(ASTSelectQuery::Expression::WITH, std::make_shared()); + + /// TODO: avoid materialize + auto node = makeASTFunction("materialize", + makeASTFunction("CAST", + std::make_shared(data_type->getDefault()), + std::make_shared(data_type->getName()))); + + node->alias = column_name; + node->prefer_alias_to_column_name = true; + select.with()->children.push_back(std::move(node)); +} + +/// @expected_columns and @available_columns contain descriptions +/// of extended Object columns. +void replaceMissedSubcolumnsByConstants( + const ColumnsDescription & expected_columns, + const ColumnsDescription & available_columns, + ASTPtr query) +{ + NamesAndTypes missed_names_types; + + /// Find all subcolumns that are in @expected_columns, but not in @available_columns. + for (const auto & column : available_columns) + { + auto expected_column = expected_columns.getColumn(GetColumnsOptions::All, column.name); + + /// Extract all paths from both descriptions to easily check existence of subcolumns. + auto [available_paths, available_types] = flattenTuple(column.type); + auto [expected_paths, expected_types] = flattenTuple(expected_column.type); + + auto extract_names_and_types = [&column](const auto & paths, const auto & types) + { + NamesAndTypes res; + res.reserve(paths.size()); + for (size_t i = 0; i < paths.size(); ++i) + { + auto full_name = Nested::concatenateName(column.name, paths[i].getPath()); + res.emplace_back(full_name, types[i]); + } + + std::sort(res.begin(), res.end()); + return res; + }; + + auto available_names_types = extract_names_and_types(available_paths, available_types); + auto expected_names_types = extract_names_and_types(expected_paths, expected_types); + + std::set_difference( + expected_names_types.begin(), expected_names_types.end(), + available_names_types.begin(), available_names_types.end(), + std::back_inserter(missed_names_types), + [](const auto & lhs, const auto & rhs) { return lhs.name < rhs.name; }); + } + + if (missed_names_types.empty()) + return; + + IdentifierNameSet identifiers; + query->collectIdentifierNames(identifiers); + + /// Replace missed subcolumns to default literals of theirs type. 
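The missed set driving the loop below is computed a few lines up with std::set_difference over two sorted name lists; a standalone sketch of that computation (names are made up):

```cpp
#include <algorithm>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

int main()
{
    // Both lists must be sorted, as in the function above.
    std::vector<std::string> expected = {"k1.k2", "k1.k3", "k4"};
    std::vector<std::string> available = {"k1.k2"};

    std::vector<std::string> missed;
    std::set_difference(
        expected.begin(), expected.end(),
        available.begin(), available.end(),
        std::back_inserter(missed));

    for (const auto & name : missed)
        std::cout << name << "\n";  // k1.k3 and k4 get default-value aliases
}
```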
+ for (const auto & [name, type] : missed_names_types) + if (identifiers.count(name)) + addConstantToWithClause(query, name, type); +} + +void finalizeObjectColumns(MutableColumns & columns) +{ + for (auto & column : columns) + if (auto * column_object = typeid_cast(column.get())) + column_object->finalize(); +} + +} diff --git a/src/DataTypes/ObjectUtils.h b/src/DataTypes/ObjectUtils.h new file mode 100644 index 00000000000..199a048c8cd --- /dev/null +++ b/src/DataTypes/ObjectUtils.h @@ -0,0 +1,140 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +/// Returns number of dimensions in Array type. 0 if type is not array. +size_t getNumberOfDimensions(const IDataType & type); + +/// Returns number of dimensions in Array column. 0 if column is not array. +size_t getNumberOfDimensions(const IColumn & column); + +/// Returns type of scalars of Array of arbitrary dimensions. +DataTypePtr getBaseTypeOfArray(const DataTypePtr & type); + +/// Returns Array type with requested scalar type and number of dimensions. +DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions); + +/// Returns column of scalars of Array of arbitrary dimensions. +ColumnPtr getBaseColumnOfArray(const ColumnPtr & column); + +/// Returns empty Array column with requested scalar column and number of dimensions. +ColumnPtr createArrayOfColumn(const ColumnPtr & column, size_t num_dimensions); + +/// Returns Array with requested number of dimensions and no scalars. +Array createEmptyArrayField(size_t num_dimensions); + +/// Tries to get data type by column. Only limited subset of types is supported +DataTypePtr getDataTypeByColumn(const IColumn & column); + +/// Converts Object types and columns to Tuples in @columns_list and @block +/// and checks that types are consistent with types in @extended_storage_columns. +void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, const NamesAndTypesList & extended_storage_columns); + +/// Checks that each path is not the prefix of any other path. +void checkObjectHasNoAmbiguosPaths(const PathsInData & paths); + +/// Receives several Tuple types and deduces the least common type among them. +DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool check_ambiguos_paths = false); + +/// Converts types of object columns to tuples in @columns_list +/// according to @object_columns and adds all tuple's subcolumns if needed. +void extendObjectColumns(NamesAndTypesList & columns_list, const ColumnsDescription & object_columns, bool with_subcolumns); + +NameSet getNamesOfObjectColumns(const NamesAndTypesList & columns_list); +bool hasObjectColumns(const ColumnsDescription & columns); +void finalizeObjectColumns(MutableColumns & columns); + +/// Updates types of objects in @object_columns inplace +/// according to types in new_columns. +void updateObjectColumns(ColumnsDescription & object_columns, const NamesAndTypesList & new_columns); + +using DataTypeTuplePtr = std::shared_ptr; + +/// Flattens nested Tuple to plain Tuple. I.e extracts all paths and types from tuple. +/// E.g. Tuple(t Tuple(c1 UInt32, c2 String), c3 UInt64) -> Tuple(t.c1 UInt32, t.c2 String, c3 UInt32) +std::pair flattenTuple(const DataTypePtr & type); + +/// Flattens nested Tuple column to plain Tuple column. +ColumnPtr flattenTuple(const ColumnPtr & column); + +/// The reverse operation to 'flattenTuple'. +/// Creates nested Tuple from all paths and types. +/// E.g. 
Tuple(t.c1 UInt32, t.c2 String, c3 UInt32) -> Tuple(t Tuple(c1 UInt32, c2 String), c3 UInt64) +DataTypePtr unflattenTuple( + const PathsInData & paths, + const DataTypes & tuple_types); + +std::pair unflattenTuple( + const PathsInData & paths, + const DataTypes & tuple_types, + const Columns & tuple_columns); + +/// For all columns which exist in @expected_columns and +/// don't exist in @available_columns adds to WITH clause +/// an alias with column name to literal of default value of column type. +void replaceMissedSubcolumnsByConstants( + const ColumnsDescription & expected_columns, + const ColumnsDescription & available_columns, + ASTPtr query); + +/// Receives range of objects, which contains collections +/// of columns-like objects (e.g. ColumnsDescription or NamesAndTypesList) +/// and deduces the common types of object columns for all entries. +/// @entry_columns_getter should extract reference to collection of +/// columns-like objects from entry to which Iterator points. +/// columns-like object should have fields "name" and "type". +template +ColumnsDescription getObjectColumns( + Iterator begin, Iterator end, + const ColumnsDescription & storage_columns, + EntryColumnsGetter && entry_columns_getter) +{ + ColumnsDescription res; + + if (begin == end) + { + for (const auto & column : storage_columns) + { + if (isObject(column.type)) + { + auto tuple_type = std::make_shared( + DataTypes{std::make_shared()}, + Names{ColumnObject::COLUMN_NAME_DUMMY}); + + res.add({column.name, std::move(tuple_type)}); + } + } + + return res; + } + + std::unordered_map types_in_entries; + + for (auto it = begin; it != end; ++it) + { + const auto & entry_columns = entry_columns_getter(*it); + for (const auto & column : entry_columns) + { + auto storage_column = storage_columns.tryGetPhysical(column.name); + if (storage_column && isObject(storage_column->type)) + types_in_entries[column.name].push_back(column.type); + } + } + + for (const auto & [name, types] : types_in_entries) + res.add({name, getLeastCommonTypeForObject(types)}); + + return res; +} + +} diff --git a/src/DataTypes/Serializations/CMakeLists.txt b/src/DataTypes/Serializations/CMakeLists.txt new file mode 100644 index 00000000000..65172356645 --- /dev/null +++ b/src/DataTypes/Serializations/CMakeLists.txt @@ -0,0 +1,3 @@ +if (ENABLE_TESTS) + add_subdirectory (tests) +endif () diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index 7df4a956c1a..512653ecb13 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -172,6 +172,10 @@ String getNameForSubstreamPath( else stream_name += "." + it->tuple_element_name; } + else if (it->type == Substream::ObjectElement) + { + stream_name += escapeForFileName(".") + escapeForFileName(it->object_key_name); + } } return stream_name; diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index 86d4eab289a..6c6b64f2416 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -125,6 +125,9 @@ public: SparseElements, SparseOffsets, + ObjectStructure, + ObjectElement, + Regular, }; @@ -133,6 +136,9 @@ public: /// Index of tuple element, starting at 1 or name. String tuple_element_name; + /// Name of subcolumn of object column. + String object_key_name; + /// Do we need to escape a dot in filenames for tuple elements. 
bool escape_tuple_delimiter = true; diff --git a/src/DataTypes/Serializations/JSONDataParser.h b/src/DataTypes/Serializations/JSONDataParser.h new file mode 100644 index 00000000000..76974b269fd --- /dev/null +++ b/src/DataTypes/Serializations/JSONDataParser.h @@ -0,0 +1,183 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +class ReadBuffer; +class WriteBuffer; + +template +static Field getValueAsField(const Element & element) +{ + if (element.isBool()) return element.getBool(); + if (element.isInt64()) return element.getInt64(); + if (element.isUInt64()) return element.getUInt64(); + if (element.isDouble()) return element.getDouble(); + if (element.isString()) return element.getString(); + if (element.isNull()) return Field(); + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unsupported type of JSON field"); +} + +template +class JSONDataParser +{ +public: + using Element = typename ParserImpl::Element; + + void readJSON(String & s, ReadBuffer & buf) + { + readJSONObjectPossiblyInvalid(s, buf); + } + + std::optional parse(const char * begin, size_t length) + { + std::string_view json{begin, length}; + Element document; + if (!parser.parse(json, document)) + return {}; + + ParseResult result; + PathInDataBuilder builder; + std::vector paths; + + traverse(document, builder, paths, result.values); + + result.paths.reserve(paths.size()); + for (auto && path : paths) + result.paths.emplace_back(std::move(path)); + + return result; + } + +private: + void traverse( + const Element & element, + PathInDataBuilder & builder, + std::vector & paths, + std::vector & values) + { + checkStackSize(); + + if (element.isObject()) + { + auto object = element.getObject(); + + paths.reserve(paths.size() + object.size()); + values.reserve(values.size() + object.size()); + + for (auto it = object.begin(); it != object.end(); ++it) + { + const auto & [key, value] = *it; + traverse(value, builder.append(key, false), paths, values); + builder.popBack(); + } + } + else if (element.isArray()) + { + auto array = element.getArray(); + + using PathPartsWithArray = std::pair; + using PathToArray = HashMapWithStackMemory; + + /// Traverse elements of array and collect an array + /// of fields by each path. + + PathToArray arrays_by_path; + Arena strings_pool; + + size_t current_size = 0; + for (auto it = array.begin(); it != array.end(); ++it) + { + std::vector element_paths; + std::vector element_values; + PathInDataBuilder element_builder; + + traverse(*it, element_builder, element_paths, element_values); + size_t size = element_paths.size(); + size_t keys_to_update = arrays_by_path.size(); + + for (size_t i = 0; i < size; ++i) + { + UInt128 hash = PathInData::getPartsHash(element_paths[i]); + if (auto * found = arrays_by_path.find(hash)) + { + auto & path_array = found->getMapped().second; + + assert(path_array.size() == current_size); + path_array.push_back(std::move(element_values[i])); + --keys_to_update; + } + else + { + /// We found a new key. Add and empty array with current size. + Array path_array; + path_array.reserve(array.size()); + path_array.resize(current_size); + path_array.push_back(std::move(element_values[i])); + + auto & elem = arrays_by_path[hash]; + elem.first = std::move(element_paths[i]); + elem.second = std::move(path_array); + } + } + + /// If some of the keys are missed in current element, + /// add default values for them. 
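The effect of this pass is easiest to see on a toy input: when array elements disagree on keys, every per-path array is padded so all of them keep equal lengths. A self-contained sketch (std types only, ints instead of Fields):

```cpp
#include <iostream>
#include <map>
#include <optional>
#include <string>
#include <vector>

int main()
{
    // Array of objects with diverging keys, e.g. [{"k":1},{"m":2}].
    std::vector<std::map<std::string, int>> elements = {{{"k", 1}}, {{"m", 2}}};

    // Per-path arrays; missing keys are padded so all arrays stay the
    // same length, like the keys_to_update pass above.
    std::map<std::string, std::vector<std::optional<int>>> arrays_by_path;
    size_t current_size = 0;
    for (const auto & element : elements)
    {
        for (const auto & [key, value] : element)
        {
            auto & arr = arrays_by_path[key];
            arr.resize(current_size);  // pad a newly seen key with nulls
            arr.push_back(value);
        }
        ++current_size;
        for (auto & [_, arr] : arrays_by_path)
            if (arr.size() < current_size)
                arr.push_back(std::nullopt);  // pad keys missing in this element
    }

    for (const auto & [key, arr] : arrays_by_path)
    {
        std::cout << key << ":";
        for (const auto & v : arr)
            std::cout << ' ' << (v ? std::to_string(*v) : "NULL");
        std::cout << "\n";  // k: 1 NULL / m: NULL 2
    }
}
```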
+ if (keys_to_update) + { + for (auto & [_, value] : arrays_by_path) + { + auto & path_array = value.second; + assert(path_array.size() == current_size || path_array.size() == current_size + 1); + if (path_array.size() == current_size) + path_array.push_back(Field()); + } + } + + ++current_size; + } + + if (arrays_by_path.empty()) + { + paths.push_back(builder.getParts()); + values.push_back(Array()); + } + else + { + paths.reserve(paths.size() + arrays_by_path.size()); + values.reserve(values.size() + arrays_by_path.size()); + + for (auto && [_, value] : arrays_by_path) + { + auto && [path, path_array] = value; + + /// Merge prefix path and path of array element. + paths.push_back(builder.append(path, true).getParts()); + values.push_back(std::move(path_array)); + + builder.popBack(path.size()); + } + } + } + else + { + paths.push_back(builder.getParts()); + values.push_back(getValueAsField(element)); + } + } + + ParserImpl parser; +}; + +} diff --git a/src/DataTypes/Serializations/PathInData.cpp b/src/DataTypes/Serializations/PathInData.cpp new file mode 100644 index 00000000000..9631138dce9 --- /dev/null +++ b/src/DataTypes/Serializations/PathInData.cpp @@ -0,0 +1,199 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace DB +{ + +PathInData::PathInData(std::string_view path_) + : path(path_) +{ + const char * begin = path.data(); + const char * end = path.data() + path.size(); + + for (const char * it = path.data(); it != end; ++it) + { + if (*it == '.') + { + size_t size = static_cast(it - begin); + parts.emplace_back(std::string_view{begin, size}, false, 0); + begin = it + 1; + } + } + + size_t size = static_cast(end - begin); + parts.emplace_back(std::string_view{begin, size}, false, 0.); +} + +PathInData::PathInData(const Parts & parts_) + : path(buildPath(parts_)) + , parts(buildParts(path, parts_)) +{ +} + +PathInData::PathInData(const PathInData & other) + : path(other.path) + , parts(buildParts(path, other.getParts())) +{ +} + +PathInData & PathInData::operator=(const PathInData & other) +{ + if (this != &other) + { + path = other.path; + parts = buildParts(path, other.parts); + } + return *this; +} + +UInt128 PathInData::getPartsHash(const Parts & parts_) +{ + SipHash hash; + hash.update(parts_.size()); + for (const auto & part : parts_) + { + hash.update(part.key.data(), part.key.length()); + hash.update(part.is_nested); + hash.update(part.anonymous_array_level); + } + + UInt128 res; + hash.get128(res); + return res; +} + +void PathInData::writeBinary(WriteBuffer & out) const +{ + writeVarUInt(parts.size(), out); + for (const auto & part : parts) + { + writeStringBinary(part.key, out); + writeVarUInt(part.is_nested, out); + writeVarUInt(part.anonymous_array_level, out); + } +} + +void PathInData::readBinary(ReadBuffer & in) +{ + size_t num_parts; + readVarUInt(num_parts, in); + + Arena arena; + Parts temp_parts; + temp_parts.reserve(num_parts); + + for (size_t i = 0; i < num_parts; ++i) + { + bool is_nested; + UInt8 anonymous_array_level; + + auto ref = readStringBinaryInto(arena, in); + readVarUInt(is_nested, in); + readVarUInt(anonymous_array_level, in); + + temp_parts.emplace_back(static_cast(ref), is_nested, anonymous_array_level); + } + + /// Recreate path and parts. 
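The reason for the rebuild: parts hold string_views into the path string, so views read into temporary storage must be re-anchored once the owning string exists, or they would dangle. A standalone sketch of the same two-step dance:

```cpp
#include <iostream>
#include <string>
#include <string_view>
#include <vector>

int main()
{
    // Keys as read into temporary storage, e.g. from a buffer.
    std::vector<std::string> temp_parts = {"k1", "k2"};

    // Step 1: rebuild the owning path string.
    std::string path;
    for (const auto & part : temp_parts)
    {
        if (!path.empty())
            path += '.';
        path += part;
    }

    // Step 2: re-anchor the views so they point into `path`.
    std::vector<std::string_view> parts;
    const char * begin = path.data();
    for (const auto & part : temp_parts)
    {
        parts.emplace_back(begin, part.size());
        begin += part.size() + 1;  // skip the dot separator
    }

    for (auto part : parts)
        std::cout << part << "\n";  // k1, k2 - both views into `path`
}
```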
+ path = buildPath(temp_parts); + parts = buildParts(path, temp_parts); +} + +String PathInData::buildPath(const Parts & other_parts) +{ + if (other_parts.empty()) + return ""; + + String res; + auto it = other_parts.begin(); + res += it->key; + ++it; + for (; it != other_parts.end(); ++it) + { + res += "."; + res += it->key; + } + + return res; +} + +PathInData::Parts PathInData::buildParts(const String & other_path, const Parts & other_parts) +{ + if (other_parts.empty()) + return {}; + + Parts res; + const char * begin = other_path.data(); + for (const auto & part : other_parts) + { + res.emplace_back(std::string_view{begin, part.key.length()}, part.is_nested, part.anonymous_array_level); + begin += part.key.length() + 1; + } + return res; +} + +size_t PathInData::Hash::operator()(const PathInData & value) const +{ + auto hash = getPartsHash(value.parts); + return hash.items[0] ^ hash.items[1]; +} + +PathInDataBuilder & PathInDataBuilder::append(std::string_view key, bool is_array) +{ + if (parts.empty()) + current_anonymous_array_level += is_array; + + if (!key.empty()) + { + if (!parts.empty()) + parts.back().is_nested = is_array; + + parts.emplace_back(key, false, current_anonymous_array_level); + current_anonymous_array_level = 0; + } + + return *this; +} + +PathInDataBuilder & PathInDataBuilder::append(const PathInData::Parts & path, bool is_array) +{ + if (parts.empty()) + current_anonymous_array_level += is_array; + + if (!path.empty()) + { + if (!parts.empty()) + parts.back().is_nested = is_array; + + auto it = parts.insert(parts.end(), path.begin(), path.end()); + for (; it != parts.end(); ++it) + it->anonymous_array_level += current_anonymous_array_level; + current_anonymous_array_level = 0; + } + + return *this; +} + +void PathInDataBuilder::popBack() +{ + parts.pop_back(); +} + +void PathInDataBuilder::popBack(size_t n) +{ + assert(n <= parts.size()); + parts.resize(parts.size() - n); +} + +} diff --git a/src/DataTypes/Serializations/PathInData.h b/src/DataTypes/Serializations/PathInData.h new file mode 100644 index 00000000000..35f6d10438d --- /dev/null +++ b/src/DataTypes/Serializations/PathInData.h @@ -0,0 +1,112 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +class ReadBuffer; +class WriteBuffer; + +/// Class that represents path in document, e.g. JSON. +class PathInData +{ +public: + struct Part + { + Part() = default; + Part(std::string_view key_, bool is_nested_, UInt8 anonymous_array_level_) + : key(key_), is_nested(is_nested_), anonymous_array_level(anonymous_array_level_) + { + } + + /// Name of part of path. + std::string_view key; + + /// If this part is Nested, i.e. element + /// related to this key is the array of objects. + bool is_nested = false; + + /// Number of array levels between current key and previous key. + /// E.g. in JSON {"k1": [[[{"k2": 1, "k3": 2}]]]} + /// "k1" is nested and has anonymous_array_level = 0. + /// "k2" and "k3" are not nested and have anonymous_array_level = 2. 
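Under this encoding, the total number of Array() layers around a leaf is the sum over its parts of anonymous_array_level plus one per Nested part. A tiny worked example (PartSketch is an illustrative stand-in for Part):

```cpp
#include <iostream>
#include <vector>

struct PartSketch
{
    bool is_nested;
    unsigned anonymous_array_level;
};

int main()
{
    // {"k1": [[[{"k2": 1}]]]}: "k1" is Nested (one dimension) and "k2"
    // carries two anonymous array levels, so reading the scalar at
    // "k1.k2" means unwrapping three Array() layers in total.
    std::vector<PartSketch> parts = {{true, 0}, {false, 2}};

    unsigned dimensions = 0;
    for (const auto & part : parts)
        dimensions += part.anonymous_array_level + (part.is_nested ? 1 : 0);

    std::cout << "dimensions: " << dimensions << "\n";  // dimensions: 3
}
```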
+ UInt8 anonymous_array_level = 0; + + bool operator==(const Part & other) const = default; + }; + + using Parts = std::vector; + + PathInData() = default; + explicit PathInData(std::string_view path_); + explicit PathInData(const Parts & parts_); + + PathInData(const PathInData & other); + PathInData & operator=(const PathInData & other); + + static UInt128 getPartsHash(const Parts & parts_); + + bool empty() const { return parts.empty(); } + + const String & getPath() const { return path; } + const Parts & getParts() const { return parts; } + + bool isNested(size_t i) const { return parts[i].is_nested; } + bool hasNested() const { return std::any_of(parts.begin(), parts.end(), [](const auto & part) { return part.is_nested; }); } + + void writeBinary(WriteBuffer & out) const; + void readBinary(ReadBuffer & in); + + bool operator==(const PathInData & other) const { return parts == other.parts; } + struct Hash { size_t operator()(const PathInData & value) const; }; + +private: + /// Creates full path from parts. + static String buildPath(const Parts & other_parts); + + /// Creates new parts from the full path with correct string pointers. + static Parts buildParts(const String & other_path, const Parts & other_parts); + + /// The full path. Parts are separated by dots. + String path; + + /// Parts of the path. All string_view-s in parts must point to the @path. + Parts parts; +}; + +class PathInDataBuilder +{ +public: + const PathInData::Parts & getParts() const { return parts; } + + PathInDataBuilder & append(std::string_view key, bool is_array); + PathInDataBuilder & append(const PathInData::Parts & path, bool is_array); + + void popBack(); + void popBack(size_t n); + +private: + PathInData::Parts parts; + + /// Number of array levels without a key to which the + /// next non-empty key will be nested. + /// Example: for JSON {"k1": [[{"k2": 1, "k3": 2}]]} + /// `k2` and `k3` have anonymous_array_level = 1 in that case. + size_t current_anonymous_array_level = 0; +}; + +using PathsInData = std::vector; + +/// Result of parsing a document. +/// Contains all paths extracted from document +/// and values which are related to them. +struct ParseResult +{ + std::vector paths; + std::vector values; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationObject.cpp b/src/DataTypes/Serializations/SerializationObject.cpp new file mode 100644 index 00000000000..f826478958c --- /dev/null +++ b/src/DataTypes/Serializations/SerializationObject.cpp @@ -0,0 +1,460 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; + extern const int INCORRECT_DATA; + extern const int CANNOT_READ_ALL_DATA; + extern const int LOGICAL_ERROR; +} + +namespace +{ + +/// Visitor that keeps @num_dimensions_to_keep dimensions in arrays +/// and replaces all scalars or nested arrays with @replacement at that level.
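A self-contained analogue of this visitor over a toy recursive value type (not the Field hierarchy): array sizes are preserved down to the kept depth, and everything below is replaced:

```cpp
#include <iostream>
#include <optional>
#include <vector>

// Toy analogue of the visitor above: a node is either a scalar (holds a
// value) or an array (holds children). Keep `keep` array levels, then
// replace whatever remains, scalar or deeper array, with `replacement`.
struct Value
{
    std::optional<int> scalar;    // set => scalar node
    std::vector<Value> children;  // used when scalar is empty
};

static Value replaceScalars(const Value & value, int replacement, size_t keep)
{
    if (value.scalar || keep == 0)
        return Value{replacement, {}};

    Value result;
    for (const auto & child : value.children)
        result.children.push_back(replaceScalars(child, replacement, keep - 1));
    return result;
}

static void print(const Value & value)
{
    if (value.scalar)
    {
        std::cout << *value.scalar << ' ';
        return;
    }
    std::cout << "[ ";
    for (const auto & child : value.children)
        print(child);
    std::cout << "] ";
}

int main()
{
    // [[1, 2], [3]] with keep = 2: array sizes survive, scalars become 0.
    Value nested{{}, {Value{{}, {Value{1, {}}, Value{2, {}}}}, Value{{}, {Value{3, {}}}}}};
    print(replaceScalars(nested, 0, 2));  // [ [ 0 0 ] [ 0 ] ]
    std::cout << "\n";
}
```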
+class FieldVisitorReplaceScalars : public StaticVisitor +{ +public: + FieldVisitorReplaceScalars(const Field & replacement_, size_t num_dimensions_to_keep_) + : replacement(replacement_), num_dimensions_to_keep(num_dimensions_to_keep_) + { + } + + template + Field operator()(const T & x) const + { + if constexpr (std::is_same_v) + { + if (num_dimensions_to_keep == 0) + return replacement; + + const size_t size = x.size(); + Array res(size); + for (size_t i = 0; i < size; ++i) + res[i] = applyVisitor(FieldVisitorReplaceScalars(replacement, num_dimensions_to_keep - 1), x[i]); + return res; + } + else + return replacement; + } + +private: + const Field & replacement; + size_t num_dimensions_to_keep; +}; + +using Node = typename ColumnObject::SubcolumnsTree::Node; + +/// Finds a subcolumn from the same Nested type as @entry and inserts +/// an array with default values with consistent sizes as in Nested type. +bool tryInsertDefaultFromNested( + std::shared_ptr entry, const ColumnObject::SubcolumnsTree & subcolumns) +{ + if (!entry->path.hasNested()) + return false; + + const Node * current_node = subcolumns.findLeaf(entry->path); + const Node * leaf = nullptr; + size_t num_skipped_nested = 0; + + while (current_node) + { + /// Try to find the first Nested up to the current node. + const auto * node_nested = subcolumns.findParent(current_node, + [](const auto & candidate) { return candidate.isNested(); }); + + if (!node_nested) + break; + + /// If there are no leaves, skip current node and find + /// the next node up to the current. + leaf = subcolumns.findLeaf(node_nested, + [&](const auto & candidate) + { + return candidate.data.size() == entry->data.size() + 1; + }); + + if (leaf) + break; + + current_node = node_nested->parent; + ++num_skipped_nested; + } + + if (!leaf) + return false; + + auto last_field = leaf->data.getLastField(); + if (last_field.isNull()) + return false; + + const auto & least_common_type = entry->data.getLeastCommonType(); + size_t num_dimensions = getNumberOfDimensions(*least_common_type); + assert(num_skipped_nested < num_dimensions); + + /// Replace scalars to default values with consistent array sizes. + size_t num_dimensions_to_keep = num_dimensions - num_skipped_nested; + auto default_scalar = num_skipped_nested + ? 
createEmptyArrayField(num_skipped_nested) + : getBaseTypeOfArray(least_common_type)->getDefault(); + + auto default_field = applyVisitor(FieldVisitorReplaceScalars(default_scalar, num_dimensions_to_keep), last_field); + entry->data.insert(std::move(default_field)); + + return true; +} + +} + +template +template +void SerializationObject::deserializeTextImpl(IColumn & column, Reader && reader) const +{ + auto & column_object = assert_cast(column); + + String buf; + reader(buf); + + auto result = parser.parse(buf.data(), buf.size()); + if (!result) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse object"); + + auto & [paths, values] = *result; + assert(paths.size() == values.size()); + + HashSet paths_set; + size_t column_size = column_object.size(); + + for (size_t i = 0; i < paths.size(); ++i) + { + auto field_info = getFieldInfo(values[i]); + if (isNothing(field_info.scalar_type)) + continue; + + if (!paths_set.insert(paths[i].getPath()).second) + throw Exception(ErrorCodes::INCORRECT_DATA, + "Object has ambiguous path: {}", paths[i].getPath()); + + if (!column_object.hasSubcolumn(paths[i])) + { + if (paths[i].hasNested()) + column_object.addNestedSubcolumn(paths[i], field_info, column_size); + else + column_object.addSubcolumn(paths[i], column_size); + } + + auto & subcolumn = column_object.getSubcolumn(paths[i]); + assert(subcolumn.size() == column_size); + + subcolumn.insert(std::move(values[i]), std::move(field_info)); + } + + /// Insert default values to missed subcolumns. + const auto & subcolumns = column_object.getSubcolumns(); + for (const auto & entry : subcolumns) + { + if (!paths_set.has(entry->path.getPath())) + { + bool inserted = tryInsertDefaultFromNested(entry, subcolumns); + if (!inserted) + entry->data.insertDefault(); + } + } + + column_object.incrementNumRows(); +} + +template +void SerializationObject::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + deserializeTextImpl(column, [&](String & s) { readStringInto(s, istr); }); +} + +template +void SerializationObject::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + deserializeTextImpl(column, [&](String & s) { readEscapedStringInto(s, istr); }); +} + +template +void SerializationObject::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + deserializeTextImpl(column, [&](String & s) { readQuotedStringInto(s, istr); }); +} + +template +void SerializationObject::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + deserializeTextImpl(column, [&](String & s) { parser.readJSON(s, istr); }); +} + +template +void SerializationObject::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextImpl(column, [&](String & s) { readCSVStringInto(s, istr, settings.csv); }); +} + +template +template +void SerializationObject::checkSerializationIsSupported(const TSettings & settings, const TStatePtr & state) const +{ + if (settings.position_independent_encoding) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "DataTypeObject doesn't support serialization with position independent encoding"); + + if (state) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "DataTypeObject doesn't support serialization with non-trivial state"); +} + +template +void SerializationObject::serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + 
checkSerializationIsSupported(settings, state); +} + +template +void SerializationObject::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + checkSerializationIsSupported(settings, state); +} + +template +void SerializationObject::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + checkSerializationIsSupported(settings, state); +} + +template +void SerializationObject::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + checkSerializationIsSupported(settings, state); + const auto & column_object = assert_cast(column); + + if (!column_object.isFinalized()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write non-finalized ColumnObject"); + + settings.path.push_back(Substream::ObjectStructure); + if (auto * stream = settings.getter(settings.path)) + writeVarUInt(column_object.getSubcolumns().size(), *stream); + + const auto & subcolumns = column_object.getSubcolumns(); + for (const auto & entry : subcolumns) + { + settings.path.back() = Substream::ObjectStructure; + settings.path.back().object_key_name = entry->path.getPath(); + + const auto & type = entry->data.getLeastCommonType(); + if (auto * stream = settings.getter(settings.path)) + { + entry->path.writeBinary(*stream); + writeStringBinary(type->getName(), *stream); + } + + settings.path.back() = Substream::ObjectElement; + if (auto * stream = settings.getter(settings.path)) + { + auto serialization = type->getDefaultSerialization(); + serialization->serializeBinaryBulkWithMultipleStreams( + entry->data.getFinalizedColumn(), offset, limit, settings, state); + } + } + + settings.path.pop_back(); +} + +template +void SerializationObject::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + checkSerializationIsSupported(settings, state); + if (!column->empty()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "DataTypeObject cannot be deserialized to non-empty column"); + + auto mutable_column = column->assumeMutable(); + auto & column_object = typeid_cast(*mutable_column); + + size_t num_subcolumns = 0; + settings.path.push_back(Substream::ObjectStructure); + if (auto * stream = settings.getter(settings.path)) + readVarUInt(num_subcolumns, *stream); + + settings.path.back() = Substream::ObjectElement; + for (size_t i = 0; i < num_subcolumns; ++i) + { + PathInData key; + String type_name; + + settings.path.back() = Substream::ObjectStructure; + if (auto * stream = settings.getter(settings.path)) + { + key.readBinary(*stream); + readStringBinary(type_name, *stream); + } + else + { + throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, + "Cannot read structure of DataTypeObject, because its stream is missing"); + } + + settings.path.back() = Substream::ObjectElement; + settings.path.back().object_key_name = key.getPath(); + + if (auto * stream = settings.getter(settings.path)) + { + auto type = DataTypeFactory::instance().get(type_name); + auto serialization = type->getDefaultSerialization(); + ColumnPtr subcolumn_data = type->createColumn(); + serialization->deserializeBinaryBulkWithMultipleStreams(subcolumn_data, limit, settings, state, cache); + column_object.addSubcolumn(key, 
subcolumn_data->assumeMutable()); + } + else + { + throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, + "Cannot read subcolumn '{}' of DataTypeObject, because its stream is missing", key.getPath()); + } + } + + settings.path.pop_back(); + column_object.checkConsistency(); + column_object.finalize(); + column = std::move(mutable_column); +} + +template +void SerializationObject::serializeBinary(const Field &, WriteBuffer &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject"); +} + +template +void SerializationObject::deserializeBinary(Field &, ReadBuffer &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject"); +} + +template +void SerializationObject::serializeBinary(const IColumn &, size_t, WriteBuffer &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject"); +} + +template +void SerializationObject::deserializeBinary(IColumn &, ReadBuffer &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject"); +} + +/// TODO: use format different of JSON in serializations. + +template +void SerializationObject::serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const auto & column_object = assert_cast(column); + const auto & subcolumns = column_object.getSubcolumns(); + + writeChar('{', ostr); + for (auto it = subcolumns.begin(); it != subcolumns.end(); ++it) + { + if (it != subcolumns.begin()) + writeCString(",", ostr); + + writeDoubleQuoted((*it)->path.getPath(), ostr); + writeChar(':', ostr); + + auto serialization = (*it)->data.getLeastCommonType()->getDefaultSerialization(); + serialization->serializeTextJSON((*it)->data.getFinalizedColumn(), row_num, ostr, settings); + } + writeChar('}', ostr); +} + +template +void SerializationObject::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeTextImpl(column, row_num, ostr, settings); +} + +template +void SerializationObject::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + WriteBufferFromOwnString ostr_str; + serializeTextImpl(column, row_num, ostr_str, settings); + writeEscapedString(ostr_str.str(), ostr); +} + +template +void SerializationObject::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + WriteBufferFromOwnString ostr_str; + serializeTextImpl(column, row_num, ostr_str, settings); + writeQuotedString(ostr_str.str(), ostr); +} + +template +void SerializationObject::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeTextImpl(column, row_num, ostr, settings); +} + +template +void SerializationObject::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + WriteBufferFromOwnString ostr_str; + serializeTextImpl(column, row_num, ostr_str, settings); + writeCSVString(ostr_str.str(), ostr); +} + +SerializationPtr getObjectSerialization(const String & schema_format) +{ + if (schema_format == "json") + { +#if USE_SIMDJSON + return std::make_shared>>(); +#elif USE_RAPIDJSON + return std::make_shared>>(); +#else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "To use data type Object with JSON format ClickHouse should be built with Simdjson or 
Rapidjson"); +#endif + } + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unknown schema format '{}'", schema_format); +} + +} diff --git a/src/DataTypes/Serializations/SerializationObject.h b/src/DataTypes/Serializations/SerializationObject.h new file mode 100644 index 00000000000..c91d467d5e1 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationObject.h @@ -0,0 +1,73 @@ +#pragma once + +#include + +namespace DB +{ + +/// Serialization for data type Object. +/// Supported only test serialization/deserialization. +/// and binary bulk serialization/deserialization without position independent +/// encoding, i.e. serialization/deserialization into Native format. +template +class SerializationObject : public ISerialization +{ +public: + void serializeBinaryBulkStatePrefix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + + void serializeBinary(const Field & field, WriteBuffer & ostr) const override; + void deserializeBinary(Field & field, ReadBuffer & istr) const override; + void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override; + void deserializeBinary(IColumn & column, ReadBuffer & istr) const override; + + void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + + void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + +private: + template + void checkSerializationIsSupported(const TSettings & settings, const TStatePtr & state) const; + + template + void deserializeTextImpl(IColumn & column, Reader && reader) const; + + void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const; + + mutable Parser parser; +}; + +SerializationPtr 
getObjectSerialization(const String & schema_format);
+
+}
diff --git a/src/DataTypes/Serializations/SubcolumnsTree.h b/src/DataTypes/Serializations/SubcolumnsTree.h
new file mode 100644
index 00000000000..64fc14ba834
--- /dev/null
+++ b/src/DataTypes/Serializations/SubcolumnsTree.h
@@ -0,0 +1,209 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+/// Tree that represents paths in a document
+/// with additional data in nodes.
+template <typename NodeData>
+class SubcolumnsTree
+{
+public:
+    struct Node
+    {
+        enum Kind
+        {
+            TUPLE,
+            NESTED,
+            SCALAR,
+        };
+
+        explicit Node(Kind kind_) : kind(kind_) {}
+        Node(Kind kind_, const NodeData & data_) : kind(kind_), data(data_) {}
+        Node(Kind kind_, const NodeData & data_, const PathInData & path_)
+            : kind(kind_), data(data_), path(path_) {}
+
+        Kind kind = TUPLE;
+        const Node * parent = nullptr;
+
+        std::map<String, std::shared_ptr<Node>, std::less<>> children;
+
+        NodeData data;
+        PathInData path;
+
+        bool isNested() const { return kind == NESTED; }
+        bool isScalar() const { return kind == SCALAR; }
+
+        void addChild(const String & key, std::shared_ptr<Node> next_node)
+        {
+            next_node->parent = this;
+            children[key] = std::move(next_node);
+        }
+    };
+
+    using NodeKind = typename Node::Kind;
+    using NodePtr = std::shared_ptr<Node>;
+
+    /// Add a leaf without any data in other nodes.
+    bool add(const PathInData & path, const NodeData & leaf_data)
+    {
+        return add(path, [&](NodeKind kind, bool exists) -> NodePtr
+        {
+            if (exists)
+                return nullptr;
+
+            if (kind == Node::SCALAR)
+                return std::make_shared<Node>(kind, leaf_data, path);
+
+            return std::make_shared<Node>(kind);
+        });
+    }
+
+    /// Callback for node creation. Receives the kind of the node and
+    /// a flag which is true if the node already exists.
+    using NodeCreator = std::function<NodePtr(NodeKind, bool)>;
+
+    bool add(const PathInData & path, const NodeCreator & node_creator)
+    {
+        const auto & parts = path.getParts();
+
+        if (parts.empty())
+            return false;
+
+        if (!root)
+            root = std::make_shared<Node>(Node::TUPLE);
+
+        Node * current_node = root.get();
+        for (size_t i = 0; i < parts.size() - 1; ++i)
+        {
+            assert(current_node->kind != Node::SCALAR);
+
+            auto it = current_node->children.find(parts[i].key);
+            if (it != current_node->children.end())
+            {
+                current_node = it->second.get();
+                node_creator(current_node->kind, true);
+
+                if (current_node->isNested() != parts[i].is_nested)
+                    return false;
+            }
+            else
+            {
+                auto next_kind = parts[i].is_nested ? Node::NESTED : Node::TUPLE;
+                auto next_node = node_creator(next_kind, false);
+                current_node->addChild(String(parts[i].key), next_node);
+                current_node = next_node.get();
+            }
+        }
+
+        auto it = current_node->children.find(parts.back().key);
+        if (it != current_node->children.end())
+            return false;
+
+        auto next_node = node_creator(Node::SCALAR, false);
+        current_node->addChild(String(parts.back().key), next_node);
+        leaves.push_back(std::move(next_node));
+
+        return true;
+    }
+
+    /// Find the node that matches the path best.
+    const Node * findBestMatch(const PathInData & path) const
+    {
+        return findImpl(path, false);
+    }
+
+    /// Find the node that matches the path exactly.
+    const Node * findExact(const PathInData & path) const
+    {
+        return findImpl(path, true);
+    }
+
+    /// Find a leaf by path.
+    const Node * findLeaf(const PathInData & path) const
+    {
+        const auto * candidate = findExact(path);
+        if (!candidate || !candidate->isScalar())
+            return nullptr;
+        return candidate;
+    }
+
+    using NodePredicate = std::function<bool(const Node &)>;
+
+    /// Finds a leaf that satisfies the predicate.
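The predicate-based lookup below (used by `tryInsertDefaultFromNested` above to locate a sibling leaf whose column size matches) is a plain depth-first search over the children maps. A hedged standalone sketch with a simplified node type (`ToyNode` and `findLeafDFS` are hypothetical illustrations, not the real `Node`):

```cpp
#include <cstdio>
#include <functional>
#include <map>
#include <memory>
#include <string>

struct ToyNode
{
    bool is_scalar = false;
    size_t data_size = 0;  /// stand-in for the NodeData payload
    std::map<std::string, std::shared_ptr<ToyNode>> children;
};

/// Depth-first search for the first leaf satisfying the predicate,
/// mirroring the static findLeaf overload below.
const ToyNode * findLeafDFS(const ToyNode * node, const std::function<bool(const ToyNode &)> & pred)
{
    if (!node)
        return nullptr;
    if (node->is_scalar)
        return pred(*node) ? node : nullptr;
    for (const auto & [key, child] : node->children)
        if (const auto * leaf = findLeafDFS(child.get(), pred))
            return leaf;
    return nullptr;
}

int main()
{
    ToyNode root;
    root.children["k1"] = std::make_shared<ToyNode>(ToyNode{true, 3, {}});
    root.children["k2"] = std::make_shared<ToyNode>(ToyNode{true, 5, {}});
    const auto * leaf = findLeafDFS(&root, [](const ToyNode & n) { return n.data_size == 5; });
    std::printf("%s\n", leaf ? "found k2's leaf" : "not found");
}
```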
+ const Node * findLeaf(const NodePredicate & predicate) + { + return findLeaf(root.get(), predicate); + } + + static const Node * findLeaf(const Node * node, const NodePredicate & predicate) + { + if (!node) + return nullptr; + + if (node->isScalar()) + return predicate(*node) ? node : nullptr; + + for (const auto & [_, child] : node->children) + if (const auto * leaf = findLeaf(child.get(), predicate)) + return leaf; + + return nullptr; + } + + /// Find first parent node that satisfies the predicate. + static const Node * findParent(const Node * node, const NodePredicate & predicate) + { + while (node && !predicate(*node)) + node = node->parent; + return node; + } + + bool empty() const { return root == nullptr; } + size_t size() const { return leaves.size(); } + + using Nodes = std::vector; + + const Nodes & getLeaves() const { return leaves; } + const Node * getRoot() const { return root.get(); } + + using iterator = typename Nodes::iterator; + using const_iterator = typename Nodes::const_iterator; + + iterator begin() { return leaves.begin(); } + iterator end() { return leaves.end(); } + + const_iterator begin() const { return leaves.begin(); } + const_iterator end() const { return leaves.end(); } + +private: + const Node * findImpl(const PathInData & path, bool find_exact) const + { + if (!root) + return nullptr; + + const auto & parts = path.getParts(); + const Node * current_node = root.get(); + + for (const auto & part : parts) + { + auto it = current_node->children.find(part.key); + if (it == current_node->children.end()) + return find_exact ? nullptr : current_node; + + current_node = it->second.get(); + } + + return current_node; + } + + NodePtr root; + Nodes leaves; +}; + +} diff --git a/src/DataTypes/Serializations/tests/CMakeLists.txt b/src/DataTypes/Serializations/tests/CMakeLists.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/DataTypes/Serializations/tests/gtest_json_parser.cpp b/src/DataTypes/Serializations/tests/gtest_json_parser.cpp new file mode 100644 index 00000000000..4dddb3cd03d --- /dev/null +++ b/src/DataTypes/Serializations/tests/gtest_json_parser.cpp @@ -0,0 +1,216 @@ +#include +#include +#include +#include + +#include +#include + +#if USE_SIMDJSON + +using namespace DB; + +const String json1 = R"({"k1" : 1, "k2" : {"k3" : "aa", "k4" : 2}})"; + +/// Nested(k2 String, k3 Nested(k4 String)) +const String json2 = +R"({"k1" : [ + { + "k2" : "aaa", + "k3" : [{ "k4" : "bbb" }, { "k4" : "ccc" }] + }, + { + "k2" : "ddd", + "k3" : [{ "k4" : "eee" }, { "k4" : "fff" }] + } + ] +})"; + +TEST(JSONDataParser, ReadJSON) +{ + { + String json_bad = json1 + "aaaaaaa"; + + JSONDataParser parser; + ReadBufferFromString buf(json_bad); + String res; + parser.readJSON(res, buf); + ASSERT_EQ(json1, res); + } + + { + String json_bad = json2 + "aaaaaaa"; + + JSONDataParser parser; + ReadBufferFromString buf(json_bad); + String res; + parser.readJSON(res, buf); + ASSERT_EQ(json2, res); + } +} + +struct JSONPathAndValue +{ + PathInData path; + Field value; + + JSONPathAndValue(const PathInData & path_, const Field & value_) + : path(path_), value(value_) + { + } + + bool operator==(const JSONPathAndValue & other) const = default; + bool operator<(const JSONPathAndValue & other) const { return path.getPath() < other.path.getPath(); } +}; + +static std::ostream & operator<<(std::ostream & ostr, const JSONPathAndValue & path_and_value) +{ + ostr << "{ PathInData{"; + bool first = true; + for (const auto & part : path_and_value.path.getParts()) + { + ostr << (first ? 
"{" : ", {") << part.key << ", " << part.is_nested << ", " << part.anonymous_array_level << "}"; + first = false; + } + + ostr << "}, Field{" << applyVisitor(FieldVisitorToString(), path_and_value.value) << "} }"; + return ostr; +} + +using JSONValues = std::vector; + +static void check( + const String & json_str, + const String & tag, + JSONValues expected_values) +{ + JSONDataParser parser; + auto res = parser.parse(json_str.data(), json_str.size()); + ASSERT_TRUE(res.has_value()) << tag; + + const auto & [paths, values] = *res; + + ASSERT_EQ(paths.size(), expected_values.size()) << tag; + ASSERT_EQ(values.size(), expected_values.size()) << tag; + + JSONValues result_values; + for (size_t i = 0; i < paths.size(); ++i) + result_values.emplace_back(paths[i], values[i]); + + std::sort(expected_values.begin(), expected_values.end()); + std::sort(result_values.begin(), result_values.end()); + + ASSERT_EQ(result_values, expected_values) << tag; +} + +TEST(JSONDataParser, Parse) +{ + { + check(json1, "json1", + { + { PathInData{{{"k1", false, 0}}}, 1 }, + { PathInData{{{"k2", false, 0}, {"k3", false, 0}}}, "aa" }, + { PathInData{{{"k2", false, 0}, {"k4", false, 0}}}, 2 }, + }); + } + + { + check(json2, "json2", + { + { PathInData{{{"k1", true, 0}, {"k2", false, 0}}}, Array{"aaa", "ddd"} }, + { PathInData{{{"k1", true, 0}, {"k3", true, 0}, {"k4", false, 0}}}, Array{Array{"bbb", "ccc"}, Array{"eee", "fff"}} }, + }); + } + + { + /// Nested(k2 Tuple(k3 Array(Int), k4 Array(Int)), k5 String) + const String json3 = + R"({"k1": [ + { + "k2": { + "k3": [1, 2], + "k4": [3, 4] + }, + "k5": "foo" + }, + { + "k2": { + "k3": [5, 6], + "k4": [7, 8] + }, + "k5": "bar" + } + ]})"; + + check(json3, "json3", + { + { PathInData{{{"k1", true, 0}, {"k5", false, 0}}}, Array{"foo", "bar"} }, + { PathInData{{{"k1", true, 0}, {"k2", false, 0}, {"k3", false, 0}}}, Array{Array{1, 2}, Array{5, 6}} }, + { PathInData{{{"k1", true, 0}, {"k2", false, 0}, {"k4", false, 0}}}, Array{Array{3, 4}, Array{7, 8}} }, + }); + } + + { + /// Nested(k2 Nested(k3 Int, k4 Int), k5 String) + const String json4 = + R"({"k1": [ + { + "k2": [{"k3": 1, "k4": 3}, {"k3": 2, "k4": 4}], + "k5": "foo" + }, + { + "k2": [{"k3": 5, "k4": 7}, {"k3": 6, "k4": 8}], + "k5": "bar" + } + ]})"; + + check(json4, "json4", + { + { PathInData{{{"k1", true, 0}, {"k5", false, 0}}}, Array{"foo", "bar"} }, + { PathInData{{{"k1", true, 0}, {"k2", true, 0}, {"k3", false, 0}}}, Array{Array{1, 2}, Array{5, 6}} }, + { PathInData{{{"k1", true, 0}, {"k2", true, 0}, {"k4", false, 0}}}, Array{Array{3, 4}, Array{7, 8}} }, + }); + } + + { + const String json5 = R"({"k1": [[1, 2, 3], [4, 5], [6]]})"; + check(json5, "json5", + { + { PathInData{{{"k1", false, 0}}}, Array{Array{1, 2, 3}, Array{4, 5}, Array{6}} } + }); + } + + { + /// Array(Nested(k2 Int, k3 Int)) + const String json6 = R"({ + "k1": [ + [{"k2": 1, "k3": 2}, {"k2": 3, "k3": 4}], + [{"k2": 5, "k3": 6}] + ] + })"; + + check(json6, "json6", + { + { PathInData{{{"k1", true, 0}, {"k2", false, 1}}}, Array{Array{1, 3}, Array{5}} }, + { PathInData{{{"k1", true, 0}, {"k3", false, 1}}}, Array{Array{2, 4}, Array{6}} }, + }); + } + + { + /// Nested(k2 Array(Int), k3 Array(Int)) + const String json7 = R"({ + "k1": [ + {"k2": [1, 3], "k3": [2, 4]}, + {"k2": [5], "k3": [6]} + ] + })"; + + check(json7, "json7", + { + { PathInData{{{"k1", true, 0}, {"k2", false, 0}}}, Array{Array{1, 3}, Array{5}} }, + { PathInData{{{"k1", true, 0}, {"k3", false, 0}}}, Array{Array{2, 4}, Array{6}} }, + }); + } +} + +#endif diff --git 
a/src/DataTypes/getLeastSupertype.cpp b/src/DataTypes/getLeastSupertype.cpp index 22f6a077504..3fcb3fef25b 100644 --- a/src/DataTypes/getLeastSupertype.cpp +++ b/src/DataTypes/getLeastSupertype.cpp @@ -18,6 +18,8 @@ #include #include #include +#include +#include namespace DB @@ -30,28 +32,181 @@ namespace ErrorCodes namespace { - String getExceptionMessagePrefix(const DataTypes & types) + +String typeToString(const DataTypePtr & type) { return type->getName(); } +String typeToString(const TypeIndex & type) { return String(magic_enum::enum_name(type)); } + +template +String getExceptionMessagePrefix(const DataTypes & types) +{ + WriteBufferFromOwnString res; + res << "There is no supertype for types "; + + bool first = true; + for (const auto & type : types) { - WriteBufferFromOwnString res; - res << "There is no supertype for types "; + if (!first) + res << ", "; + first = false; - bool first = true; - for (const auto & type : types) - { - if (!first) - res << ", "; - first = false; - - res << type->getName(); - } - - return res.str(); + res << typeToString(type); } + + return res.str(); } - -DataTypePtr getLeastSupertype(const DataTypes & types) +DataTypePtr getNumericType(const TypeIndexSet & types, bool allow_conversion_to_string) { + auto throw_or_return = [&](std::string_view message, int error_code) + { + if (allow_conversion_to_string) + return std::make_shared(); + + throw Exception(String(message), error_code); + }; + + bool all_numbers = true; + + size_t max_bits_of_signed_integer = 0; + size_t max_bits_of_unsigned_integer = 0; + size_t max_mantissa_bits_of_floating = 0; + + auto maximize = [](size_t & what, size_t value) + { + if (value > what) + what = value; + }; + + for (const auto & type : types) + { + if (type == TypeIndex::UInt8) + maximize(max_bits_of_unsigned_integer, 8); + else if (type == TypeIndex::UInt16) + maximize(max_bits_of_unsigned_integer, 16); + else if (type == TypeIndex::UInt32) + maximize(max_bits_of_unsigned_integer, 32); + else if (type == TypeIndex::UInt64) + maximize(max_bits_of_unsigned_integer, 64); + else if (type == TypeIndex::UInt128) + maximize(max_bits_of_unsigned_integer, 128); + else if (type == TypeIndex::UInt256) + maximize(max_bits_of_unsigned_integer, 256); + else if (type == TypeIndex::Int8 || type == TypeIndex::Enum8) + maximize(max_bits_of_signed_integer, 8); + else if (type == TypeIndex::Int16 || type == TypeIndex::Enum16) + maximize(max_bits_of_signed_integer, 16); + else if (type == TypeIndex::Int32) + maximize(max_bits_of_signed_integer, 32); + else if (type == TypeIndex::Int64) + maximize(max_bits_of_signed_integer, 64); + else if (type == TypeIndex::Int128) + maximize(max_bits_of_signed_integer, 128); + else if (type == TypeIndex::Int256) + maximize(max_bits_of_signed_integer, 256); + else if (type == TypeIndex::Float32) + maximize(max_mantissa_bits_of_floating, 24); + else if (type == TypeIndex::Float64) + maximize(max_mantissa_bits_of_floating, 53); + else + all_numbers = false; + } + + if (max_bits_of_signed_integer || max_bits_of_unsigned_integer || max_mantissa_bits_of_floating) + { + if (!all_numbers) + return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are numbers and some of them are not", ErrorCodes::NO_COMMON_TYPE); + + /// If there are signed and unsigned types of same bit-width, the result must be signed number with at least one more bit. + /// Example, common of Int32, UInt32 = Int64. 
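To make the promotion rule concrete, here is a hedged standalone sketch of the bit-width arithmetic implemented below (`commonIntegerBits` is a hypothetical helper for illustration, not part of this patch). It returns the minimal signed bit width, or 0 where the code below reports that no common type exists:

```cpp
#include <algorithm>
#include <cstdio>

/// Minimal integer bit width covering both kinds; 0 means "no common type",
/// mirroring the 64-bit cap below (no silent promotion to Int128/Int256).
size_t commonIntegerBits(size_t max_signed_bits, size_t max_unsigned_bits)
{
    size_t bits = std::max(max_signed_bits, max_unsigned_bits);
    if (max_signed_bits && max_unsigned_bits >= max_signed_bits)
    {
        if (bits == 64)
            return 0;  /// e.g. Int64 + UInt64: error instead of Int128
        ++bits;        /// e.g. Int32 + UInt32 -> 33 bits -> fits Int64
    }
    return bits;
}

int main()
{
    std::printf("%zu\n", commonIntegerBits(32, 32)); /// 33 -> Int64 (the example above)
    std::printf("%zu\n", commonIntegerBits(64, 32)); /// 64 -> Int64 already covers UInt32
    std::printf("%zu\n", commonIntegerBits(64, 64)); /// 0  -> NO_COMMON_TYPE
}
```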
+ + size_t min_bit_width_of_integer = std::max(max_bits_of_signed_integer, max_bits_of_unsigned_integer); + + /// If unsigned is not covered by signed. + if (max_bits_of_signed_integer && max_bits_of_unsigned_integer >= max_bits_of_signed_integer) //-V1051 + { + // Because 128 and 256 bit integers are significantly slower, we should not promote to them. + // But if we already have wide numbers, promotion is necessary. + if (min_bit_width_of_integer != 64) + ++min_bit_width_of_integer; + else + return throw_or_return( + getExceptionMessagePrefix(types) + + " because some of them are signed integers and some are unsigned integers," + " but there is no signed integer type, that can exactly represent all required unsigned integer values", + ErrorCodes::NO_COMMON_TYPE); + } + + /// If the result must be floating. + if (max_mantissa_bits_of_floating) + { + size_t min_mantissa_bits = std::max(min_bit_width_of_integer, max_mantissa_bits_of_floating); + if (min_mantissa_bits <= 24) + return std::make_shared(); + else if (min_mantissa_bits <= 53) + return std::make_shared(); + else + return throw_or_return(getExceptionMessagePrefix(types) + + " because some of them are integers and some are floating point," + " but there is no floating point type, that can exactly represent all required integers", ErrorCodes::NO_COMMON_TYPE); + } + + /// If the result must be signed integer. + if (max_bits_of_signed_integer) + { + if (min_bit_width_of_integer <= 8) + return std::make_shared(); + else if (min_bit_width_of_integer <= 16) + return std::make_shared(); + else if (min_bit_width_of_integer <= 32) + return std::make_shared(); + else if (min_bit_width_of_integer <= 64) + return std::make_shared(); + else if (min_bit_width_of_integer <= 128) + return std::make_shared(); + else if (min_bit_width_of_integer <= 256) + return std::make_shared(); + else + return throw_or_return(getExceptionMessagePrefix(types) + + " because some of them are signed integers and some are unsigned integers," + " but there is no signed integer type, that can exactly represent all required unsigned integer values", ErrorCodes::NO_COMMON_TYPE); + } + + /// All unsigned. 
+ { + if (min_bit_width_of_integer <= 8) + return std::make_shared(); + else if (min_bit_width_of_integer <= 16) + return std::make_shared(); + else if (min_bit_width_of_integer <= 32) + return std::make_shared(); + else if (min_bit_width_of_integer <= 64) + return std::make_shared(); + else if (min_bit_width_of_integer <= 128) + return std::make_shared(); + else if (min_bit_width_of_integer <= 256) + return std::make_shared(); + else + return throw_or_return("Logical error: " + getExceptionMessagePrefix(types) + + " but as all data types are unsigned integers, we must have found maximum unsigned integer type", ErrorCodes::NO_COMMON_TYPE); + + } + } + + return {}; +} + +} + +DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_conversion_to_string) +{ + auto throw_or_return = [&](std::string_view message, int error_code) + { + if (allow_conversion_to_string) + return std::make_shared(); + + throw Exception(String(message), error_code); + }; + /// Trivial cases if (types.empty()) @@ -88,7 +243,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types) non_nothing_types.emplace_back(type); if (non_nothing_types.size() < types.size()) - return getLeastSupertype(non_nothing_types); + return getLeastSupertype(non_nothing_types, allow_conversion_to_string); } /// For Arrays @@ -113,9 +268,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types) if (have_array) { if (!all_arrays) - throw Exception(getExceptionMessagePrefix(types) + " because some of them are Array and some of them are not", ErrorCodes::NO_COMMON_TYPE); + return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are Array and some of them are not", ErrorCodes::NO_COMMON_TYPE); - return std::make_shared(getLeastSupertype(nested_types)); + return std::make_shared(getLeastSupertype(nested_types, allow_conversion_to_string)); } } @@ -139,7 +294,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types) nested_types[elem_idx].reserve(types.size()); } else if (tuple_size != type_tuple->getElements().size()) - throw Exception(getExceptionMessagePrefix(types) + " because Tuples have different sizes", ErrorCodes::NO_COMMON_TYPE); + return throw_or_return(getExceptionMessagePrefix(types) + " because Tuples have different sizes", ErrorCodes::NO_COMMON_TYPE); have_tuple = true; @@ -153,11 +308,11 @@ DataTypePtr getLeastSupertype(const DataTypes & types) if (have_tuple) { if (!all_tuples) - throw Exception(getExceptionMessagePrefix(types) + " because some of them are Tuple and some of them are not", ErrorCodes::NO_COMMON_TYPE); + return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are Tuple and some of them are not", ErrorCodes::NO_COMMON_TYPE); DataTypes common_tuple_types(tuple_size); for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) - common_tuple_types[elem_idx] = getLeastSupertype(nested_types[elem_idx]); + common_tuple_types[elem_idx] = getLeastSupertype(nested_types[elem_idx], allow_conversion_to_string); return std::make_shared(common_tuple_types); } @@ -187,9 +342,11 @@ DataTypePtr getLeastSupertype(const DataTypes & types) if (have_maps) { if (!all_maps) - throw Exception(getExceptionMessagePrefix(types) + " because some of them are Maps and some of them are not", ErrorCodes::NO_COMMON_TYPE); + return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are Maps and some of them are not", ErrorCodes::NO_COMMON_TYPE); - return std::make_shared(getLeastSupertype(key_types), getLeastSupertype(value_types)); + return std::make_shared( + 
getLeastSupertype(key_types, allow_conversion_to_string), + getLeastSupertype(value_types, allow_conversion_to_string)); } } @@ -220,9 +377,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types) if (have_low_cardinality) { if (have_not_low_cardinality) - return getLeastSupertype(nested_types); + return getLeastSupertype(nested_types, allow_conversion_to_string); else - return std::make_shared(getLeastSupertype(nested_types)); + return std::make_shared(getLeastSupertype(nested_types, allow_conversion_to_string)); } } @@ -248,13 +405,13 @@ DataTypePtr getLeastSupertype(const DataTypes & types) if (have_nullable) { - return std::make_shared(getLeastSupertype(nested_types)); + return std::make_shared(getLeastSupertype(nested_types, allow_conversion_to_string)); } } /// Non-recursive rules - std::unordered_set type_ids; + TypeIndexSet type_ids; for (const auto & type : types) type_ids.insert(type->getTypeId()); @@ -268,7 +425,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types) { bool all_strings = type_ids.size() == (have_string + have_fixed_string); if (!all_strings) - throw Exception(getExceptionMessagePrefix(types) + " because some of them are String/FixedString and some of them are not", ErrorCodes::NO_COMMON_TYPE); + return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are String/FixedString and some of them are not", ErrorCodes::NO_COMMON_TYPE); return std::make_shared(); } @@ -285,7 +442,8 @@ DataTypePtr getLeastSupertype(const DataTypes & types) { bool all_date_or_datetime = type_ids.size() == (have_date + have_date32 + have_datetime + have_datetime64); if (!all_date_or_datetime) - throw Exception(getExceptionMessagePrefix(types) + " because some of them are Date/Date32/DateTime/DateTime64 and some of them are not", + return throw_or_return(getExceptionMessagePrefix(types) + + " because some of them are Date/Date32/DateTime/DateTime64 and some of them are not", ErrorCodes::NO_COMMON_TYPE); if (have_datetime64 == 0 && have_date32 == 0) @@ -362,7 +520,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types) } if (num_supported != type_ids.size()) - throw Exception(getExceptionMessagePrefix(types) + " because some of them have no lossless conversion to Decimal", + return throw_or_return(getExceptionMessagePrefix(types) + " because some of them have no lossless conversion to Decimal", ErrorCodes::NO_COMMON_TYPE); UInt32 max_scale = 0; @@ -385,7 +543,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types) } if (min_precision > DataTypeDecimal::maxPrecision()) - throw Exception(getExceptionMessagePrefix(types) + " because the least supertype is Decimal(" + return throw_or_return(getExceptionMessagePrefix(types) + " because the least supertype is Decimal(" + toString(min_precision) + ',' + toString(max_scale) + ')', ErrorCodes::NO_COMMON_TYPE); @@ -399,135 +557,56 @@ DataTypePtr getLeastSupertype(const DataTypes & types) /// For numeric types, the most complicated part. 
{ - bool all_numbers = true; - - size_t max_bits_of_signed_integer = 0; - size_t max_bits_of_unsigned_integer = 0; - size_t max_mantissa_bits_of_floating = 0; - - auto maximize = [](size_t & what, size_t value) - { - if (value > what) - what = value; - }; - - for (const auto & type : types) - { - if (typeid_cast(type.get())) - maximize(max_bits_of_unsigned_integer, 8); - else if (typeid_cast(type.get())) - maximize(max_bits_of_unsigned_integer, 16); - else if (typeid_cast(type.get())) - maximize(max_bits_of_unsigned_integer, 32); - else if (typeid_cast(type.get())) - maximize(max_bits_of_unsigned_integer, 64); - else if (typeid_cast(type.get())) - maximize(max_bits_of_unsigned_integer, 128); - else if (typeid_cast(type.get())) - maximize(max_bits_of_unsigned_integer, 256); - else if (typeid_cast(type.get()) || typeid_cast(type.get())) - maximize(max_bits_of_signed_integer, 8); - else if (typeid_cast(type.get()) || typeid_cast(type.get())) - maximize(max_bits_of_signed_integer, 16); - else if (typeid_cast(type.get())) - maximize(max_bits_of_signed_integer, 32); - else if (typeid_cast(type.get())) - maximize(max_bits_of_signed_integer, 64); - else if (typeid_cast(type.get())) - maximize(max_bits_of_signed_integer, 128); - else if (typeid_cast(type.get())) - maximize(max_bits_of_signed_integer, 256); - else if (typeid_cast(type.get())) - maximize(max_mantissa_bits_of_floating, 24); - else if (typeid_cast(type.get())) - maximize(max_mantissa_bits_of_floating, 53); - else - all_numbers = false; - } - - if (max_bits_of_signed_integer || max_bits_of_unsigned_integer || max_mantissa_bits_of_floating) - { - if (!all_numbers) - throw Exception(getExceptionMessagePrefix(types) + " because some of them are numbers and some of them are not", ErrorCodes::NO_COMMON_TYPE); - - /// If there are signed and unsigned types of same bit-width, the result must be signed number with at least one more bit. - /// Example, common of Int32, UInt32 = Int64. - - size_t min_bit_width_of_integer = std::max(max_bits_of_signed_integer, max_bits_of_unsigned_integer); - - /// If unsigned is not covered by signed. - if (max_bits_of_signed_integer && max_bits_of_unsigned_integer >= max_bits_of_signed_integer) //-V1051 - { - // Because 128 and 256 bit integers are significantly slower, we should not promote to them. - // But if we already have wide numbers, promotion is necessary. - if (min_bit_width_of_integer != 64) - ++min_bit_width_of_integer; - else - throw Exception( - getExceptionMessagePrefix(types) - + " because some of them are signed integers and some are unsigned integers," - " but there is no signed integer type, that can exactly represent all required unsigned integer values", - ErrorCodes::NO_COMMON_TYPE); - } - - /// If the result must be floating. - if (max_mantissa_bits_of_floating) - { - size_t min_mantissa_bits = std::max(min_bit_width_of_integer, max_mantissa_bits_of_floating); - if (min_mantissa_bits <= 24) - return std::make_shared(); - else if (min_mantissa_bits <= 53) - return std::make_shared(); - else - throw Exception(getExceptionMessagePrefix(types) - + " because some of them are integers and some are floating point," - " but there is no floating point type, that can exactly represent all required integers", ErrorCodes::NO_COMMON_TYPE); - } - - /// If the result must be signed integer. 
- if (max_bits_of_signed_integer) - { - if (min_bit_width_of_integer <= 8) - return std::make_shared(); - else if (min_bit_width_of_integer <= 16) - return std::make_shared(); - else if (min_bit_width_of_integer <= 32) - return std::make_shared(); - else if (min_bit_width_of_integer <= 64) - return std::make_shared(); - else if (min_bit_width_of_integer <= 128) - return std::make_shared(); - else if (min_bit_width_of_integer <= 256) - return std::make_shared(); - else - throw Exception(getExceptionMessagePrefix(types) - + " because some of them are signed integers and some are unsigned integers," - " but there is no signed integer type, that can exactly represent all required unsigned integer values", ErrorCodes::NO_COMMON_TYPE); - } - - /// All unsigned. - { - if (min_bit_width_of_integer <= 8) - return std::make_shared(); - else if (min_bit_width_of_integer <= 16) - return std::make_shared(); - else if (min_bit_width_of_integer <= 32) - return std::make_shared(); - else if (min_bit_width_of_integer <= 64) - return std::make_shared(); - else if (min_bit_width_of_integer <= 128) - return std::make_shared(); - else if (min_bit_width_of_integer <= 256) - return std::make_shared(); - else - throw Exception("Logical error: " + getExceptionMessagePrefix(types) - + " but as all data types are unsigned integers, we must have found maximum unsigned integer type", ErrorCodes::NO_COMMON_TYPE); - } - } + auto numeric_type = getNumericType(type_ids, allow_conversion_to_string); + if (numeric_type) + return numeric_type; } /// All other data types (UUID, AggregateFunction, Enum...) are compatible only if they are the same (checked in trivial cases). - throw Exception(getExceptionMessagePrefix(types), ErrorCodes::NO_COMMON_TYPE); + return throw_or_return(getExceptionMessagePrefix(types), ErrorCodes::NO_COMMON_TYPE); +} + +DataTypePtr getLeastSupertype(const TypeIndexSet & types, bool allow_conversion_to_string) +{ + auto throw_or_return = [&](std::string_view message, int error_code) + { + if (allow_conversion_to_string) + return std::make_shared(); + + throw Exception(String(message), error_code); + }; + + TypeIndexSet types_set; + for (const auto & type : types) + { + if (WhichDataType(type).isNothing()) + continue; + + if (!WhichDataType(type).isSimple()) + throw Exception(ErrorCodes::NO_COMMON_TYPE, + "Cannot get common type by type ids with parametric type {}", typeToString(type)); + + types_set.insert(type); + } + + if (types_set.empty()) + return std::make_shared(); + + if (types.count(TypeIndex::String)) + { + if (types.size() != 1) + return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are String and some of them are not", ErrorCodes::NO_COMMON_TYPE); + + return std::make_shared(); + } + + /// For numeric types, the most complicated part. + auto numeric_type = getNumericType(types, allow_conversion_to_string); + if (numeric_type) + return numeric_type; + + /// All other data types (UUID, AggregateFunction, Enum...) are compatible only if they are the same (checked in trivial cases). + return throw_or_return(getExceptionMessagePrefix(types), ErrorCodes::NO_COMMON_TYPE); } DataTypePtr tryGetLeastSupertype(const DataTypes & types) diff --git a/src/DataTypes/getLeastSupertype.h b/src/DataTypes/getLeastSupertype.h index c35ec7d722c..5444bb34d06 100644 --- a/src/DataTypes/getLeastSupertype.h +++ b/src/DataTypes/getLeastSupertype.h @@ -7,12 +7,16 @@ namespace DB { /** Get data type that covers all possible values of passed data types. 
- * If there is no such data type, throws an exception. + * If there is no such data type, throws an exception + * or if 'allow_conversion_to_string' is true returns String as common type. * * Examples: least common supertype for UInt8, Int8 - Int16. * Examples: there is no least common supertype for Array(UInt8), Int8. */ -DataTypePtr getLeastSupertype(const DataTypes & types); +DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_conversion_to_string = false); + +using TypeIndexSet = std::unordered_set; +DataTypePtr getLeastSupertype(const TypeIndexSet & types, bool allow_conversion_to_string = false); /// Same as above but return nullptr instead of throwing exception. DataTypePtr tryGetLeastSupertype(const DataTypes & types); diff --git a/src/Databases/PostgreSQL/DatabasePostgreSQL.cpp b/src/Databases/PostgreSQL/DatabasePostgreSQL.cpp index ce1ed98b977..a6b4a978c7b 100644 --- a/src/Databases/PostgreSQL/DatabasePostgreSQL.cpp +++ b/src/Databases/PostgreSQL/DatabasePostgreSQL.cpp @@ -406,13 +406,24 @@ ASTPtr DatabasePostgreSQL::getCreateTableQueryImpl(const String & table_name, Co ASTs storage_children = ast_storage->children; auto storage_engine_arguments = ast_storage->engine->arguments; - /// Remove extra engine argument (`schema` and `use_table_cache`) - if (storage_engine_arguments->children.size() >= 5) - storage_engine_arguments->children.resize(4); + if (storage_engine_arguments->children.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected number of arguments: {}", storage_engine_arguments->children.size()); - /// Add table_name to engine arguments - assert(storage_engine_arguments->children.size() >= 2); - storage_engine_arguments->children.insert(storage_engine_arguments->children.begin() + 2, std::make_shared(table_id.table_name)); + /// Check for named collection. + if (typeid_cast(storage_engine_arguments->children[0].get())) + { + storage_engine_arguments->children.push_back(makeASTFunction("equals", std::make_shared("table"), std::make_shared(table_id.table_name))); + } + else + { + /// Remove extra engine argument (`schema` and `use_table_cache`) + if (storage_engine_arguments->children.size() >= 5) + storage_engine_arguments->children.resize(4); + + /// Add table_name to engine arguments. 
+ if (storage_engine_arguments->children.size() >= 2) + storage_engine_arguments->children.insert(storage_engine_arguments->children.begin() + 2, std::make_shared(table_id.table_name)); + } return create_table_query; } diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp index c63b8453634..66e0538fef1 100644 --- a/src/Formats/JSONEachRowUtils.cpp +++ b/src/Formats/JSONEachRowUtils.cpp @@ -9,9 +9,9 @@ #include #include #include -#include -#include -#include +#include +#include +#include #include @@ -169,6 +169,10 @@ DataTypePtr getDataTypeFromJSONFieldImpl(const Element & field) value_type = type; } + + if (!value_type) + return nullptr; + return std::make_shared(std::make_shared(), value_type); } diff --git a/src/Formats/registerFormats.cpp b/src/Formats/registerFormats.cpp index b7b3b51cd7b..78851e5ebb0 100644 --- a/src/Formats/registerFormats.cpp +++ b/src/Formats/registerFormats.cpp @@ -74,6 +74,7 @@ void registerOutputFormatCapnProto(FormatFactory & factory); void registerInputFormatRegexp(FormatFactory & factory); void registerInputFormatJSONAsString(FormatFactory & factory); +void registerInputFormatJSONAsObject(FormatFactory & factory); void registerInputFormatLineAsString(FormatFactory & factory); void registerInputFormatCapnProto(FormatFactory & factory); @@ -84,6 +85,7 @@ void registerInputFormatHiveText(FormatFactory & factory); /// Non trivial prefix and suffix checkers for disabling parallel parsing. void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory); void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factory); +void registerNonTrivialPrefixAndSuffixCheckerJSONAsObject(FormatFactory & factory); void registerArrowSchemaReader(FormatFactory & factory); void registerParquetSchemaReader(FormatFactory & factory); @@ -175,6 +177,7 @@ void registerFormats() registerInputFormatRegexp(factory); registerInputFormatJSONAsString(factory); registerInputFormatLineAsString(factory); + registerInputFormatJSONAsObject(factory); #if USE_HIVE registerInputFormatHiveText(factory); #endif @@ -183,6 +186,7 @@ void registerFormats() registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(factory); registerNonTrivialPrefixAndSuffixCheckerJSONAsString(factory); + registerNonTrivialPrefixAndSuffixCheckerJSONAsObject(factory); registerArrowSchemaReader(factory); registerParquetSchemaReader(factory); diff --git a/src/Functions/CastOverloadResolver.h b/src/Functions/CastOverloadResolver.h index ffd5dda4af3..cff17d810fe 100644 --- a/src/Functions/CastOverloadResolver.h +++ b/src/Functions/CastOverloadResolver.h @@ -33,22 +33,27 @@ public: ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } - explicit CastOverloadResolverImpl(std::optional diagnostic_, bool keep_nullable_) - : diagnostic(std::move(diagnostic_)), keep_nullable(keep_nullable_) + explicit CastOverloadResolverImpl(std::optional diagnostic_, bool keep_nullable_, bool cast_ipv4_ipv6_default_on_conversion_error_) + : diagnostic(std::move(diagnostic_)) + , keep_nullable(keep_nullable_) + , cast_ipv4_ipv6_default_on_conversion_error(cast_ipv4_ipv6_default_on_conversion_error_) { } static FunctionOverloadResolverPtr create(ContextPtr context) { + const auto & settings_ref = context->getSettingsRef(); + if constexpr (internal) - return createImpl(); - return createImpl({}, context->getSettingsRef().cast_keep_nullable); + return createImpl({}, false /*keep_nullable*/, false /*cast_ipv4_ipv6_default_on_conversion_error*/); + + return createImpl({}, 
settings_ref.cast_keep_nullable, settings_ref.cast_ipv4_ipv6_default_on_conversion_error); } - static FunctionOverloadResolverPtr createImpl(std::optional diagnostic = {}, bool keep_nullable = false) + static FunctionOverloadResolverPtr createImpl(std::optional diagnostic = {}, bool keep_nullable = false, bool cast_ipv4_ipv6_default_on_conversion_error = false) { assert(!internal || !keep_nullable); - return std::make_unique(std::move(diagnostic), keep_nullable); + return std::make_unique(std::move(diagnostic), keep_nullable, cast_ipv4_ipv6_default_on_conversion_error); } protected: @@ -61,7 +66,7 @@ protected: data_types[i] = arguments[i].type; auto monotonicity = MonotonicityHelper::getMonotonicityInformation(arguments.front().type, return_type.get()); - return std::make_unique>(name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type); + return std::make_unique>(name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type, cast_ipv4_ipv6_default_on_conversion_error); } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override @@ -98,6 +103,7 @@ protected: private: std::optional diagnostic; bool keep_nullable; + bool cast_ipv4_ipv6_default_on_conversion_error; }; @@ -115,7 +121,10 @@ struct CastInternalOverloadName static constexpr auto accurate_cast_or_null_name = "accurate_CastOrNull"; }; -template using CastOverloadResolver = CastOverloadResolverImpl; -template using CastInternalOverloadResolver = CastOverloadResolverImpl; +template +using CastOverloadResolver = CastOverloadResolverImpl; + +template +using CastInternalOverloadResolver = CastOverloadResolverImpl; } diff --git a/src/Functions/FunctionSQLJSON.h b/src/Functions/FunctionSQLJSON.h index 56d29e0c776..e45951e3ec5 100644 --- a/src/Functions/FunctionSQLJSON.h +++ b/src/Functions/FunctionSQLJSON.h @@ -8,13 +8,13 @@ #include #include #include -#include +#include #include #include #include #include -#include -#include +#include +#include #include #include #include diff --git a/src/Functions/FunctionsCharsetClassification.cpp b/src/Functions/FunctionsCharsetClassification.cpp index d29dc14fa9f..af6c1de2768 100644 --- a/src/Functions/FunctionsCharsetClassification.cpp +++ b/src/Functions/FunctionsCharsetClassification.cpp @@ -8,26 +8,21 @@ namespace DB { -/* Determine language and charset of text data. For each text, we build the distribution of bigrams bytes. - * Then we use marked-up dictionaries with distributions of bigram bytes of various languages ​​and charsets. - * Using a naive Bayesian classifier, find the most likely charset and language and return it - */ - -template -struct CharsetClassificationImpl +namespace { /* We need to solve zero-frequency problem for Naive Bayes Classifier * If the bigram is not found in the text, we assume that the probability of its meeting is 1e-06. * 1e-06 is minimal value in our marked-up dictionary. */ - static constexpr Float64 zero_frequency = 1e-06; + constexpr Float64 zero_frequency = 1e-06; /// If the data size is bigger than this, behaviour is unspecified for this function. 
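For intuition, a hedged standalone sketch of the classifier's two steps: counting 2-byte bigrams and scoring them against a per-charset frequency map with the 1e-06 zero-frequency floor. The shift-and-add bigram hash is an assumption (the real loop body is elided by the hunk boundary below), and the plain `unordered_map`s merely stand in for `FrequencyHolder` and the `HashMap` types:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <unordered_map>

using Model = std::unordered_map<uint16_t, uint64_t>;

/// Count 2-byte bigrams; the first byte forms a degenerate one-byte "bigram".
Model countBigrams(const uint8_t * data, size_t size)
{
    Model model;
    uint16_t hash = 0;
    for (size_t i = 0; i < size; ++i)
    {
        hash = static_cast<uint16_t>((hash << 8) + data[i]);
        ++model[hash];
    }
    return model;
}

/// Log-likelihood of the text under one charset's bigram distribution;
/// bigrams missing from the dictionary get the 1e-06 zero-frequency floor.
double naiveBayesScore(const std::unordered_map<uint16_t, double> & charset_freq, const Model & model)
{
    constexpr double zero_frequency = 1e-06;
    double res = 0;
    for (const auto & [bigram, count] : model)
    {
        auto it = charset_freq.find(bigram);
        res += count * std::log(it != charset_freq.end() ? it->second : zero_frequency);
    }
    return res;
}

int main()
{
    const uint8_t text[] = "abab";
    Model model = countBigrams(text, 4);
    std::unordered_map<uint16_t, double> latin{{('a' << 8) + 'b', 0.01}, {('b' << 8) + 'a', 0.01}};
    std::printf("%f\n", naiveBayesScore(latin, model)); /// the winning charset maximizes this
}
```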
-    static constexpr size_t max_string_size = 1u << 15;
+    constexpr size_t max_string_size = 1UL << 15;
 
-    static ALWAYS_INLINE inline Float64 naiveBayes(
+    template <typename ModelMap>
+    ALWAYS_INLINE inline Float64 naiveBayes(
         const FrequencyHolder::EncodingMap & standard,
-        const HashMap & model,
+        const ModelMap & model,
         Float64 max_result)
     {
         Float64 res = 0;
@@ -52,10 +47,11 @@ struct CharsetClassificationImpl
     }
 
     /// Count how many times each bigram occurs in the text.
-    static ALWAYS_INLINE inline void calculateStats(
+    template <typename ModelMap>
+    ALWAYS_INLINE inline void calculateStats(
         const UInt8 * data,
         const size_t size,
-        HashMap & model)
+        ModelMap & model)
     {
         UInt16 hash = 0;
         for (size_t i = 0; i < size; ++i)
@@ -65,7 +61,15 @@ struct CharsetClassificationImpl
             ++model[hash];
         }
     }
+}
 
+/* Determine language and charset of text data. For each text, we build the distribution of bigram bytes.
+ * Then we use marked-up dictionaries with distributions of bigram bytes of various languages and charsets.
+ * Using a naive Bayesian classifier, find the most likely charset and language and return it.
+ */
+template <bool detect_language>
+struct CharsetClassificationImpl
+{
     static void vector(
         const ColumnString::Chars & data,
         const ColumnString::Offsets & offsets,
@@ -74,7 +78,7 @@ struct CharsetClassificationImpl
     {
         const auto & encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
 
-        if (detect_language)
+        if constexpr (detect_language)
             /// 2 chars for ISO code + 1 zero byte
             res_data.reserve(offsets.size() * 3);
         else
@@ -83,37 +87,43 @@ struct CharsetClassificationImpl
 
         res_offsets.resize(offsets.size());
 
-        size_t res_offset = 0;
+        size_t current_result_offset = 0;
+
+        double zero_frequency_log = log(zero_frequency);
 
         for (size_t i = 0; i < offsets.size(); ++i)
         {
             const UInt8 * str = data.data() + offsets[i - 1];
             const size_t str_len = offsets[i] - offsets[i - 1] - 1;
 
-            std::string_view res;
-
-            HashMap model;
+            HashMapWithStackMemory<UInt16, UInt64, DefaultHash<UInt16>, 4> model;
             calculateStats(str, str_len, model);
 
+            std::string_view result_value;
+
            /// Go through the dictionary and find the charset with the highest weight
-            Float64 max_result = log(zero_frequency) * (max_string_size);
+            Float64 max_result = zero_frequency_log * (max_string_size);
            for (const auto & item : encodings_freq)
            {
                Float64 score = naiveBayes(item.map, model, max_result);
                if (max_result < score)
                {
                    max_result = score;
-                    res = detect_language ?
item.lang : item.name; + + if constexpr (detect_language) + result_value = item.lang; + else + result_value = item.name; } } - res_data.resize(res_offset + res.size() + 1); - memcpy(&res_data[res_offset], res.data(), res.size()); + size_t result_value_size = result_value.size(); + res_data.resize(current_result_offset + result_value_size + 1); + memcpy(&res_data[current_result_offset], result_value.data(), result_value_size); + res_data[current_result_offset + result_value_size] = '\0'; + current_result_offset += result_value_size + 1; - res_data[res_offset + res.size()] = 0; - res_offset += res.size() + 1; - - res_offsets[i] = res_offset; + res_offsets[i] = current_result_offset; } } }; diff --git a/src/Functions/FunctionsCodingIP.cpp b/src/Functions/FunctionsCodingIP.cpp index 3e7c8bff4d5..de814529d03 100644 --- a/src/Functions/FunctionsCodingIP.cpp +++ b/src/Functions/FunctionsCodingIP.cpp @@ -2,12 +2,15 @@ #pragma clang diagnostic ignored "-Wreserved-identifier" #endif +#include + #include #include #include #include #include #include +#include #include #include #include @@ -17,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -239,17 +242,19 @@ private: } }; - +template class FunctionIPv6StringToNum : public IFunction { public: - static constexpr auto name = "IPv6StringToNum"; - static FunctionPtr create(ContextPtr) { return std::make_shared(); } + static constexpr auto name = exception_mode == IPStringToNumExceptionMode::Throw + ? "IPv6StringToNum" + : (exception_mode == IPStringToNumExceptionMode::Default ? "IPv6StringToNumOrDefault" : "IPv6StringToNumOrNull"); - static inline bool tryParseIPv4(const char * pos) + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + + explicit FunctionIPv6StringToNum(ContextPtr context) + : cast_ipv4_ipv6_default_on_conversion_error(context->getSettingsRef().cast_ipv4_ipv6_default_on_conversion_error) { - UInt32 result = 0; - return DB::parseIPv4(pos, reinterpret_cast(&result)); } String getName() const override { return name; } @@ -258,62 +263,43 @@ public: bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + bool useDefaultImplementationForConstants() const override { return true; } + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (!isString(arguments[0])) + if (!isStringOrFixedString(arguments[0])) + { throw Exception( - "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[0]->getName(), getName()); + } - return std::make_shared(IPV6_BINARY_LENGTH); + auto result_type = std::make_shared(IPV6_BINARY_LENGTH); + + if constexpr (exception_mode == IPStringToNumExceptionMode::Null) + { + return makeNullable(result_type); + } + + return result_type; } - bool useDefaultImplementationForConstants() const override { return true; } - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override { const ColumnPtr & column = arguments[0].column; - if (const auto * col_in = checkAndGetColumn(column.get())) + if constexpr (exception_mode == IPStringToNumExceptionMode::Throw) { - auto col_res = ColumnFixedString::create(IPV6_BINARY_LENGTH); - - auto & vec_res = col_res->getChars(); - vec_res.resize(col_in->size() * IPV6_BINARY_LENGTH); - - const 
ColumnString::Chars & vec_src = col_in->getChars(); - const ColumnString::Offsets & offsets_src = col_in->getOffsets(); - size_t src_offset = 0; - char src_ipv4_buf[sizeof("::ffff:") + IPV4_MAX_TEXT_LENGTH + 1] = "::ffff:"; - - for (size_t out_offset = 0, i = 0; out_offset < vec_res.size(); out_offset += IPV6_BINARY_LENGTH, ++i) + if (cast_ipv4_ipv6_default_on_conversion_error) { - /// For both cases below: In case of failure, the function parseIPv6 fills vec_res with zero bytes. - - /// If the source IP address is parsable as an IPv4 address, then transform it into a valid IPv6 address. - /// Keeping it simple by just prefixing `::ffff:` to the IPv4 address to represent it as a valid IPv6 address. - if (tryParseIPv4(reinterpret_cast(&vec_src[src_offset]))) - { - std::memcpy( - src_ipv4_buf + std::strlen("::ffff:"), - reinterpret_cast(&vec_src[src_offset]), - std::min(offsets_src[i] - src_offset, IPV4_MAX_TEXT_LENGTH + 1)); - parseIPv6(src_ipv4_buf, reinterpret_cast(&vec_res[out_offset])); - } - else - { - parseIPv6( - reinterpret_cast(&vec_src[src_offset]), reinterpret_cast(&vec_res[out_offset])); - } - src_offset = offsets_src[i]; + return convertToIPv6(column); } - - return col_res; } - else - throw Exception("Illegal column " + arguments[0].column->getName() - + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); + + return convertToIPv6(column); } + +private: + bool cast_ipv4_ipv6_default_on_conversion_error = false; }; @@ -381,69 +367,64 @@ public: } }; - +template class FunctionIPv4StringToNum : public IFunction { public: - static constexpr auto name = "IPv4StringToNum"; - static FunctionPtr create(ContextPtr) { return std::make_shared(); } + static constexpr auto name = exception_mode == IPStringToNumExceptionMode::Throw + ? "IPv4StringToNum" + : (exception_mode == IPStringToNumExceptionMode::Default ? 
"IPv4StringToNumOrDefault" : "IPv4StringToNumOrNull"); - String getName() const override + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + + explicit FunctionIPv4StringToNum(ContextPtr context) + : cast_ipv4_ipv6_default_on_conversion_error(context->getSettingsRef().cast_ipv4_ipv6_default_on_conversion_error) { - return name; } + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 1; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + bool useDefaultImplementationForConstants() const override { return true; } + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (!isString(arguments[0])) - throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[0]->getName(), getName()); + } - return std::make_shared(); + auto result_type = std::make_shared(); + + if constexpr (exception_mode == IPStringToNumExceptionMode::Null) + { + return makeNullable(result_type); + } + + return result_type; } - static inline UInt32 parseIPv4(const char * pos) - { - UInt32 result = 0; - DB::parseIPv4(pos, reinterpret_cast(&result)); - - return result; - } - - bool useDefaultImplementationForConstants() const override { return true; } - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override { const ColumnPtr & column = arguments[0].column; - if (const ColumnString * col = checkAndGetColumn(column.get())) + if constexpr (exception_mode == IPStringToNumExceptionMode::Throw) { - auto col_res = ColumnUInt32::create(); - - ColumnUInt32::Container & vec_res = col_res->getData(); - vec_res.resize(col->size()); - - const ColumnString::Chars & vec_src = col->getChars(); - const ColumnString::Offsets & offsets_src = col->getOffsets(); - size_t prev_offset = 0; - - for (size_t i = 0; i < vec_res.size(); ++i) + if (cast_ipv4_ipv6_default_on_conversion_error) { - vec_res[i] = parseIPv4(reinterpret_cast(&vec_src[prev_offset])); - prev_offset = offsets_src[i]; + return convertToIPv4(column); } - - return col_res; } - else - throw Exception("Illegal column " + arguments[0].column->getName() - + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); + + return convertToIPv4(column); } + +private: + bool cast_ipv4_ipv6_default_on_conversion_error = false; }; @@ -503,16 +484,21 @@ private: } }; -class FunctionToIPv4 : public FunctionIPv4StringToNum +template +class FunctionToIPv4 : public FunctionIPv4StringToNum { public: - static constexpr auto name = "toIPv4"; - static FunctionPtr create(ContextPtr) { return std::make_shared(); } + using Base = FunctionIPv4StringToNum; - String getName() const override - { - return name; - } + static constexpr auto name = exception_mode == IPStringToNumExceptionMode::Throw + ? "toIPv4" + : (exception_mode == IPStringToNumExceptionMode::Default ? 
"toIPv4OrDefault" : "toIPv4OrNull"); + + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + + explicit FunctionToIPv4(ContextPtr context) : Base(context) { } + + String getName() const override { return name; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } @@ -521,18 +507,35 @@ public: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (!isString(arguments[0])) - throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[0]->getName(), getName()); + } - return DataTypeFactory::instance().get("IPv4"); + auto result_type = DataTypeFactory::instance().get("IPv4"); + + if constexpr (exception_mode == IPStringToNumExceptionMode::Null) + { + return makeNullable(result_type); + } + + return result_type; } }; -class FunctionToIPv6 : public FunctionIPv6StringToNum +template +class FunctionToIPv6 : public FunctionIPv6StringToNum { public: - static constexpr auto name = "toIPv6"; - static FunctionPtr create(ContextPtr) { return std::make_shared(); } + using Base = FunctionIPv6StringToNum; + + static constexpr auto name = exception_mode == IPStringToNumExceptionMode::Throw + ? "toIPv6" + : (exception_mode == IPStringToNumExceptionMode::Default ? "toIPv6OrDefault" : "toIPv6OrNull"); + + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + + explicit FunctionToIPv6(ContextPtr context) : Base(context) { } String getName() const override { return name; } @@ -540,11 +543,20 @@ public: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (!isString(arguments[0])) - throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + if (!isStringOrFixedString(arguments[0])) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[0]->getName(), getName()); + } - return DataTypeFactory::instance().get("IPv6"); + auto result_type = DataTypeFactory::instance().get("IPv6"); + + if constexpr (exception_mode == IPStringToNumExceptionMode::Null) + { + return makeNullable(result_type); + } + + return result_type; } }; @@ -971,7 +983,7 @@ public: } }; -class FunctionIsIPv4String : public FunctionIPv4StringToNum +class FunctionIsIPv4String : public IFunction { public: static constexpr auto name = "isIPv4String"; @@ -980,46 +992,51 @@ public: String getName() const override { return name; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + size_t getNumberOfArguments() const override { return 1; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + bool useDefaultImplementationForConstants() const override { return true; } DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (!isString(arguments[0])) - throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception( + "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); return 
std::make_shared(); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override { - const ColumnPtr & column = arguments[0].column; - if (const ColumnString * col = checkAndGetColumn(column.get())) + const ColumnString * input_column = checkAndGetColumn(arguments[0].column.get()); + + if (!input_column) { - auto col_res = ColumnUInt8::create(); - - ColumnUInt8::Container & vec_res = col_res->getData(); - vec_res.resize(col->size()); - - const ColumnString::Chars & vec_src = col->getChars(); - const ColumnString::Offsets & offsets_src = col->getOffsets(); - size_t prev_offset = 0; - UInt32 result = 0; - - for (size_t i = 0; i < vec_res.size(); ++i) - { - vec_res[i] = DB::parseIPv4(reinterpret_cast(&vec_src[prev_offset]), reinterpret_cast(&result)); - prev_offset = offsets_src[i]; - } - return col_res; + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", arguments[0].column->getName(), getName()); } - else - throw Exception("Illegal column " + arguments[0].column->getName() - + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); + + auto col_res = ColumnUInt8::create(); + + ColumnUInt8::Container & vec_res = col_res->getData(); + vec_res.resize(input_column->size()); + + const ColumnString::Chars & vec_src = input_column->getChars(); + const ColumnString::Offsets & offsets_src = input_column->getOffsets(); + size_t prev_offset = 0; + UInt32 result = 0; + + for (size_t i = 0; i < vec_res.size(); ++i) + { + vec_res[i] = DB::parseIPv4(reinterpret_cast(&vec_src[prev_offset]), reinterpret_cast(&result)); + prev_offset = offsets_src[i]; + } + + return col_res; } }; -class FunctionIsIPv6String : public FunctionIPv6StringToNum +class FunctionIsIPv6String : public IFunction { public: static constexpr auto name = "isIPv6String"; @@ -1028,44 +1045,49 @@ public: String getName() const override { return name; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + size_t getNumberOfArguments() const override { return 1; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + bool useDefaultImplementationForConstants() const override { return true; } DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (!isString(arguments[0])) - throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[0]->getName(), getName()); + } return std::make_shared(); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override { - const ColumnPtr & column = arguments[0].column; - - if (const ColumnString * col = checkAndGetColumn(column.get())) + const ColumnString * input_column = checkAndGetColumn(arguments[0].column.get()); + if (!input_column) { - auto col_res = ColumnUInt8::create(); - - ColumnUInt8::Container & vec_res = col_res->getData(); - vec_res.resize(col->size()); - - const ColumnString::Chars & vec_src = col->getChars(); - const ColumnString::Offsets & offsets_src = col->getOffsets(); - size_t prev_offset = 0; - char v[IPV6_BINARY_LENGTH]; - - for (size_t i = 0; i < vec_res.size(); ++i) - { - vec_res[i] = 
DB::parseIPv6(reinterpret_cast(&vec_src[prev_offset]), reinterpret_cast(v)); - prev_offset = offsets_src[i]; - } - return col_res; + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", arguments[0].column->getName(), getName()); } - else - throw Exception("Illegal column " + arguments[0].column->getName() - + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); + + auto col_res = ColumnUInt8::create(); + + ColumnUInt8::Container & vec_res = col_res->getData(); + vec_res.resize(input_column->size()); + + const ColumnString::Chars & vec_src = input_column->getChars(); + const ColumnString::Offsets & offsets_src = input_column->getOffsets(); + size_t prev_offset = 0; + char buffer[IPV6_BINARY_LENGTH]; + + for (size_t i = 0; i < vec_res.size(); ++i) + { + vec_res[i] = DB::parseIPv6(reinterpret_cast(&vec_src[prev_offset]), reinterpret_cast(buffer)); + prev_offset = offsets_src[i]; + } + + return col_res; } }; @@ -1079,8 +1101,6 @@ void registerFunctionsCoding(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction>(); factory.registerFunction>(); - factory.registerFunction(); - factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); @@ -1089,14 +1109,26 @@ void registerFunctionsCoding(FunctionFactory & factory) factory.registerFunction>(); factory.registerFunction>(); - factory.registerFunction(); - factory.registerFunction(); - factory.registerFunction(); + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); - /// MysQL compatibility aliases: - factory.registerAlias("INET_ATON", FunctionIPv4StringToNum::name, FunctionFactory::CaseInsensitive); + factory.registerFunction(); + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); + factory.registerFunction>(); + + + /// MySQL compatibility aliases: + factory.registerAlias("INET_ATON", FunctionIPv4StringToNum::name, FunctionFactory::CaseInsensitive); factory.registerAlias("INET6_NTOA", FunctionIPv6NumToString::name, FunctionFactory::CaseInsensitive); - factory.registerAlias("INET6_ATON", FunctionIPv6StringToNum::name, FunctionFactory::CaseInsensitive); + factory.registerAlias("INET6_ATON", FunctionIPv6StringToNum::name, FunctionFactory::CaseInsensitive); factory.registerAlias("INET_NTOA", NameFunctionIPv4NumToString::name, FunctionFactory::CaseInsensitive); } diff --git a/src/Functions/FunctionsCodingIP.h b/src/Functions/FunctionsCodingIP.h new file mode 100644 index 00000000000..246e62d965c --- /dev/null +++ b/src/Functions/FunctionsCodingIP.h @@ -0,0 +1,212 @@ +#pragma once + +#include + +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING; + extern const int ILLEGAL_COLUMN; +} + +enum class IPStringToNumExceptionMode : uint8_t +{ + Throw, + Default, + Null +}; + +static inline bool tryParseIPv4(const char * pos, UInt32 & result_value) +{ + return parseIPv4(pos, reinterpret_cast(&result_value)); +} + +namespace detail +{ + template + ColumnPtr convertToIPv6(const StringColumnType & string_column) + { + size_t column_size = string_column.size(); + + ColumnUInt8::MutablePtr col_null_map_to; + ColumnUInt8::Container * vec_null_map_to = nullptr; + + if constexpr (exception_mode == 
IPStringToNumExceptionMode::Null) + { + col_null_map_to = ColumnUInt8::create(column_size, false); + vec_null_map_to = &col_null_map_to->getData(); + } + + auto col_res = ColumnFixedString::create(IPV6_BINARY_LENGTH); + + auto & vec_res = col_res->getChars(); + vec_res.resize(column_size * IPV6_BINARY_LENGTH); + + using Chars = typename StringColumnType::Chars; + const Chars & vec_src = string_column.getChars(); + + size_t src_offset = 0; + char src_ipv4_buf[sizeof("::ffff:") + IPV4_MAX_TEXT_LENGTH + 1] = "::ffff:"; + + /// ColumnFixedString contains not null terminated strings. But functions parseIPv6, parseIPv4 expect null terminated string. + std::string fixed_string_buffer; + + if constexpr (std::is_same_v) + { + fixed_string_buffer.resize(string_column.getN()); + } + + for (size_t out_offset = 0, i = 0; out_offset < vec_res.size(); out_offset += IPV6_BINARY_LENGTH, ++i) + { + size_t src_next_offset = src_offset; + + const char * src_value = nullptr; + unsigned char * res_value = reinterpret_cast(&vec_res[out_offset]); + + if constexpr (std::is_same_v) + { + src_value = reinterpret_cast(&vec_src[src_offset]); + src_next_offset = string_column.getOffsets()[i]; + } + else if constexpr (std::is_same_v) + { + size_t fixed_string_size = string_column.getN(); + + std::memcpy(fixed_string_buffer.data(), reinterpret_cast(&vec_src[src_offset]), fixed_string_size); + src_value = fixed_string_buffer.data(); + + src_next_offset += fixed_string_size; + } + + bool parse_result = false; + UInt32 dummy_result = 0; + + /// For both cases below: In case of failure, the function parseIPv6 fills vec_res with zero bytes. + + /// If the source IP address is parsable as an IPv4 address, then transform it into a valid IPv6 address. + /// Keeping it simple by just prefixing `::ffff:` to the IPv4 address to represent it as a valid IPv6 address. + if (tryParseIPv4(src_value, dummy_result)) + { + std::memcpy( + src_ipv4_buf + std::strlen("::ffff:"), + src_value, + std::min(src_next_offset - src_offset, IPV4_MAX_TEXT_LENGTH + 1)); + parse_result = parseIPv6(src_ipv4_buf, res_value); + } + else + { + parse_result = parseIPv6(src_value, res_value); + } + + if (!parse_result) + { + if constexpr (exception_mode == IPStringToNumExceptionMode::Throw) + throw Exception("Invalid IPv6 value", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); + else if constexpr (exception_mode == IPStringToNumExceptionMode::Default) + vec_res[i] = 0; + else if constexpr (exception_mode == IPStringToNumExceptionMode::Null) + (*vec_null_map_to)[i] = true; + } + + src_offset = src_next_offset; + } + + if constexpr (exception_mode == IPStringToNumExceptionMode::Null) + return ColumnNullable::create(std::move(col_res), std::move(col_null_map_to)); + + return col_res; + } +} + +template +ColumnPtr convertToIPv6(ColumnPtr column) +{ + size_t column_size = column->size(); + + auto col_res = ColumnFixedString::create(IPV6_BINARY_LENGTH); + + auto & vec_res = col_res->getChars(); + vec_res.resize(column_size * IPV6_BINARY_LENGTH); + + if (const auto * column_input_string = checkAndGetColumn(column.get())) + { + return detail::convertToIPv6(*column_input_string); + } + else if (const auto * column_input_fixed_string = checkAndGetColumn(column.get())) + { + return detail::convertToIPv6(*column_input_fixed_string); + } + else + { + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column type {}. 
Expected String or FixedString", column->getName()); + } +} + +template +ColumnPtr convertToIPv4(ColumnPtr column) +{ + const ColumnString * column_string = checkAndGetColumn(column.get()); + + if (!column_string) + { + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column type {}. Expected String.", column->getName()); + } + + size_t column_size = column_string->size(); + + ColumnUInt8::MutablePtr col_null_map_to; + ColumnUInt8::Container * vec_null_map_to = nullptr; + + if constexpr (exception_mode == IPStringToNumExceptionMode::Null) + { + col_null_map_to = ColumnUInt8::create(column_size, false); + vec_null_map_to = &col_null_map_to->getData(); + } + + auto col_res = ColumnUInt32::create(); + + ColumnUInt32::Container & vec_res = col_res->getData(); + vec_res.resize(column_size); + + const ColumnString::Chars & vec_src = column_string->getChars(); + const ColumnString::Offsets & offsets_src = column_string->getOffsets(); + size_t prev_offset = 0; + + for (size_t i = 0; i < vec_res.size(); ++i) + { + bool parse_result = tryParseIPv4(reinterpret_cast(&vec_src[prev_offset]), vec_res[i]); + + if (!parse_result) + { + if constexpr (exception_mode == IPStringToNumExceptionMode::Throw) + { + throw Exception("Invalid IPv4 value", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); + } + else if constexpr (exception_mode == IPStringToNumExceptionMode::Default) + { + vec_res[i] = 0; + } + else if constexpr (exception_mode == IPStringToNumExceptionMode::Null) + { + (*vec_null_map_to)[i] = true; + vec_res[i] = 0; + } + } + + prev_offset = offsets_src[i]; + } + + if constexpr (exception_mode == IPStringToNumExceptionMode::Null) + return ColumnNullable::create(std::move(col_res), std::move(col_null_map_to)); + + return col_res; +} + +} diff --git a/src/Functions/FunctionsComparison.h b/src/Functions/FunctionsComparison.h index 0d0195eb2d7..48170d6f564 100644 --- a/src/Functions/FunctionsComparison.h +++ b/src/Functions/FunctionsComparison.h @@ -1055,7 +1055,7 @@ private: ColumnPtr executeGeneric(const ColumnWithTypeAndName & c0, const ColumnWithTypeAndName & c1) const { - DataTypePtr common_type = getLeastSupertype({c0.type, c1.type}); + DataTypePtr common_type = getLeastSupertype(DataTypes{c0.type, c1.type}); ColumnPtr c0_converted = castColumn(c0, common_type); ColumnPtr c1_converted = castColumn(c1, common_type); @@ -1228,7 +1228,7 @@ public: // Comparing Date/Date32 and DateTime64 requires implicit conversion, if (date_and_datetime && (isDateOrDate32(left_type) || isDateOrDate32(right_type))) { - DataTypePtr common_type = getLeastSupertype({left_type, right_type}); + DataTypePtr common_type = getLeastSupertype(DataTypes{left_type, right_type}); ColumnPtr c0_converted = castColumn(col_with_type_and_name_left, common_type); ColumnPtr c1_converted = castColumn(col_with_type_and_name_right, common_type); return executeDecimal({c0_converted, common_type, "left"}, {c1_converted, common_type, "right"}); @@ -1258,7 +1258,7 @@ public: } else if (date_and_datetime) { - DataTypePtr common_type = getLeastSupertype({left_type, right_type}); + DataTypePtr common_type = getLeastSupertype(DataTypes{left_type, right_type}); ColumnPtr c0_converted = castColumn(col_with_type_and_name_left, common_type); ColumnPtr c1_converted = castColumn(col_with_type_and_name_right, common_type); if (!((res = executeNumLeftType(c0_converted.get(), c1_converted.get())) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 5e11cab7e79..bba94409fb9 100644 --- 
diff --git a/src/Functions/FunctionsComparison.h b/src/Functions/FunctionsComparison.h index 0d0195eb2d7..48170d6f564 100644 --- a/src/Functions/FunctionsComparison.h +++ b/src/Functions/FunctionsComparison.h @@ -1055,7 +1055,7 @@ private: ColumnPtr executeGeneric(const ColumnWithTypeAndName & c0, const ColumnWithTypeAndName & c1) const { - DataTypePtr common_type = getLeastSupertype({c0.type, c1.type}); + DataTypePtr common_type = getLeastSupertype(DataTypes{c0.type, c1.type}); ColumnPtr c0_converted = castColumn(c0, common_type); ColumnPtr c1_converted = castColumn(c1, common_type); @@ -1228,7 +1228,7 @@ public: // Comparing Date/Date32 and DateTime64 requires implicit conversion, if (date_and_datetime && (isDateOrDate32(left_type) || isDateOrDate32(right_type))) { - DataTypePtr common_type = getLeastSupertype({left_type, right_type}); + DataTypePtr common_type = getLeastSupertype(DataTypes{left_type, right_type}); ColumnPtr c0_converted = castColumn(col_with_type_and_name_left, common_type); ColumnPtr c1_converted = castColumn(col_with_type_and_name_right, common_type); return executeDecimal({c0_converted, common_type, "left"}, {c1_converted, common_type, "right"}); @@ -1258,7 +1258,7 @@ public: } else if (date_and_datetime) { - DataTypePtr common_type = getLeastSupertype({left_type, right_type}); + DataTypePtr common_type = getLeastSupertype(DataTypes{left_type, right_type}); ColumnPtr c0_converted = castColumn(col_with_type_and_name_left, common_type); ColumnPtr c1_converted = castColumn(col_with_type_and_name_right, common_type); if (!((res = executeNumLeftType(c0_converted.get(), c1_converted.get())) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 5e11cab7e79..bba94409fb9 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -25,6 +25,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -34,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -2532,10 +2537,12 @@ public: , const DataTypes & argument_types_ , const DataTypePtr & return_type_ , std::optional diagnostic_ - , CastType cast_type_) + , CastType cast_type_ + , bool cast_ipv4_ipv6_default_on_conversion_error_) : cast_name(cast_name_), monotonicity_for_range(std::move(monotonicity_for_range_)) , argument_types(argument_types_), return_type(return_type_), diagnostic(std::move(diagnostic_)) , cast_type(cast_type_) + , cast_ipv4_ipv6_default_on_conversion_error(cast_ipv4_ipv6_default_on_conversion_error_) { } @@ -2584,6 +2591,7 @@ private: std::optional diagnostic; CastType cast_type; + bool cast_ipv4_ipv6_default_on_conversion_error; static WrapperType createFunctionAdaptor(FunctionPtr function, const DataTypePtr & from_type) { @@ -2934,20 +2942,61 @@ private: throw Exception{"CAST AS Tuple can only be performed between tuple types or from String.\nLeft type: " + from_type_untyped->getName() + ", right type: " + to_type->getName(), ErrorCodes::TYPE_MISMATCH}; - if (from_type->getElements().size() != to_type->getElements().size()) - throw Exception{"CAST AS Tuple can only be performed between tuple types with the same number of elements or from String.\n" - "Left type: " + from_type->getName() + ", right type: " + to_type->getName(), ErrorCodes::TYPE_MISMATCH}; - const auto & from_element_types = from_type->getElements(); const auto & to_element_types = to_type->getElements(); - auto element_wrappers = getElementWrappers(from_element_types, to_element_types); - return [element_wrappers, from_element_types, to_element_types] + std::vector element_wrappers; + std::vector> to_reverse_index; + + /// For named tuples allow conversions for tuples with + /// different sets of elements. If an element exists in @to_type + /// and doesn't exist in @from_type, it will be filled with default values. 
+ if (from_type->haveExplicitNames() && from_type->serializeNames() + && to_type->haveExplicitNames() && to_type->serializeNames()) + { + const auto & from_names = from_type->getElementNames(); + std::unordered_map from_positions; + from_positions.reserve(from_names.size()); + for (size_t i = 0; i < from_names.size(); ++i) + from_positions[from_names[i]] = i; + + const auto & to_names = to_type->getElementNames(); + element_wrappers.reserve(to_names.size()); + to_reverse_index.reserve(from_names.size()); + + for (size_t i = 0; i < to_names.size(); ++i) + { + auto it = from_positions.find(to_names[i]); + if (it != from_positions.end()) + { + element_wrappers.emplace_back(prepareUnpackDictionaries(from_element_types[it->second], to_element_types[i])); + to_reverse_index.emplace_back(it->second); + } + else + { + element_wrappers.emplace_back(); + to_reverse_index.emplace_back(); + } + } + } + else + { + if (from_element_types.size() != to_element_types.size()) + throw Exception{"CAST AS Tuple can only be performed between tuple types with the same number of elements or from String.\n" + "Left type: " + from_type->getName() + ", right type: " + to_type->getName(), ErrorCodes::TYPE_MISMATCH}; + + element_wrappers = getElementWrappers(from_element_types, to_element_types); + to_reverse_index.reserve(to_element_types.size()); + for (size_t i = 0; i < to_element_types.size(); ++i) + to_reverse_index.emplace_back(i); + } + + return [element_wrappers, from_element_types, to_element_types, to_reverse_index] (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t input_rows_count) -> ColumnPtr { const auto * col = arguments.front().column.get(); - size_t tuple_size = from_element_types.size(); + size_t tuple_size = to_element_types.size(); const ColumnTuple & column_tuple = typeid_cast(*col); Columns converted_columns(tuple_size); @@ -2955,8 +3004,16 @@ private: /// invoke conversion for each element for (size_t i = 0; i < tuple_size; ++i) { - ColumnsWithTypeAndName element = {{column_tuple.getColumns()[i], from_element_types[i], "" }}; - converted_columns[i] = element_wrappers[i](element, to_element_types[i], nullable_source, input_rows_count); + if (to_reverse_index[i]) + { + size_t from_idx = *to_reverse_index[i]; + ColumnsWithTypeAndName element = {{column_tuple.getColumns()[from_idx], from_element_types[from_idx], "" }}; + converted_columns[i] = element_wrappers[i](element, to_element_types[i], nullable_source, input_rows_count); + } + else + { + converted_columns[i] = to_element_types[i]->createColumn()->cloneResized(input_rows_count); + } } return ColumnTuple::create(converted_columns); @@ -3077,6 +3134,68 @@ private: } }
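The element-mapping logic just added is the heart of the named-tuple change: each target element is matched to a source element by name, and unmatched targets are default-filled. A self-contained model (plain C++, scalar values standing in for columns and conversion wrappers; an illustration, not the ClickHouse code):

```cpp
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

/// Model of the new named-tuple CAST: for every target element, find the
/// matching source element by name; missing ones get a default value.
std::vector<double> castNamedTuple(
    const std::vector<std::string> & from_names, const std::vector<double> & from_values,
    const std::vector<std::string> & to_names)
{
    std::unordered_map<std::string, size_t> from_positions;
    for (size_t i = 0; i < from_names.size(); ++i)
        from_positions[from_names[i]] = i;

    std::vector<double> result;
    result.reserve(to_names.size());
    for (const auto & name : to_names)
    {
        auto it = from_positions.find(name);
        /// element_wrappers[i] would convert the value here; absent names
        /// are filled with the target type's default, as in the diff above.
        result.push_back(it == from_positions.end() ? 0.0 : from_values[it->second]);
    }
    return result;
}

int main()
{
    auto r = castNamedTuple({"a", "b"}, {1.5, 2.5}, {"b", "c"});
    for (double v : r)
        std::cout << v << ' ';   // 2.5 0  (b kept, c defaulted, a dropped)
    std::cout << '\n';
}
```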
Got: {}", from_type->getName()); + + type = recursiveRemoveLowCardinality(type); + } + + return [element_wrappers = getElementWrappers(from_types, to_types), + has_nullable_subcolumns = to_type->hasNullableSubcolumns(), from_types, to_types, paths] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t input_rows_count) + { + size_t tuple_size = to_types.size(); + auto flattened_column = flattenTuple(arguments.front().column); + const auto & column_tuple = assert_cast(*flattened_column); + + if (tuple_size != column_tuple.getColumns().size()) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Expected tuple with {} subcolumn, but got {} subcolumns", + tuple_size, column_tuple.getColumns().size()); + + auto res = ColumnObject::create(has_nullable_subcolumns); + for (size_t i = 0; i < tuple_size; ++i) + { + ColumnsWithTypeAndName element = {{column_tuple.getColumns()[i], from_types[i], "" }}; + auto converted_column = element_wrappers[i](element, to_types[i], nullable_source, input_rows_count); + res->addSubcolumn(paths[i], converted_column->assumeMutable()); + } + + return res; + }; + } + else if (checkAndGetDataType(from_type.get())) + { + return [] (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * nullable_source, size_t input_rows_count) + { + auto res = ConvertImplGenericFromString::execute(arguments, result_type, nullable_source, input_rows_count); + auto & res_object = assert_cast(res->assumeMutableRef()); + res_object.finalize(); + return res; + }; + } + + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object can be performed only from flatten named tuple or string. Got: {}", from_type->getName()); + } + template WrapperType createEnumWrapper(const DataTypePtr & from_type, const DataTypeEnum * to_type) const { @@ -3381,7 +3500,9 @@ private: /// 'requested_result_is_nullable' is true if CAST to Nullable type is requested. WrapperType prepareImpl(const DataTypePtr & from_type, const DataTypePtr & to_type, bool requested_result_is_nullable) const { - if (from_type->equals(*to_type)) + bool convert_to_ipv6 = to_type->getCustomName() && to_type->getCustomName()->getName() == "IPv6"; + + if (from_type->equals(*to_type) && !convert_to_ipv6) { if (isUInt8(from_type)) return createUInt8ToUInt8Wrapper(from_type, to_type); @@ -3449,7 +3570,9 @@ private: return false; }; - auto make_custom_serialization_wrapper = [&](const auto & types) -> bool + bool cast_ipv4_ipv6_default_on_conversion_error_value = cast_ipv4_ipv6_default_on_conversion_error; + + auto make_custom_serialization_wrapper = [&, cast_ipv4_ipv6_default_on_conversion_error_value](const auto & types) -> bool { using Types = std::decay_t; using ToDataType = typename Types::RightType; @@ -3457,8 +3580,45 @@ private: if constexpr (WhichDataType(FromDataType::type_id).isStringOrFixedString()) { - if (to_type->getCustomSerialization()) + if (to_type->getCustomSerialization() && to_type->getCustomName()) { + if (to_type->getCustomName()->getName() == "IPv4") + { + ret = [cast_ipv4_ipv6_default_on_conversion_error_value]( + ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t) + -> ColumnPtr + { + if (!WhichDataType(result_type).isUInt32()) + throw Exception(ErrorCodes::TYPE_MISMATCH, "Wrong result type {}. 
Expected UInt32", result_type->getName()); + + if (cast_ipv4_ipv6_default_on_conversion_error_value) + return convertToIPv4(arguments[0].column); + else + return convertToIPv4(arguments[0].column); + }; + + return true; + } + + if (to_type->getCustomName()->getName() == "IPv6") + { + ret = [cast_ipv4_ipv6_default_on_conversion_error_value]( + ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t) + -> ColumnPtr + { + if (!WhichDataType(result_type).isFixedString()) + throw Exception( + ErrorCodes::TYPE_MISMATCH, "Wrong result type {}. Expected FixedString", result_type->getName()); + + if (cast_ipv4_ipv6_default_on_conversion_error_value) + return convertToIPv6(arguments[0].column); + else + return convertToIPv6(arguments[0].column); + }; + + return true; + } + ret = &ConvertImplGenericFromString::execute; return true; } @@ -3496,6 +3656,8 @@ private: return createTupleWrapper(from_type, checkAndGetDataType(to_type.get())); case TypeIndex::Map: return createMapWrapper(from_type, checkAndGetDataType(to_type.get())); + case TypeIndex::Object: + return createObjectWrapper(from_type, checkAndGetDataType(to_type.get())); case TypeIndex::AggregateFunction: return createAggregateFunctionWrapper(from_type, checkAndGetDataType(to_type.get())); default: diff --git a/src/Functions/FunctionsJSON.cpp b/src/Functions/FunctionsJSON.cpp index d542f023625..e8f9f73b805 100644 --- a/src/Functions/FunctionsJSON.cpp +++ b/src/Functions/FunctionsJSON.cpp @@ -35,9 +35,9 @@ #include #include -#include -#include -#include +#include +#include +#include #include #include diff --git a/src/Functions/FunctionsRound.h b/src/Functions/FunctionsRound.h index 1a8bf85167f..518b969d441 100644 --- a/src/Functions/FunctionsRound.h +++ b/src/Functions/FunctionsRound.h @@ -663,7 +663,7 @@ public: throw Exception{"Elements of array of second argument of function " + getName() + " must be numeric type.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; } - return getLeastSupertype({type_x, type_arr_nested}); + return getLeastSupertype(DataTypes{type_x, type_arr_nested}); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t) const override diff --git a/src/Functions/array/arrayIndex.h b/src/Functions/array/arrayIndex.h index 35c731dfc78..8b42b99cd69 100644 --- a/src/Functions/array/arrayIndex.h +++ b/src/Functions/array/arrayIndex.h @@ -474,7 +474,7 @@ private: auto arg_decayed = removeNullable(removeLowCardinality(arg)); return ((isNativeNumber(inner_type_decayed) || isEnum(inner_type_decayed)) && isNativeNumber(arg_decayed)) - || getLeastSupertype({inner_type_decayed, arg_decayed}); + || getLeastSupertype(DataTypes{inner_type_decayed, arg_decayed}); } /** @@ -1045,7 +1045,7 @@ private: DataTypePtr array_elements_type = assert_cast(*arguments[0].type).getNestedType(); const DataTypePtr & index_type = arguments[1].type; - DataTypePtr common_type = getLeastSupertype({array_elements_type, index_type}); + DataTypePtr common_type = getLeastSupertype(DataTypes{array_elements_type, index_type}); ColumnPtr col_nested = castColumn({ col->getDataPtr(), array_elements_type, "" }, common_type); diff --git a/src/Functions/array/arrayResize.cpp b/src/Functions/array/arrayResize.cpp index 9d2a29b2fb4..1e6dcfbf069 100644 --- a/src/Functions/array/arrayResize.cpp +++ b/src/Functions/array/arrayResize.cpp @@ -62,7 +62,7 @@ public: if (number_of_arguments == 2) return arguments[0]; else /* if (number_of_arguments == 3) */ - return 
std::make_shared(getLeastSupertype({array_type->getNestedType(), arguments[2]})); + return std::make_shared(getLeastSupertype(DataTypes{array_type->getNestedType(), arguments[2]})); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type, size_t input_rows_count) const override diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index 0b30f404f8e..730612745ef 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -632,7 +632,7 @@ private: const ColumnWithTypeAndName & arg1 = arguments[1]; const ColumnWithTypeAndName & arg2 = arguments[2]; - DataTypePtr common_type = getLeastSupertype({arg1.type, arg2.type}); + DataTypePtr common_type = getLeastSupertype(DataTypes{arg1.type, arg2.type}); ColumnPtr col_then = castColumn(arg1, common_type); ColumnPtr col_else = castColumn(arg2, common_type); @@ -1022,7 +1022,7 @@ public: throw Exception("Illegal type " + arguments[0]->getName() + " of first argument (condition) of function if. Must be UInt8.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - return getLeastSupertype({arguments[1], arguments[2]}); + return getLeastSupertype(DataTypes{arguments[1], arguments[2]}); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & args, const DataTypePtr & result_type, size_t input_rows_count) const override diff --git a/src/Functions/ifNull.cpp b/src/Functions/ifNull.cpp index 31880b81a41..ab8e2677d28 100644 --- a/src/Functions/ifNull.cpp +++ b/src/Functions/ifNull.cpp @@ -47,7 +47,7 @@ public: if (!arguments[0]->isNullable()) return arguments[0]; - return getLeastSupertype({removeNullable(arguments[0]), arguments[1]}); + return getLeastSupertype(DataTypes{removeNullable(arguments[0]), arguments[1]}); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override diff --git a/src/Functions/neighbor.cpp b/src/Functions/neighbor.cpp index a1254446e01..ab447e61aed 100644 --- a/src/Functions/neighbor.cpp +++ b/src/Functions/neighbor.cpp @@ -78,7 +78,7 @@ public: // check that default value column has supertype with first argument if (number_of_arguments == 3) - return getLeastSupertype({arguments[0], arguments[2]}); + return getLeastSupertype(DataTypes{arguments[0], arguments[2]}); return arguments[0]; } diff --git a/src/Functions/normalizeString.cpp b/src/Functions/normalizeString.cpp index 2fe6a1159af..55eee90f136 100644 --- a/src/Functions/normalizeString.cpp +++ b/src/Functions/normalizeString.cpp @@ -98,8 +98,6 @@ struct NormalizeUTF8Impl ColumnString::Offset current_from_offset = 0; ColumnString::Offset current_to_offset = 0; - icu::UnicodeString to_string; - PODArray from_uchars; PODArray to_uchars; diff --git a/src/Functions/registerFunctionsArithmetic.cpp b/src/Functions/registerFunctionsArithmetic.cpp index d3d82ca0dd8..96e77d34882 100644 --- a/src/Functions/registerFunctionsArithmetic.cpp +++ b/src/Functions/registerFunctionsArithmetic.cpp @@ -41,6 +41,7 @@ void registerFunctionBitBoolMaskOr(FunctionFactory & factory); void registerFunctionBitBoolMaskAnd(FunctionFactory & factory); void registerFunctionBitWrapperFunc(FunctionFactory & factory); void registerFunctionBitSwapLastTwo(FunctionFactory & factory); +void registerFunctionZTest(FunctionFactory & factory); void registerFunctionsArithmetic(FunctionFactory & factory) @@ -84,6 +85,8 @@ void registerFunctionsArithmetic(FunctionFactory & factory) registerFunctionBitBoolMaskAnd(factory); registerFunctionBitWrapperFunc(factory); registerFunctionBitSwapLastTwo(factory); + + 
registerFunctionZTest(factory); } } diff --git a/src/Functions/roundToExp2.cpp b/src/Functions/roundToExp2.cpp index 37f0637c79a..846890bc5c8 100644 --- a/src/Functions/roundToExp2.cpp +++ b/src/Functions/roundToExp2.cpp @@ -14,36 +14,36 @@ namespace { template -inline std::enable_if_t && (sizeof(T) <= sizeof(UInt32)), T> -roundDownToPowerOfTwo(T x) +requires std::is_integral_v && (sizeof(T) <= sizeof(UInt32)) +inline T roundDownToPowerOfTwo(T x) { return x <= 0 ? 0 : (T(1) << (31 - __builtin_clz(x))); } template -inline std::enable_if_t && (sizeof(T) == sizeof(UInt64)), T> -roundDownToPowerOfTwo(T x) +requires std::is_integral_v && (sizeof(T) == sizeof(UInt64)) +inline T roundDownToPowerOfTwo(T x) { return x <= 0 ? 0 : (T(1) << (63 - __builtin_clzll(x))); } template -inline std::enable_if_t, T> -roundDownToPowerOfTwo(T x) +requires std::is_same_v +inline T roundDownToPowerOfTwo(T x) { return bit_cast(bit_cast(x) & ~((1ULL << 23) - 1)); } template -inline std::enable_if_t, T> -roundDownToPowerOfTwo(T x) +requires std::is_same_v +inline T roundDownToPowerOfTwo(T x) { return bit_cast(bit_cast(x) & ~((1ULL << 52) - 1)); } template -inline std::enable_if_t, T> -roundDownToPowerOfTwo(T) +requires is_big_int_v +inline T roundDownToPowerOfTwo(T) { throw Exception("roundToExp2() for big integers is not implemented", ErrorCodes::NOT_IMPLEMENTED); } diff --git a/src/Functions/transform.cpp b/src/Functions/transform.cpp index b7e1db59c23..de9f1a5ba05 100644 --- a/src/Functions/transform.cpp +++ b/src/Functions/transform.cpp @@ -117,7 +117,7 @@ public: + " has signature: transform(T, Array(T), Array(U), U) -> U; or transform(T, Array(T), Array(T)) -> T; where T and U are types.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; - return getLeastSupertype({type_x, type_arr_to_nested}); + return getLeastSupertype(DataTypes{type_x, type_arr_to_nested}); } else { @@ -140,7 +140,7 @@ public: if (type_arr_to_nested->isValueRepresentedByNumber() && type_default->isValueRepresentedByNumber()) { /// We take the smallest common type for the elements of the array of values `to` and for `default`. - return getLeastSupertype({type_arr_to_nested, type_default}); + return getLeastSupertype(DataTypes{type_arr_to_nested, type_default}); } /// TODO More checks. 
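A recurring mechanical change across the files above (FunctionsComparison.h, FunctionsRound.h, arrayIndex.h, arrayResize.cpp, if.cpp, ifNull.cpp, neighbor.cpp, transform.cpp) is spelling out DataTypes{...} at getLeastSupertype call sites. Presumably the function gained a template or overload that can no longer deduce its parameter from a bare braced list; the underlying C++ rule in self-contained form (hypothetical names, std::string standing in for the real type):

```cpp
#include <iostream>
#include <string>
#include <vector>

using DataTypePtr = std::string;            // stand-in for the real pointer type
using DataTypes = std::vector<DataTypePtr>;

/// A template taking "any container of types" cannot deduce Container from {a, b}.
template <typename Container>
DataTypePtr getLeastSupertype(const Container & types) { return types.front(); }

int main()
{
    DataTypePtr a = "UInt8", b = "Int32";
    // getLeastSupertype({a, b});                        // error: cannot deduce Container
    DataTypePtr super = getLeastSupertype(DataTypes{a, b}); // OK: argument has a concrete type
    std::cout << super << '\n';
}
```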
diff --git a/src/Functions/ztest.cpp b/src/Functions/ztest.cpp new file mode 100644 index 00000000000..aa83b30e020 --- /dev/null +++ b/src/Functions/ztest.cpp @@ -0,0 +1,231 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + + namespace ErrorCodes + { + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int BAD_ARGUMENTS; + } + + + class FunctionTwoSampleProportionsZTest : public IFunction + { + public: + static constexpr auto POOLED = "pooled"; + static constexpr auto UNPOOLED = "unpooled"; + + static constexpr auto name = "proportionsZTest"; + + static FunctionPtr create(ContextPtr) + { + return std::make_shared(); + } + + String getName() const override + { + return name; + } + + size_t getNumberOfArguments() const override { return 6; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {5}; } + + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForConstants() const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + static DataTypePtr getReturnType() + { + auto float_data_type = std::make_shared>(); + DataTypes types(4, float_data_type); + + Strings names + { + "z_statistic", + "p_value", + "confidence_interval_low", + "confidence_interval_high" + }; + + return std::make_shared( + std::move(types), + std::move(names) + ); + } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + for (size_t i = 0; i < 4; ++i) + { + if (!isUnsignedInteger(arguments[i].type)) + { + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "The {}th argument of function {} must be an unsigned integer.", i + 1, getName()); + } + } + + if (!isFloat(arguments[4].type)) + { + throw Exception{ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "The fifth argument {} of function {} should be a float.", arguments[4].type->getName(), getName()}; + } + + /// There is an additional check for constancy in executeImpl + if (!isString(arguments[5].type) || !arguments[5].column) + { + throw Exception{ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "The sixth argument {} of function {} should be a constant string", arguments[5].type->getName(), getName()}; + } + + return getReturnType(); + } + + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & const_arguments, const DataTypePtr &, size_t input_rows_count) const override + { + auto arguments = const_arguments; + /// Only the last argument has to be constant + for (size_t i = 0; i < 5; ++i) + arguments[i].column = arguments[i].column->convertToFullColumnIfConst(); + + static const auto uint64_data_type = std::make_shared>(); + + auto column_successes_x = castColumnAccurate(arguments[0], uint64_data_type); + const auto & data_successes_x = checkAndGetColumn>(column_successes_x.get())->getData(); + + auto column_successes_y = castColumnAccurate(arguments[1], uint64_data_type); + const auto & data_successes_y = checkAndGetColumn>(column_successes_y.get())->getData(); + + auto column_trials_x = castColumnAccurate(arguments[2], uint64_data_type); + const auto & data_trials_x = checkAndGetColumn>(column_trials_x.get())->getData(); + + auto column_trials_y = castColumnAccurate(arguments[3], uint64_data_type); + const auto & data_trials_y = checkAndGetColumn>(column_trials_y.get())->getData(); + + static const auto float64_data_type = 
std::make_shared>(); + + auto column_confidence_level = castColumnAccurate(arguments[4], float64_data_type); + const auto & data_confidence_level = checkAndGetColumn>(column_confidence_level.get())->getData(); + + String usevar = checkAndGetColumnConst(arguments[5].column.get())->getValue(); + + if (usevar != UNPOOLED && usevar != POOLED) + throw Exception{ErrorCodes::BAD_ARGUMENTS, + "The sixth argument {} of function {} must be equal to `pooled` or `unpooled`", arguments[5].type->getName(), getName()}; + + const bool is_unpooled = (usevar == UNPOOLED); + + auto res_z_statistic = ColumnFloat64::create(); + auto & data_z_statistic = res_z_statistic->getData(); + data_z_statistic.reserve(input_rows_count); + + auto res_p_value = ColumnFloat64::create(); + auto & data_p_value = res_p_value->getData(); + data_p_value.reserve(input_rows_count); + + auto res_ci_lower = ColumnFloat64::create(); + auto & data_ci_lower = res_ci_lower->getData(); + data_ci_lower.reserve(input_rows_count); + + auto res_ci_upper = ColumnFloat64::create(); + auto & data_ci_upper = res_ci_upper->getData(); + data_ci_upper.reserve(input_rows_count); + + auto insert_values_into_result = [&data_z_statistic, &data_p_value, &data_ci_lower, &data_ci_upper](Float64 z_stat, Float64 p_value, Float64 lower, Float64 upper) + { + data_z_statistic.emplace_back(z_stat); + data_p_value.emplace_back(p_value); + data_ci_lower.emplace_back(lower); + data_ci_upper.emplace_back(upper); + }; + + static constexpr Float64 nan = std::numeric_limits::quiet_NaN(); + + boost::math::normal_distribution<> nd(0.0, 1.0); + + for (size_t row_num = 0; row_num < input_rows_count; ++row_num) + { + const UInt64 successes_x = data_successes_x[row_num]; + const UInt64 successes_y = data_successes_y[row_num]; + const UInt64 trials_x = data_trials_x[row_num]; + const UInt64 trials_y = data_trials_y[row_num]; + const Float64 confidence_level = data_confidence_level[row_num]; + + const Float64 props_x = static_cast(successes_x) / trials_x; + const Float64 props_y = static_cast(successes_y) / trials_y; + const Float64 diff = props_x - props_y; + const UInt64 trials_total = trials_x + trials_y; + + if (successes_x == 0 || successes_y == 0 + || successes_x > trials_x || successes_y > trials_y + || trials_total == 0 + || !std::isfinite(confidence_level) || confidence_level < 0.0 || confidence_level > 1.0) + { + insert_values_into_result(nan, nan, nan, nan); + continue; + } + + Float64 se = std::sqrt(props_x * (1.0 - props_x) / trials_x + props_y * (1.0 - props_y) / trials_y); + + /// z-statistic + /// z = \frac{ \bar{p_{1}} - \bar{p_{2}} }{ \sqrt{ \frac{ \bar{p_{1}} \left ( 1 - \bar{p_{1}} \right ) }{ n_{1} } + \frac{ \bar{p_{2}} \left ( 1 - \bar{p_{2}} \right ) }{ n_{2} } } } + Float64 zstat; + if (is_unpooled) + { + zstat = (props_x - props_y) / se; + } + else + { + UInt64 successes_total = successes_x + successes_y; + Float64 p_pooled = static_cast(successes_total) / trials_total; + Float64 trials_fact = 1.0 / trials_x + 1.0 / trials_y; + zstat = diff / std::sqrt(p_pooled * (1.0 - p_pooled) * trials_fact); + } + + if (!std::isfinite(zstat)) + { + insert_values_into_result(nan, nan, nan, nan); + continue; + } + + // p-value + Float64 pvalue = 0; + Float64 one_side = 1 - boost::math::cdf(nd, std::abs(zstat)); + pvalue = one_side * 2; + + // Confidence intervals + Float64 d = props_x - props_y; + Float64 z = -boost::math::quantile(nd, (1.0 - confidence_level) / 2.0); + Float64 dist = z * se; + Float64 ci_low = d - dist; + Float64 ci_high = d + dist; + + insert_values_into_result(zstat, pvalue, ci_low, ci_high); + } + + return ColumnTuple::create(Columns{std::move(res_z_statistic), std::move(res_p_value), std::move(res_ci_lower), std::move(res_ci_upper)}); + } + }; + + + void registerFunctionZTest(FunctionFactory & factory) + { + factory.registerFunction(); + } + +}
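To make the arithmetic concrete: a self-contained sketch of the pooled two-proportion z-test in plain C++. The standard library has no normal quantile (boost::math supplies it in the real code), so the 95% quantile is hard-coded here; the two-sided p-value uses the identity P(|Z| > |z|) = erfc(|z| / sqrt(2)).

```cpp
#include <cmath>
#include <iostream>

struct ZTestResult { double z, p_value, ci_low, ci_high; };

/// Pooled two-proportion z-test, mirroring the formulas in ztest.cpp.
ZTestResult proportionsZTestPooled(double successes_x, double trials_x,
                                   double successes_y, double trials_y)
{
    double p1 = successes_x / trials_x;
    double p2 = successes_y / trials_y;
    double diff = p1 - p2;

    /// Pooled estimate: all successes over all trials.
    double p_pooled = (successes_x + successes_y) / (trials_x + trials_y);
    double z = diff / std::sqrt(p_pooled * (1 - p_pooled) * (1 / trials_x + 1 / trials_y));

    /// Two-sided p-value from the standard normal tail.
    double p_value = std::erfc(std::abs(z) / std::sqrt(2.0));

    /// The confidence interval uses the unpooled standard error, as above.
    double se = std::sqrt(p1 * (1 - p1) / trials_x + p2 * (1 - p2) / trials_y);
    double q = 1.959963985;   /// ~ Phi^{-1}(0.975), i.e. confidence level 0.95
    return {z, p_value, diff - q * se, diff + q * se};
}

int main()
{
    auto r = proportionsZTestPooled(10, 11, 7, 13);
    std::cout << r.z << ' ' << r.p_value << ' ' << r.ci_low << ' ' << r.ci_high << '\n';
}
```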
diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index a6125818155..e086f16be54 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -248,7 +248,7 @@ void readString(String & s, ReadBuffer & buf) } template void readStringInto>(PaddedPODArray & s, ReadBuffer & buf); - +template void readStringInto(String & s, ReadBuffer & buf); template void readStringUntilEOFInto(Vector & s, ReadBuffer & buf) @@ -580,6 +580,7 @@ void readQuotedStringWithSQLStyle(String & s, ReadBuffer & buf) template void readQuotedStringInto(PaddedPODArray & s, ReadBuffer & buf); +template void readQuotedStringInto(String & s, ReadBuffer & buf); template void readDoubleQuotedStringInto(NullOutput & s, ReadBuffer & buf); void readDoubleQuotedString(String & s, ReadBuffer & buf) @@ -782,6 +783,68 @@ template bool readJSONStringInto, bool>(PaddedPODArray(NullOutput & s, ReadBuffer & buf); template void readJSONStringInto(String & s, ReadBuffer & buf); +template +ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf) +{ + static constexpr bool throw_exception = std::is_same_v; + + auto error = [](const char * message [[maybe_unused]], int code [[maybe_unused]]) + { + if constexpr (throw_exception) + throw ParsingException(message, code); + return ReturnType(false); + }; + + if (buf.eof() || *buf.position() != '{') + return error("JSON should start with an opening curly bracket", ErrorCodes::INCORRECT_DATA); + + s.push_back(*buf.position()); + ++buf.position(); + + Int64 balance = 1; + bool quotes = false; + + while (!buf.eof()) + { + char * next_pos = find_first_symbols<'\\', '{', '}', '"'>(buf.position(), buf.buffer().end()); + appendToStringOrVector(s, buf, next_pos); + buf.position() = next_pos; + + if (!buf.hasPendingData()) + continue; + + s.push_back(*buf.position()); + + if (*buf.position() == '\\') + { + ++buf.position(); + if (!buf.eof()) + { + s.push_back(*buf.position()); + ++buf.position(); + } + + continue; + } + + if (*buf.position() == '"') + quotes = !quotes; + else if (!quotes) // can only be '{' or '}' + balance += *buf.position() == '{' ? 1 : -1; + + ++buf.position(); + + if (balance == 0) + return ReturnType(true); + + if (balance < 0) + break; + } + + return error("JSON should have an equal number of opening and closing brackets", ErrorCodes::INCORRECT_DATA); +} + +template void readJSONObjectPossiblyInvalid(String & s, ReadBuffer & buf); template ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf)
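The scanner above is easy to misread because the escape check runs before the quote toggle. The same brace-balancing control flow in self-contained form (plain C++ over a std::string; the real function streams from a ReadBuffer and copies what it consumes):

```cpp
#include <iostream>
#include <string>

/// Returns true iff `in` starts with '{' and contains a balanced {...} block;
/// braces inside double-quoted strings are ignored, as are \-escaped characters.
/// Mirrors readJSONObjectPossiblyInvalid: the block may still be invalid JSON.
bool hasBalancedJsonBraces(const std::string & in)
{
    if (in.empty() || in.front() != '{')
        return false;

    long balance = 0;
    bool in_quotes = false;
    for (size_t i = 0; i < in.size(); ++i)
    {
        char c = in[i];
        if (c == '\\')              /// skip the escaped character
        {
            ++i;
            continue;
        }
        if (c == '"')
            in_quotes = !in_quotes;
        else if (!in_quotes && (c == '{' || c == '}'))
            balance += (c == '{') ? 1 : -1;

        if (balance == 0)
            return true;            /// block closed; anything may follow
        if (balance < 0)
            return false;
    }
    return false;                   /// input ended before the block closed
}

int main()
{
    std::cout << hasBalancedJsonBraces(R"({"a": "}", "b": {}})") << '\n';  // 1
    std::cout << hasBalancedJsonBraces(R"({"a": 1)") << '\n';              // 0
}
```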
diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index fd2c4218aef..9396e1d32f7 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -601,6 +601,12 @@ bool tryReadJSONStringInto(Vector & s, ReadBuffer & buf) return readJSONStringInto(s, buf); } +/// Reads a chunk of data between {} in such a way +/// that it forms a balanced sequence of {}. +/// So it may form a JSON object, but it can be incorrect. +template +ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf); + template void readStringUntilWhitespaceInto(Vector & s, ReadBuffer & buf); @@ -966,8 +972,8 @@ inline void readDateTimeText(LocalDateTime & datetime, ReadBuffer & buf) /// Generic methods to read value in native binary format. template -inline std::enable_if_t, void> -readBinary(T & x, ReadBuffer & buf) { readPODBinary(x, buf); } +requires is_arithmetic_v +inline void readBinary(T & x, ReadBuffer & buf) { readPODBinary(x, buf); } inline void readBinary(String & x, ReadBuffer & buf) { readStringBinary(x, buf); } inline void readBinary(Int128 & x, ReadBuffer & buf) { readPODBinary(x, buf); } @@ -982,8 +988,8 @@ inline void readBinary(LocalDate & x, ReadBuffer & buf) { readPODBinary(x, buf); template -inline std::enable_if_t && (sizeof(T) <= 8), void> -readBinaryBigEndian(T & x, ReadBuffer & buf) /// Assuming little endian architecture. +requires is_arithmetic_v && (sizeof(T) <= 8) +inline void readBinaryBigEndian(T & x, ReadBuffer & buf) /// Assuming little endian architecture. { readPODBinary(x, buf); @@ -998,8 +1004,8 @@ readBinaryBigEndian(T & x, ReadBuffer & buf) /// Assuming little endian archi } template -inline std::enable_if_t, void> -readBinaryBigEndian(T & x, ReadBuffer & buf) /// Assuming little endian architecture. +requires is_big_int_v +inline void readBinaryBigEndian(T & x, ReadBuffer & buf) /// Assuming little endian architecture. { for (size_t i = 0; i != std::size(x.items); ++i) { @@ -1034,8 +1040,8 @@ inline void readText(UUID & x, ReadBuffer & buf) { readUUIDText(x, buf); } /// Generic methods to read value in text format, /// possibly in single quotes (only for data types that use quotes in VALUES format of INSERT statement in SQL). template -inline std::enable_if_t, void> -readQuoted(T & x, ReadBuffer & buf) { readText(x, buf); } +requires is_arithmetic_v +inline void readQuoted(T & x, ReadBuffer & buf) { readText(x, buf); } inline void readQuoted(String & x, ReadBuffer & buf) { readQuotedString(x, buf); } @@ -1063,8 +1069,8 @@ inline void readQuoted(UUID & x, ReadBuffer & buf) /// Same as above, but in double quotes. 
template -inline std::enable_if_t, void> -readDoubleQuoted(T & x, ReadBuffer & buf) { readText(x, buf); } +requires is_arithmetic_v +inline void readDoubleQuoted(T & x, ReadBuffer & buf) { readText(x, buf); } inline void readDoubleQuoted(String & x, ReadBuffer & buf) { readDoubleQuotedString(x, buf); } @@ -1101,7 +1107,8 @@ inline void readCSVSimple(T & x, ReadBuffer & buf) } template -inline std::enable_if_t, void> readCSV(T & x, ReadBuffer & buf) +requires is_arithmetic_v +inline void readCSV(T & x, ReadBuffer & buf) { readCSVSimple(x, buf); } diff --git a/src/IO/VarInt.h b/src/IO/VarInt.h index 29c8a60c935..3161ca6d8a8 100644 --- a/src/IO/VarInt.h +++ b/src/IO/VarInt.h @@ -108,8 +108,8 @@ inline void readVarInt(Int16 & x, ReadBuffer & istr) } template -inline std::enable_if_t, void> -readVarUInt(T & x, ReadBuffer & istr) +requires (!std::is_same_v) +inline void readVarUInt(T & x, ReadBuffer & istr) { UInt64 tmp; readVarUInt(tmp, istr); diff --git a/src/Interpreters/Access/InterpreterCreateUserQuery.cpp b/src/Interpreters/Access/InterpreterCreateUserQuery.cpp index cad451f8ef5..627b4dbac17 100644 --- a/src/Interpreters/Access/InterpreterCreateUserQuery.cpp +++ b/src/Interpreters/Access/InterpreterCreateUserQuery.cpp @@ -16,7 +16,7 @@ namespace DB { namespace ErrorCodes { - extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int BAD_ARGUMENTS; } namespace @@ -27,7 +27,9 @@ namespace const std::shared_ptr & override_name, const std::optional & override_default_roles, const std::optional & override_settings, - const std::optional & override_grantees, bool allow_no_password, bool allow_plaintext_password) + const std::optional & override_grantees, + bool allow_no_password, + bool allow_plaintext_password) { if (override_name) user.setName(override_name->toString()); @@ -35,15 +37,23 @@ namespace user.setName(query.new_name); else if (query.names->size() == 1) user.setName(query.names->front()->toString()); + if (query.auth_data) - { user.auth_data = *query.auth_data; - //User and query IDENTIFIED WITH AUTHTYPE PLAINTEXT and NO_PASSWORD should not be allowed if allow_plaintext_and_no_password is unset. - if ((query.auth_data->getType() == AuthenticationType::PLAINTEXT_PASSWORD && !allow_plaintext_password) || (query.auth_data->getType() == AuthenticationType::NO_PASSWORD && !allow_no_password)) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "User is not allowed to ALTER/CREATE USERS with type "+ toString(query.auth_data->getType())+". Please configure User with authtype" - + "to SHA256_PASSWORD,DOUBLE_SHA1_PASSWORD OR enable setting allow_plaintext_and_no_password in server configuration to configure user with " + toString(query.auth_data->getType()) +" Auth_type." 
- + "It is not recommended to use " + toString(query.auth_data->getType()) + "."); + + if (query.auth_data || !query.alter) + { + auto auth_type = user.auth_data.getType(); + if (((auth_type == AuthenticationType::NO_PASSWORD) && !allow_no_password) || + ((auth_type == AuthenticationType::PLAINTEXT_PASSWORD) && !allow_plaintext_password)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Authentication type {} is not allowed, check the setting allow_{} in the server configuration", + toString(auth_type), + AuthenticationTypeInfo::get(auth_type).name); + } } + if (override_name && !override_name->host_pattern.empty()) { user.allowed_client_hosts = AllowedClientHosts{}; @@ -91,8 +101,8 @@ BlockIO InterpreterCreateUserQuery::execute() auto & access_control = getContext()->getAccessControl(); auto access = getContext()->getAccess(); access->checkAccess(query.alter ? AccessType::ALTER_USER : AccessType::CREATE_USER); - bool allow_plaintext_password = access_control.isPlaintextPasswordAllowed(); - bool allow_no_password = access_control.isNoPasswordAllowed(); + bool no_password_allowed = access_control.isNoPasswordAllowed(); + bool plaintext_password_allowed = access_control.isPlaintextPasswordAllowed(); std::optional default_roles_from_query; if (query.default_roles) @@ -119,7 +129,7 @@ BlockIO InterpreterCreateUserQuery::execute() auto update_func = [&](const AccessEntityPtr & entity) -> AccessEntityPtr { auto updated_user = typeid_cast>(entity->clone()); - updateUserFromQueryImpl(*updated_user, query, {}, default_roles_from_query, settings_from_query, grantees_from_query, allow_no_password, allow_plaintext_password); + updateUserFromQueryImpl(*updated_user, query, {}, default_roles_from_query, settings_from_query, grantees_from_query, no_password_allowed, plaintext_password_allowed); return updated_user; }; @@ -138,7 +148,7 @@ BlockIO InterpreterCreateUserQuery::execute() for (const auto & name : *query.names) { auto new_user = std::make_shared(); - updateUserFromQueryImpl(*new_user, query, name, default_roles_from_query, settings_from_query, RolesOrUsersSet::AllTag{}, allow_no_password, allow_plaintext_password); + updateUserFromQueryImpl(*new_user, query, name, default_roles_from_query, settings_from_query, RolesOrUsersSet::AllTag{}, no_password_allowed, plaintext_password_allowed); new_users.emplace_back(std::move(new_user)); } diff --git a/src/Interpreters/Access/InterpreterCreateUserQuery.h b/src/Interpreters/Access/InterpreterCreateUserQuery.h index 42d911c712b..372066cfd5e 100644 --- a/src/Interpreters/Access/InterpreterCreateUserQuery.h +++ b/src/Interpreters/Access/InterpreterCreateUserQuery.h @@ -17,7 +17,7 @@ public: BlockIO execute() override; - static void updateUserFromQuery(User & user, const ASTCreateUserQuery & query, bool allow_no_password=true, bool allow_plaintext_password=true); + static void updateUserFromQuery(User & user, const ASTCreateUserQuery & query, bool allow_no_password, bool allow_plaintext_password); private: ASTPtr query_ptr; diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index 99583c41b64..fb9752ae391 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -81,7 +81,10 @@ static Block createBlockFromCollection(const Collection & collection, const Data size_t columns_num = types.size(); MutableColumns columns(columns_num); for (size_t i = 0; i < columns_num; ++i) + { columns[i] = types[i]->createColumn(); + columns[i]->reserve(collection.size()); + } Row tuple_values; for (const 
auto & value : collection) diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp index 5d35525aee9..2b92fab15de 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -35,9 +36,13 @@ namespace ClusterProxy SelectStreamFactory::SelectStreamFactory( const Block & header_, + const ColumnsDescriptionByShardNum & objects_by_shard_, + const StorageSnapshotPtr & storage_snapshot_, QueryProcessingStage::Enum processed_stage_) - : header(header_) - , processed_stage{processed_stage_} + : header(header_), + objects_by_shard(objects_by_shard_), + storage_snapshot(storage_snapshot_), + processed_stage(processed_stage_) { } @@ -100,6 +105,10 @@ void SelectStreamFactory::createForShard( Shards & remote_shards, UInt32 shard_count) { + auto it = objects_by_shard.find(shard_info.shard_num); + if (it != objects_by_shard.end()) + replaceMissedSubcolumnsByConstants(storage_snapshot->object_columns, it->second, query_ast); + auto emplace_local_stream = [&]() { local_plans.emplace_back(createLocalPlan(query_ast, header, context, processed_stage, shard_info.shard_num, shard_count)); diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.h b/src/Interpreters/ClusterProxy/SelectStreamFactory.h index 55e81feee33..731bf3acd10 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.h +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -11,11 +12,15 @@ namespace DB namespace ClusterProxy { +using ColumnsDescriptionByShardNum = std::unordered_map; + class SelectStreamFactory final : public IStreamFactory { public: SelectStreamFactory( const Block & header_, + const ColumnsDescriptionByShardNum & objects_by_shard_, + const StorageSnapshotPtr & storage_snapshot_, QueryProcessingStage::Enum processed_stage_); void createForShard( @@ -30,6 +35,8 @@ public: private: const Block header; + const ColumnsDescriptionByShardNum objects_by_shard; + const StorageSnapshotPtr storage_snapshot; QueryProcessingStage::Enum processed_stage; }; diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index 884b8445732..3f1823fb171 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index ed996430996..f7dbd1c8b65 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -31,6 +31,7 @@ #include #include +#include #include #include @@ -52,6 +53,7 @@ #include #include #include +#include #include #include @@ -94,6 +96,7 @@ namespace ErrorCodes extern const int PATH_ACCESS_DENIED; extern const int NOT_IMPLEMENTED; extern const int ENGINE_REQUIRED; + extern const int UNKNOWN_STORAGE; } namespace fs = std::filesystem; @@ -731,11 +734,26 @@ void InterpreterCreateQuery::validateTableStructure(const ASTCreateQuery & creat { String message = "Cannot create table with column '" + name_and_type_pair.name + "' which type is '" + type + "' because experimental geo types are not allowed. 
" - + "Set setting allow_experimental_geo_types = 1 in order to allow it."; + + "Set setting allow_experimental_geo_types = 1 in order to allow it"; throw Exception(message, ErrorCodes::ILLEGAL_COLUMN); } } } + + if (!create.attach && !settings.allow_experimental_object_type) + { + for (const auto & [name, type] : properties.columns.getAllPhysical()) + { + if (isObject(type)) + { + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "Cannot create table with column '{}' which type is '{}' " + "because experimental Object type is not allowed. " + "Set setting allow_experimental_object_type = 1 in order to allow it", + name, type->getName()); + } + } + } } String InterpreterCreateQuery::getTableEngineName(DefaultTableEngine default_table_engine) @@ -1195,6 +1213,14 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, addColumnsDescriptionToCreateQueryIfNecessary(query_ptr->as(), res); } + if (!create.attach && getContext()->getSettingsRef().database_replicated_allow_only_replicated_engine) + { + bool is_replicated_storage = typeid_cast(res.get()) != nullptr; + if (!is_replicated_storage && res->storesDataOnDisk() && database && database->getEngineName() == "Replicated") + throw Exception(ErrorCodes::UNKNOWN_STORAGE, + "Only table with Replicated engine or tables which does not store data on disk are allowed in Replicated database"); + } + if (from_path && !res->storesDataOnDisk()) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ATTACH ... FROM ... query is not supported for {} table engine, " @@ -1220,6 +1246,14 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, /// we can safely destroy the object without a call to "shutdown", because there is guarantee /// that no background threads/similar resources remain after exception from "startup". 
+ if (!res->supportsDynamicSubcolumns() && hasObjectColumns(res->getInMemoryMetadataPtr()->getColumns())) + { + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "Cannot create table with column of type Object, " + "because storage {} doesn't support dynamic subcolumns", + res->getName()); + } + res->startup(); return true; } diff --git a/src/Interpreters/InterpreterDescribeQuery.cpp b/src/Interpreters/InterpreterDescribeQuery.cpp index 36ea2949b6a..da5fcedd469 100644 --- a/src/Interpreters/InterpreterDescribeQuery.cpp +++ b/src/Interpreters/InterpreterDescribeQuery.cpp @@ -64,9 +64,12 @@ Block InterpreterDescribeQuery::getSampleBlock(bool include_subcolumns) BlockIO InterpreterDescribeQuery::execute() { ColumnsDescription columns; + StorageSnapshotPtr storage_snapshot; const auto & ast = query_ptr->as(); const auto & table_expression = ast.table_expression->as(); + const auto & settings = getContext()->getSettingsRef(); + if (table_expression.subquery) { auto names_and_types = InterpreterSelectWithUnionQuery::getSampleBlock( @@ -83,19 +86,27 @@ BlockIO InterpreterDescribeQuery::execute() auto table_id = getContext()->resolveStorageID(table_expression.database_and_table_name); getContext()->checkAccess(AccessType::SHOW_COLUMNS, table_id); auto table = DatabaseCatalog::instance().getTable(table_id, getContext()); - auto table_lock = table->lockForShare(getContext()->getInitialQueryId(), getContext()->getSettingsRef().lock_acquire_timeout); + auto table_lock = table->lockForShare(getContext()->getInitialQueryId(), settings.lock_acquire_timeout); + auto metadata_snapshot = table->getInMemoryMetadataPtr(); + storage_snapshot = table->getStorageSnapshot(metadata_snapshot); columns = metadata_snapshot->getColumns(); } - bool include_subcolumns = getContext()->getSettingsRef().describe_include_subcolumns; + bool extend_object_types = settings.describe_extend_object_types && storage_snapshot; + bool include_subcolumns = settings.describe_include_subcolumns; + Block sample_block = getSampleBlock(include_subcolumns); MutableColumns res_columns = sample_block.cloneEmptyColumns(); for (const auto & column : columns) { res_columns[0]->insert(column.name); - res_columns[1]->insert(column.type->getName()); + + if (extend_object_types) + res_columns[1]->insert(storage_snapshot->getConcreteType(column.name)->getName()); + else + res_columns[1]->insert(column.type->getName()); if (column.default_desc.expression) { @@ -128,6 +139,8 @@ BlockIO InterpreterDescribeQuery::execute() { for (const auto & column : columns) { + auto type = extend_object_types ? 
storage_snapshot->getConcreteType(column.name) : column.type; + IDataType::forEachSubcolumn([&](const auto & path, const auto & name, const auto & data) { res_columns[0]->insert(Nested::concatenateName(column.name, name)); @@ -150,7 +163,7 @@ BlockIO InterpreterDescribeQuery::execute() res_columns[6]->insertDefault(); res_columns[7]->insert(1u); - }, {column.type->getDefaultSerialization(), column.type, nullptr, nullptr}); + }, { type->getDefaultSerialization(), type, nullptr, nullptr }); } } diff --git a/src/Interpreters/InterpreterOptimizeQuery.cpp b/src/Interpreters/InterpreterOptimizeQuery.cpp index f9a701a0a77..d4fe7604ced 100644 --- a/src/Interpreters/InterpreterOptimizeQuery.cpp +++ b/src/Interpreters/InterpreterOptimizeQuery.cpp @@ -32,6 +32,7 @@ BlockIO InterpreterOptimizeQuery::execute() auto table_id = getContext()->resolveStorageID(ast, Context::ResolveOrdinary); StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext()); auto metadata_snapshot = table->getInMemoryMetadataPtr(); + auto storage_snapshot = table->getStorageSnapshot(metadata_snapshot); // Empty list of names means we deduplicate by all columns, but user can explicitly state which columns to use. Names column_names; @@ -46,7 +47,7 @@ BlockIO InterpreterOptimizeQuery::execute() column_names.emplace_back(col->getColumnName()); } - metadata_snapshot->check(column_names, NamesAndTypesList{}, table_id); + storage_snapshot->check(column_names); Names required_columns; { required_columns = metadata_snapshot->getColumnsRequiredForSortingKey(); diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index f2fc17fbf9a..ce0929f9c6e 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -138,7 +138,7 @@ String InterpreterSelectQuery::generateFilterActions(ActionsDAGPtr & actions, co table_expr->children.push_back(table_expr->database_and_table_name); /// Using separate expression analyzer to prevent any possible alias injection - auto syntax_result = TreeRewriter(context).analyzeSelect(query_ast, TreeRewriterResult({}, storage, metadata_snapshot)); + auto syntax_result = TreeRewriter(context).analyzeSelect(query_ast, TreeRewriterResult({}, storage, storage_snapshot)); SelectQueryExpressionAnalyzer analyzer(query_ast, syntax_result, context, metadata_snapshot); actions = analyzer.simpleSelectActions(); @@ -328,6 +328,8 @@ InterpreterSelectQuery::InterpreterSelectQuery( table_id = storage->getStorageID(); if (!metadata_snapshot) metadata_snapshot = storage->getInMemoryMetadataPtr(); + + storage_snapshot = storage->getStorageSnapshotForQuery(metadata_snapshot, query_ptr); } if (has_input || !joined_tables.resolveTables()) @@ -395,7 +397,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( syntax_analyzer_result = TreeRewriter(context).analyzeSelect( query_ptr, - TreeRewriterResult(source_header.getNamesAndTypesList(), storage, metadata_snapshot), + TreeRewriterResult(source_header.getNamesAndTypesList(), storage, storage_snapshot), options, joined_tables.tablesWithColumns(), required_result_column_names, table_join); query_info.syntax_analyzer_result = syntax_analyzer_result; @@ -516,7 +518,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( } } - source_header = metadata_snapshot->getSampleBlockForColumns(required_columns, storage->getVirtuals(), storage->getStorageID()); + source_header = storage_snapshot->getSampleBlockForColumns(required_columns); } /// Calculate structure of the result. 
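To make the `describe_extend_object_types` change above concrete, a sketch of the intended behavior (the inferred tuple type is an assumption for illustration, not output from this patch):

``` sql
SET allow_experimental_object_type = 1;
CREATE TABLE t_desc (obj Object('json')) ENGINE = MergeTree ORDER BY tuple();
INSERT INTO t_desc FORMAT JSONEachRow {"obj": {"a": 1, "b": "x"}}

DESCRIBE TABLE t_desc;  -- shows the declared type: obj  Object('json')

-- With the new setting, the concrete type from the storage snapshot is shown instead,
-- e.g. obj  Tuple(a Int8, b String).
DESCRIBE TABLE t_desc SETTINGS describe_extend_object_types = 1;
```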
@@ -582,6 +584,9 @@ InterpreterSelectQuery::InterpreterSelectQuery( analysis_result.required_columns = required_columns; } + if (query_info.projection) + storage_snapshot->addProjection(query_info.projection->desc); + /// Blocks used in expression analysis contains size 1 const columns for constant folding and /// null non-const columns to avoid useless memory allocations. However, a valid block sample /// requires all columns to be of size 0, thus we need to sanitize the block here. @@ -631,10 +636,9 @@ Block InterpreterSelectQuery::getSampleBlockImpl() query_analyzer->makeSetsForIndex(query.where()); query_analyzer->makeSetsForIndex(query.prewhere()); query_info.sets = query_analyzer->getPreparedSets(); - } - if (storage && !options.only_analyze) - from_stage = storage->getQueryProcessingStage(context, options.to_stage, metadata_snapshot, query_info); + from_stage = storage->getQueryProcessingStage(context, options.to_stage, storage_snapshot, query_info); + } /// Do I need to perform the first part of the pipeline? /// Running on remote servers during distributed processing or if query is not distributed. @@ -1724,7 +1728,7 @@ void InterpreterSelectQuery::addPrewhereAliasActions() } auto syntax_result - = TreeRewriter(context).analyze(required_columns_all_expr, required_columns_after_prewhere, storage, metadata_snapshot); + = TreeRewriter(context).analyze(required_columns_all_expr, required_columns_after_prewhere, storage, storage_snapshot); alias_actions = ExpressionAnalyzer(required_columns_all_expr, syntax_result, context).getActionsDAG(true); /// The set of required columns could be added as a result of adding an action to calculate ALIAS. @@ -2000,7 +2004,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc quota = context->getQuota(); query_info.settings_limit_offset_done = options.settings_limit_offset_done; - storage->read(query_plan, required_columns, metadata_snapshot, query_info, context, processing_stage, max_block_size, max_streams); + storage->read(query_plan, required_columns, storage_snapshot, query_info, context, processing_stage, max_block_size, max_streams); if (context->hasQueryContext() && !options.is_internal) { @@ -2017,11 +2021,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc /// Create step which reads from empty source if storage has no data. if (!query_plan.isInitialized()) { - auto header = query_info.projection - ? query_info.projection->desc->metadata->getSampleBlockForColumns( - query_info.projection->required_columns, storage->getVirtuals(), storage->getStorageID()) - : metadata_snapshot->getSampleBlockForColumns(required_columns, storage->getVirtuals(), storage->getStorageID()); - + auto header = storage_snapshot->getSampleBlockForColumns(required_columns); addEmptySourceToQueryPlan(query_plan, header, query_info, context); } diff --git a/src/Interpreters/InterpreterSelectQuery.h b/src/Interpreters/InterpreterSelectQuery.h index 4298cbbb794..6bb12caff7d 100644 --- a/src/Interpreters/InterpreterSelectQuery.h +++ b/src/Interpreters/InterpreterSelectQuery.h @@ -202,6 +202,7 @@ private: Poco::Logger * log; StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; /// Reuse already built sets for multiple passes of analysis, possibly across interpreters. 
PreparedSets prepared_sets; diff --git a/src/Interpreters/MergeJoin.cpp b/src/Interpreters/MergeJoin.cpp index c51d54c13ff..e1e03e53014 100644 --- a/src/Interpreters/MergeJoin.cpp +++ b/src/Interpreters/MergeJoin.cpp @@ -881,6 +881,7 @@ bool MergeJoin::leftJoin(MergeJoinCursor & left_cursor, const Block & left_block { right_cursor.nextN(range.right_length); right_block_info.skip = right_cursor.position(); + left_cursor.nextN(range.left_length); return false; } } diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 1c7b970e731..5e795c5760a 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -802,7 +802,9 @@ ASTPtr MutationsInterpreter::prepareInterpreterSelectQuery(std::vector & /// e.g. ALTER referencing the same table in scalar subquery bool execute_scalar_subqueries = !dry_run; auto syntax_result = TreeRewriter(context).analyze( - all_asts, all_columns, storage, metadata_snapshot, false, true, execute_scalar_subqueries); + all_asts, all_columns, storage, storage->getStorageSnapshot(metadata_snapshot), + false, true, execute_scalar_subqueries); + if (execute_scalar_subqueries && context->hasQueryContext()) for (const auto & it : syntax_result->getScalars()) context->getQueryContext()->addScalar(it.first, it.second); diff --git a/src/Interpreters/Set.cpp b/src/Interpreters/Set.cpp index 7af3e23d0d4..224b13d2c45 100644 --- a/src/Interpreters/Set.cpp +++ b/src/Interpreters/Set.cpp @@ -165,7 +165,7 @@ void Set::setHeader(const ColumnsWithTypeAndName & header) bool Set::insertFromBlock(const ColumnsWithTypeAndName & columns) { - std::unique_lock lock(rwlock); + std::lock_guard lock(rwlock); if (data.empty()) throw Exception("Method Set::setHeader must be called before Set::insertFromBlock", ErrorCodes::LOGICAL_ERROR); diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 8fda5371f68..7b7ccb689c3 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -551,7 +551,7 @@ void TableJoin::inferJoinKeyCommonType(const LeftNamesAndTypes & left, const Rig try { /// TODO(vdimir): use getMostSubtype if possible - common_type = DB::getLeastSupertype({ltype->second, rtype->second}); + common_type = DB::getLeastSupertype(DataTypes{ltype->second, rtype->second}); } catch (DB::Exception & ex) { diff --git a/src/Interpreters/TableOverrideUtils.cpp b/src/Interpreters/TableOverrideUtils.cpp index 922dd6af25b..58e885380bf 100644 --- a/src/Interpreters/TableOverrideUtils.cpp +++ b/src/Interpreters/TableOverrideUtils.cpp @@ -96,7 +96,7 @@ void TableOverrideAnalyzer::analyze(const StorageInMemoryMetadata & metadata, Re { auto * override_column = column_ast->as(); auto override_type = DataTypeFactory::instance().get(override_column->type); - auto found = metadata.columns.tryGetColumnOrSubcolumn(ColumnsDescription::GetFlags::All, override_column->name); + auto found = metadata.columns.tryGetColumnOrSubcolumn(GetColumnsOptions::All, override_column->name); std::optional override_default_kind; if (!override_column->default_specifier.empty()) override_default_kind = columnDefaultKindFromString(override_column->default_specifier); diff --git a/src/Interpreters/TreeOptimizer.cpp b/src/Interpreters/TreeOptimizer.cpp index 64b25ca9777..23938beffc5 100644 --- a/src/Interpreters/TreeOptimizer.cpp +++ b/src/Interpreters/TreeOptimizer.cpp @@ -445,7 +445,7 @@ void optimizeMonotonousFunctionsInOrderBy(ASTSelectQuery * select_query, Context } } - auto sorting_key_columns 
= result.metadata_snapshot ? result.metadata_snapshot->getSortingKeyColumns() : Names{}; + auto sorting_key_columns = result.storage_snapshot ? result.storage_snapshot->metadata->getSortingKeyColumns() : Names{}; bool is_sorting_key_prefix = true; for (size_t i = 0; i < order_by->children.size(); ++i) @@ -740,9 +740,8 @@ void TreeOptimizer::apply(ASTPtr & query, TreeRewriterResult & result, if (!select_query) throw Exception("Select analyze for not select asts.", ErrorCodes::LOGICAL_ERROR); - if (settings.optimize_functions_to_subcolumns && result.storage - && result.storage->supportsSubcolumns() && result.metadata_snapshot) - optimizeFunctionsToSubcolumns(query, result.metadata_snapshot); + if (settings.optimize_functions_to_subcolumns && result.storage_snapshot && result.storage->supportsSubcolumns()) + optimizeFunctionsToSubcolumns(query, result.storage_snapshot->metadata); /// Move arithmetic operations out of aggregation functions if (settings.optimize_arithmetic_operations_in_aggregate_functions) @@ -755,11 +754,11 @@ void TreeOptimizer::apply(ASTPtr & query, TreeRewriterResult & result, if (converted_to_cnf && settings.optimize_using_constraints) { optimizeWithConstraints(select_query, result.aliases, result.source_columns_set, - tables_with_columns, result.metadata_snapshot, settings.optimize_append_index); + tables_with_columns, result.storage_snapshot->metadata, settings.optimize_append_index); if (settings.optimize_substitute_columns) optimizeSubstituteColumn(select_query, result.aliases, result.source_columns_set, - tables_with_columns, result.metadata_snapshot, result.storage); + tables_with_columns, result.storage_snapshot->metadata, result.storage); } /// GROUP BY injective function elimination. diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index fc3ef681c2c..78e7ed33f8f 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -518,7 +518,7 @@ void getArrayJoinedColumns(ASTPtr & query, TreeRewriterResult & result, const AS bool found = false; for (const auto & column : source_columns) { - auto split = Nested::splitName(column.name); + auto split = Nested::splitName(column.name, /*reverse=*/ true); if (split.first == source_name && !split.second.empty()) { result.array_join_result_to_source[Nested::concatenateName(result_name, split.second)] = column.name; @@ -831,10 +831,10 @@ using RewriteShardNumVisitor = InDepthNodeVisitor; TreeRewriterResult::TreeRewriterResult( const NamesAndTypesList & source_columns_, ConstStoragePtr storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, bool add_special) : storage(storage_) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , source_columns(source_columns_) { collectSourceColumns(add_special); @@ -847,13 +847,12 @@ void TreeRewriterResult::collectSourceColumns(bool add_special) { if (storage) { - const ColumnsDescription & columns = metadata_snapshot->getColumns(); - - NamesAndTypesList columns_from_storage; + auto options = GetColumnsOptions(add_special ? GetColumnsOptions::All : GetColumnsOptions::AllPhysical); + options.withExtendedObjects(); if (storage->supportsSubcolumns()) - columns_from_storage = add_special ? columns.getAllWithSubcolumns() : columns.getAllPhysicalWithSubcolumns(); - else - columns_from_storage = add_special ? 
columns.getAll() : columns.getAllPhysical(); + options.withSubcolumns(); + + auto columns_from_storage = storage_snapshot->getColumns(options); if (source_columns.empty()) source_columns.swap(columns_from_storage); @@ -960,9 +959,9 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select /// If we have no information about columns sizes, choose a column of minimum size of its data type. required.insert(ExpressionActions::getSmallestColumn(source_columns)); } - else if (is_select && metadata_snapshot && !columns_context.has_array_join) + else if (is_select && storage_snapshot && !columns_context.has_array_join) { - const auto & partition_desc = metadata_snapshot->getPartitionKey(); + const auto & partition_desc = storage_snapshot->metadata->getPartitionKey(); if (partition_desc.expression) { auto partition_source_columns = partition_desc.expression->getRequiredColumns(); @@ -1018,7 +1017,7 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select { for (const auto & name_type : storage_virtuals) { - if (name_type.name == "_shard_num" && storage->isVirtualColumn("_shard_num", metadata_snapshot)) + if (name_type.name == "_shard_num" && storage->isVirtualColumn("_shard_num", storage_snapshot->getMetadataForQuery())) { has_virtual_shard_num = true; break; @@ -1190,7 +1189,7 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect( /// rewrite filters for select query, must go after getArrayJoinedColumns bool is_initiator = getContext()->getClientInfo().distributed_depth == 0; - if (settings.optimize_respect_aliases && result.metadata_snapshot && is_initiator) + if (settings.optimize_respect_aliases && result.storage_snapshot && is_initiator) { std::unordered_set excluded_nodes; { @@ -1201,7 +1200,7 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect( excluded_nodes.insert(table_join_ast->using_expression_list.get()); } - bool is_changed = replaceAliasColumnsInQuery(query, result.metadata_snapshot->getColumns(), + bool is_changed = replaceAliasColumnsInQuery(query, result.storage_snapshot->metadata->getColumns(), result.array_join_result_to_source, getContext(), excluded_nodes); /// If query is changed, we need to redo some work to correct name resolution. 
if (is_changed) @@ -1234,7 +1233,7 @@ TreeRewriterResultPtr TreeRewriter::analyze( ASTPtr & query, const NamesAndTypesList & source_columns, ConstStoragePtr storage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, bool allow_aggregations, bool allow_self_aliases, bool execute_scalar_subqueries) const @@ -1244,7 +1243,7 @@ TreeRewriterResultPtr TreeRewriter::analyze( const auto & settings = getContext()->getSettingsRef(); - TreeRewriterResult result(source_columns, storage, metadata_snapshot, false); + TreeRewriterResult result(source_columns, storage, storage_snapshot, false); normalize(query, result.aliases, result.source_columns_set, false, settings, allow_self_aliases); diff --git a/src/Interpreters/TreeRewriter.h b/src/Interpreters/TreeRewriter.h index 45b3a5a00e3..7fbe4e45fb3 100644 --- a/src/Interpreters/TreeRewriter.h +++ b/src/Interpreters/TreeRewriter.h @@ -19,11 +19,13 @@ struct SelectQueryOptions; using Scalars = std::map; struct StorageInMemoryMetadata; using StorageMetadataPtr = std::shared_ptr; +struct StorageSnapshot; +using StorageSnapshotPtr = std::shared_ptr; struct TreeRewriterResult { ConstStoragePtr storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; std::shared_ptr analyzed_join; const ASTTablesInSelectQueryElement * ast_join = nullptr; @@ -80,7 +82,7 @@ struct TreeRewriterResult explicit TreeRewriterResult( const NamesAndTypesList & source_columns_, ConstStoragePtr storage_ = {}, - const StorageMetadataPtr & metadata_snapshot_ = {}, + const StorageSnapshotPtr & storage_snapshot_ = {}, bool add_special = true); void collectSourceColumns(bool add_special); @@ -112,7 +114,7 @@ public: ASTPtr & query, const NamesAndTypesList & source_columns_, ConstStoragePtr storage = {}, - const StorageMetadataPtr & metadata_snapshot = {}, + const StorageSnapshotPtr & storage_snapshot = {}, bool allow_aggregations = false, bool allow_self_aliases = true, bool execute_scalar_subqueries = true) const; diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index 5813b8c3926..7abe8342100 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,7 @@ #include #include #include +#include #include #include @@ -246,6 +248,8 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID } return src; } + + return applyVisitor(FieldVisitorToString(), src); } else if (const DataTypeArray * type_array = typeid_cast(&type)) { @@ -363,6 +367,46 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID return src; } + else if (isObject(type)) + { + const auto * from_type_tuple = typeid_cast(from_type_hint); + if (src.getType() == Field::Types::Tuple && from_type_tuple && from_type_tuple->haveExplicitNames()) + { + const auto & names = from_type_tuple->getElementNames(); + const auto & tuple = src.get(); + + if (names.size() != tuple.size()) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Bad size of tuple in IN or VALUES section (while converting to Object). 
Expected size: {}, actual size: {}", + names.size(), tuple.size()); + + Object object; + for (size_t i = 0; i < names.size(); ++i) + object[names[i]] = tuple[i]; + + return object; + } + + if (src.getType() == Field::Types::Map) + { + Object object; + const auto & map = src.get(); + for (size_t i = 0; i < map.size(); ++i) + { + const auto & map_entry = map[i].get(); + const auto & key = map_entry[0]; + const auto & value = map_entry[1]; + + if (key.getType() != Field::Types::String) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cannot convert from Map with key of type {} to Object", key.getTypeName()); + + object[key.get()] = value; + } + + return object; + } + } /// Conversion from string by parsing. if (src.getType() == Field::Types::String) diff --git a/src/Interpreters/evaluateConstantExpression.cpp b/src/Interpreters/evaluateConstantExpression.cpp index ae304906476..e7c5095b7fb 100644 --- a/src/Interpreters/evaluateConstantExpression.cpp +++ b/src/Interpreters/evaluateConstantExpression.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,9 @@ namespace ErrorCodes std::pair> evaluateConstantExpression(const ASTPtr & node, ContextPtr context) { + if (ASTLiteral * literal = node->as()) + return std::make_pair(literal->value, applyVisitor(FieldToDataType(), literal->value)); + NamesAndTypesList source_columns = {{ "_dummy", std::make_shared() }}; auto ast = node->clone(); ReplaceQueryParameterVisitor param_visitor(context->getQueryParameters()); diff --git a/src/Interpreters/getColumnFromBlock.cpp b/src/Interpreters/getColumnFromBlock.cpp new file mode 100644 index 00000000000..ce6fa2904db --- /dev/null +++ b/src/Interpreters/getColumnFromBlock.cpp @@ -0,0 +1,50 @@ +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_FOUND_COLUMN_IN_BLOCK; +} + +ColumnPtr tryGetColumnFromBlock(const Block & block, const NameAndTypePair & requested_column) +{ + const auto * elem = block.findByName(requested_column.getNameInStorage()); + if (!elem) + return nullptr; + + DataTypePtr elem_type; + ColumnPtr elem_column; + + if (requested_column.isSubcolumn()) + { + auto subcolumn_name = requested_column.getSubcolumnName(); + elem_type = elem->type->tryGetSubcolumnType(subcolumn_name); + elem_column = elem->type->tryGetSubcolumn(subcolumn_name, elem->column); + + if (!elem_type || !elem_column) + return nullptr; + } + else + { + elem_type = elem->type; + elem_column = elem->column; + } + + return castColumn({elem_column, elem_type, ""}, requested_column.type); +} + +ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & requested_column) +{ + auto result_column = tryGetColumnFromBlock(block, requested_column); + if (!result_column) + throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK, + "Not found column or subcolumn {} in block. There are only columns: {}", + requested_column.name, block.dumpNames()); + + return result_column; +} + +} diff --git a/src/Interpreters/getColumnFromBlock.h b/src/Interpreters/getColumnFromBlock.h new file mode 100644 index 00000000000..26500cfdd17 --- /dev/null +++ b/src/Interpreters/getColumnFromBlock.h @@ -0,0 +1,13 @@ +#pragma once +#include + +namespace DB +{ + +/// Helps in-memory storages to extract columns from block. +/// Properly handles cases, when column is a subcolumn and when it is compressed. 
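At the SQL level, the new `convertFieldToType` paths sketched above should allow map and named-tuple literals to be stored into Object columns; a hedged example (assuming the experimental type is enabled):

``` sql
SET allow_experimental_object_type = 1;
CREATE TABLE t_obj (obj Object('json')) ENGINE = MergeTree ORDER BY tuple();

-- A Map literal with String keys is converted entry-by-entry into an Object value.
INSERT INTO t_obj VALUES (map('a', 1, 'b', 2));

-- A Map with a non-String key cannot become an Object and throws TYPE_MISMATCH.
INSERT INTO t_obj VALUES (map(1, 1));
```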
+ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & requested_column);
+
+ColumnPtr tryGetColumnFromBlock(const Block & block, const NameAndTypePair & requested_column);
+
+}
diff --git a/src/Interpreters/getHeaderForProcessingStage.cpp b/src/Interpreters/getHeaderForProcessingStage.cpp
index 31913777902..6f4d7d5c525 100644
--- a/src/Interpreters/getHeaderForProcessingStage.cpp
+++ b/src/Interpreters/getHeaderForProcessingStage.cpp
@@ -82,9 +82,8 @@ bool removeJoin(ASTSelectQuery & select, TreeRewriterResult & rewriter_result, C
 }

 Block getHeaderForProcessingStage(
-    const IStorage & storage,
     const Names & column_names,
-    const StorageMetadataPtr & metadata_snapshot,
+    const StorageSnapshotPtr & storage_snapshot,
     const SelectQueryInfo & query_info,
     ContextPtr context,
     QueryProcessingStage::Enum processed_stage)
@@ -93,7 +92,8 @@ Block getHeaderForProcessingStage(
     {
         case QueryProcessingStage::FetchColumns:
         {
-            Block header = metadata_snapshot->getSampleBlockForColumns(column_names, storage.getVirtuals(), storage.getStorageID());
+            Block header = storage_snapshot->getSampleBlockForColumns(column_names);
+
             if (query_info.prewhere_info)
             {
                 auto & prewhere_info = *query_info.prewhere_info;
@@ -123,7 +123,7 @@
             removeJoin(*query->as<ASTSelectQuery>(), new_rewriter_result, context);

             auto pipe = Pipe(std::make_shared<SourceFromSingleChunk>(
-                metadata_snapshot->getSampleBlockForColumns(column_names, storage.getVirtuals(), storage.getStorageID())));
+                storage_snapshot->getSampleBlockForColumns(column_names)));
             return InterpreterSelectQuery(query, context, std::move(pipe), SelectQueryOptions(processed_stage).analyze()).getSampleBlock();
         }
     }
diff --git a/src/Interpreters/getHeaderForProcessingStage.h b/src/Interpreters/getHeaderForProcessingStage.h
index 54a1126a3df..6ada136030e 100644
--- a/src/Interpreters/getHeaderForProcessingStage.h
+++ b/src/Interpreters/getHeaderForProcessingStage.h
@@ -10,8 +10,8 @@ namespace DB
 {

 class IStorage;
-struct StorageInMemoryMetadata;
-using StorageMetadataPtr = std::shared_ptr<const StorageInMemoryMetadata>;
+struct StorageSnapshot;
+using StorageSnapshotPtr = std::shared_ptr<StorageSnapshot>;
 struct SelectQueryInfo;
 struct TreeRewriterResult;
 class ASTSelectQuery;
@@ -20,9 +20,8 @@ bool hasJoin(const ASTSelectQuery & select);
 bool removeJoin(ASTSelectQuery & select, TreeRewriterResult & rewriter_result, ContextPtr context);

 Block getHeaderForProcessingStage(
-    const IStorage & storage,
     const Names & column_names,
-    const StorageMetadataPtr & metadata_snapshot,
+    const StorageSnapshotPtr & storage_snapshot,
     const SelectQueryInfo & query_info,
     ContextPtr context,
     QueryProcessingStage::Enum processed_stage);
diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp
index 5c87950b821..15dd9229194 100644
--- a/src/Interpreters/inplaceBlockConversions.cpp
+++ b/src/Interpreters/inplaceBlockConversions.cpp
@@ -15,11 +15,20 @@
 #include
 #include
 #include
+#include
+#include
+#include
+#include

 namespace DB
 {

+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
 namespace
 {

@@ -178,4 +187,97 @@ ActionsDAGPtr evaluateMissingDefaults(
     return createExpressions(header, expr_list, save_unneeded_columns, context);
 }

+static bool arrayHasNoElementsRead(const IColumn & column)
+{
+    const auto * column_array = typeid_cast<const ColumnArray *>(&column);
+
+    if (!column_array)
+        return false;
+
+    size_t size = column_array->size();
+    if (!size)
+        return false;
+
+    size_t data_size = column_array->getData().size();
+    if (data_size)
+        return false;
+
+    size_t last_offset =
column_array->getOffsets()[size - 1]; + return last_offset != 0; +} + +void fillMissingColumns( + Columns & res_columns, + size_t num_rows, + const NamesAndTypesList & requested_columns, + StorageMetadataPtr metadata_snapshot) +{ + size_t num_columns = requested_columns.size(); + if (num_columns != res_columns.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Invalid number of columns passed to fillMissingColumns. Expected {}, got {}", + num_columns, res_columns.size()); + + /// For a missing column of a nested data structure we must create not a column of empty + /// arrays, but a column of arrays of correct length. + + /// First, collect offset columns for all arrays in the block. + + std::unordered_map offset_columns; + auto requested_column = requested_columns.begin(); + for (size_t i = 0; i < num_columns; ++i, ++requested_column) + { + if (res_columns[i] == nullptr) + continue; + + if (const auto * array = typeid_cast(res_columns[i].get())) + { + String offsets_name = Nested::extractTableName(requested_column->name); + auto & offsets_column = offset_columns[offsets_name]; + + /// If for some reason multiple offsets columns are present for the same nested data structure, + /// choose the one that is not empty. + if (!offsets_column || offsets_column->empty()) + offsets_column = array->getOffsetsPtr(); + } + } + + /// insert default values only for columns without default expressions + requested_column = requested_columns.begin(); + for (size_t i = 0; i < num_columns; ++i, ++requested_column) + { + const auto & [name, type] = *requested_column; + + if (res_columns[i] && arrayHasNoElementsRead(*res_columns[i])) + res_columns[i] = nullptr; + + if (res_columns[i] == nullptr) + { + if (metadata_snapshot && metadata_snapshot->getColumns().hasDefault(name)) + continue; + + String offsets_name = Nested::extractTableName(name); + auto offset_it = offset_columns.find(offsets_name); + const auto * array_type = typeid_cast(type.get()); + if (offset_it != offset_columns.end() && array_type) + { + const auto & nested_type = array_type->getNestedType(); + ColumnPtr offsets_column = offset_it->second; + size_t nested_rows = typeid_cast(*offsets_column).getData().back(); + + ColumnPtr nested_column = + nested_type->createColumnConstWithDefaultValue(nested_rows)->convertToFullColumnIfConst(); + + res_columns[i] = ColumnArray::create(nested_column, offsets_column); + } + else + { + /// We must turn a constant column into a full column because the interpreter could infer + /// that it is constant everywhere but in some blocks (from other parts) it can be a full column. 
+ res_columns[i] = type->createColumnConstWithDefaultValue(num_rows)->convertToFullColumnIfConst(); + } + } + } +} + } diff --git a/src/Interpreters/inplaceBlockConversions.h b/src/Interpreters/inplaceBlockConversions.h index cc8261693f9..b3113ddfa5c 100644 --- a/src/Interpreters/inplaceBlockConversions.h +++ b/src/Interpreters/inplaceBlockConversions.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -14,6 +15,13 @@ class Block; class NamesAndTypesList; class ColumnsDescription; +class IColumn; +using ColumnPtr = COW::Ptr; +using Columns = std::vector; + +struct StorageInMemoryMetadata; +using StorageMetadataPtr = std::shared_ptr; + class ActionsDAG; using ActionsDAGPtr = std::shared_ptr; @@ -31,4 +39,10 @@ ActionsDAGPtr evaluateMissingDefaults( /// Tries to convert columns in block to required_columns void performRequiredConversions(Block & block, const NamesAndTypesList & required_columns, ContextPtr context); +void fillMissingColumns( + Columns & res_columns, + size_t num_rows, + const NamesAndTypesList & requested_columns, + StorageMetadataPtr metadata_snapshot); + } diff --git a/src/Processors/Formats/IRowInputFormat.cpp b/src/Processors/Formats/IRowInputFormat.cpp index c4857326e6e..645100dad19 100644 --- a/src/Processors/Formats/IRowInputFormat.cpp +++ b/src/Processors/Formats/IRowInputFormat.cpp @@ -1,4 +1,5 @@ #include +#include #include // toString #include @@ -199,6 +200,7 @@ Chunk IRowInputFormat::generate() return {}; } + finalizeObjectColumns(columns); Chunk chunk(std::move(columns), num_rows); //chunk.setChunkInfo(std::move(chunk_missing_values)); return chunk; diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp index 56ba975dea1..914ec27fc46 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp @@ -14,30 +14,25 @@ namespace ErrorCodes extern const int INCORRECT_DATA; } -JSONAsStringRowInputFormat::JSONAsStringRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_) - : JSONAsStringRowInputFormat(header_, std::make_unique(in_), params_) {} +JSONAsRowInputFormat::JSONAsRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_) + : JSONAsRowInputFormat(header_, std::make_unique(in_), params_) {} -JSONAsStringRowInputFormat::JSONAsStringRowInputFormat(const Block & header_, std::unique_ptr buf_, Params params_) : +JSONAsRowInputFormat::JSONAsRowInputFormat(const Block & header_, std::unique_ptr buf_, Params params_) : IRowInputFormat(header_, *buf_, std::move(params_)), buf(std::move(buf_)) { if (header_.columns() > 1) throw Exception(ErrorCodes::BAD_ARGUMENTS, - "This input format is only suitable for tables with a single column of type String but the number of columns is {}", + "This input format is only suitable for tables with a single column of type String or Object, but the number of columns is {}", header_.columns()); - - if (!isString(removeNullable(removeLowCardinality(header_.getByPosition(0).type)))) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "This input format is only suitable for tables with a single column of type String but the column type is {}", - header_.getByPosition(0).type->getName()); } -void JSONAsStringRowInputFormat::resetParser() +void JSONAsRowInputFormat::resetParser() { IRowInputFormat::resetParser(); buf->reset(); } -void JSONAsStringRowInputFormat::readPrefix() +void JSONAsRowInputFormat::readPrefix() { /// In this format, BOM at 
beginning of stream cannot be confused with value, so it is safe to skip it. skipBOMIfExists(*buf); @@ -50,7 +45,7 @@ void JSONAsStringRowInputFormat::readPrefix() } } -void JSONAsStringRowInputFormat::readSuffix() +void JSONAsRowInputFormat::readSuffix() { skipWhitespaceIfAny(*buf); if (data_in_square_brackets) @@ -66,6 +61,57 @@ void JSONAsStringRowInputFormat::readSuffix() assertEOF(*buf); } +bool JSONAsRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &) +{ + assert(columns.size() == 1); + assert(serializations.size() == 1); + + if (!allow_new_rows) + return false; + + skipWhitespaceIfAny(*buf); + if (!buf->eof()) + { + if (!data_in_square_brackets && *buf->position() == ';') + { + /// ';' means the end of query, but it cannot be before ']'. + return allow_new_rows = false; + } + else if (data_in_square_brackets && *buf->position() == ']') + { + /// ']' means the end of query. + return allow_new_rows = false; + } + } + + if (!buf->eof()) + readJSONObject(*columns[0]); + + skipWhitespaceIfAny(*buf); + if (!buf->eof() && *buf->position() == ',') + ++buf->position(); + skipWhitespaceIfAny(*buf); + + return !buf->eof(); +} + +void JSONAsRowInputFormat::setReadBuffer(ReadBuffer & in_) +{ + buf = std::make_unique(in_); + IInputFormat::setReadBuffer(*buf); +} + + +JSONAsStringRowInputFormat::JSONAsStringRowInputFormat( + const Block & header_, ReadBuffer & in_, Params params_) + : JSONAsRowInputFormat(header_, in_, params_) +{ + if (!isString(removeNullable(removeLowCardinality(header_.getByPosition(0).type)))) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "This input format is only suitable for tables with a single column of type String but the column type is {}", + header_.getByPosition(0).type->getName()); +} + void JSONAsStringRowInputFormat::readJSONObject(IColumn & column) { PeekableReadBufferCheckpoint checkpoint{*buf}; @@ -143,41 +189,21 @@ void JSONAsStringRowInputFormat::readJSONObject(IColumn & column) buf->position() = end; } -bool JSONAsStringRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &) + +JSONAsObjectRowInputFormat::JSONAsObjectRowInputFormat( + const Block & header_, ReadBuffer & in_, Params params_, const FormatSettings & format_settings_) + : JSONAsRowInputFormat(header_, in_, params_) + , format_settings(format_settings_) { - if (!allow_new_rows) - return false; - - skipWhitespaceIfAny(*buf); - if (!buf->eof()) - { - if (!data_in_square_brackets && *buf->position() == ';') - { - /// ';' means the end of query, but it cannot be before ']'. - return allow_new_rows = false; - } - else if (data_in_square_brackets && *buf->position() == ']') - { - /// ']' means the end of query. 
- return allow_new_rows = false; - } - } - - if (!buf->eof()) - readJSONObject(*columns[0]); - - skipWhitespaceIfAny(*buf); - if (!buf->eof() && *buf->position() == ',') - ++buf->position(); - skipWhitespaceIfAny(*buf); - - return !buf->eof(); + if (!isObject(header_.getByPosition(0).type)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Input format JSONAsObject is only suitable for tables with a single column of type Object but the column type is {}", + header_.getByPosition(0).type->getName()); } -void JSONAsStringRowInputFormat::setReadBuffer(ReadBuffer & in_) +void JSONAsObjectRowInputFormat::readJSONObject(IColumn & column) { - buf = std::make_unique(in_); - IInputFormat::setReadBuffer(*buf); + serializations[0]->deserializeTextJSON(column, *buf, format_settings); } void registerInputFormatJSONAsString(FormatFactory & factory) @@ -202,6 +228,23 @@ void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factor factory.registerNonTrivialPrefixAndSuffixChecker("JSONAsString", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); } +void registerInputFormatJSONAsObject(FormatFactory & factory) +{ + factory.registerInputFormat("JSONAsObject", []( + ReadBuffer & buf, + const Block & sample, + IRowInputFormat::Params params, + const FormatSettings & settings) + { + return std::make_shared(sample, buf, std::move(params), settings); + }); +} + +void registerNonTrivialPrefixAndSuffixCheckerJSONAsObject(FormatFactory & factory) +{ + factory.registerNonTrivialPrefixAndSuffixChecker("JSONAsObject", nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl); +} + void registerJSONAsStringSchemaReader(FormatFactory & factory) { factory.registerExternalSchemaReader("JSONAsString", [](const FormatSettings &) diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h index 9979a5d1474..f7880eac867 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.h @@ -12,35 +12,58 @@ namespace DB class ReadBuffer; /// This format parses a sequence of JSON objects separated by newlines, spaces and/or comma. -/// Each JSON object is parsed as a whole to string. -/// This format can only parse a table with single field of type String. - -class JSONAsStringRowInputFormat final : public IRowInputFormat +class JSONAsRowInputFormat : public IRowInputFormat { public: - JSONAsStringRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_); + JSONAsRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_); - String getName() const override { return "JSONAsStringRowInputFormat"; } void resetParser() override; void setReadBuffer(ReadBuffer & in_) override; private: - JSONAsStringRowInputFormat(const Block & header_, std::unique_ptr buf_, Params params_); + JSONAsRowInputFormat(const Block & header_, std::unique_ptr buf_, Params params_); bool readRow(MutableColumns & columns, RowReadExtension & ext) override; void readPrefix() override; void readSuffix() override; - void readJSONObject(IColumn & column); - +protected: + virtual void readJSONObject(IColumn & column) = 0; std::unique_ptr buf; +private: /// This flag is needed to know if data is in square brackets. bool data_in_square_brackets = false; bool allow_new_rows = true; }; +/// Each JSON object is parsed as a whole to string. +/// This format can only parse a table with single field of type String. 
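Under the same assumptions, the `JSONAsObject` input format introduced here can be exercised roughly as follows (a sketch; each whitespace-separated top-level JSON object becomes one row):

``` sql
SET allow_experimental_object_type = 1;
CREATE TABLE t_rows (obj Object('json')) ENGINE = MergeTree ORDER BY tuple();

INSERT INTO t_rows FORMAT JSONAsObject {"a": 1} {"b": {"c": 2}}

-- Dynamic subcolumns of obj are then directly selectable; missing keys read as defaults.
SELECT obj.a, obj.b.c FROM t_rows;
```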
+class JSONAsStringRowInputFormat final : public JSONAsRowInputFormat +{ +public: + JSONAsStringRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_); + String getName() const override { return "JSONAsStringRowInputFormat"; } + +private: + void readJSONObject(IColumn & column) override; +}; + + +/// Each JSON object is parsed as a whole to object. +/// This format can only parse a table with single field of type Object. +class JSONAsObjectRowInputFormat final : public JSONAsRowInputFormat +{ +public: + JSONAsObjectRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_, const FormatSettings & format_settings_); + String getName() const override { return "JSONAsObjectRowInputFormat"; } + +private: + void readJSONObject(IColumn & column) override; + const FormatSettings format_settings; +}; + class JSONAsStringExternalSchemaReader : public IExternalSchemaReader { public: diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index c9337929adc..bf8feb077ed 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -105,6 +106,7 @@ Chunk ValuesBlockInputFormat::generate() return {}; } + finalizeObjectColumns(columns); size_t rows_in_block = columns[0]->size(); return Chunk{std::move(columns), rows_in_block}; } diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index ad4d1ea86d6..1bfc1ec7306 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -68,8 +68,7 @@ ReadFromMergeTree::ReadFromMergeTree( Names virt_column_names_, const MergeTreeData & data_, const SelectQueryInfo & query_info_, - StorageMetadataPtr metadata_snapshot_, - StorageMetadataPtr metadata_snapshot_base_, + StorageSnapshotPtr storage_snapshot_, ContextPtr context_, size_t max_block_size_, size_t num_streams_, @@ -79,7 +78,7 @@ ReadFromMergeTree::ReadFromMergeTree( MergeTreeDataSelectAnalysisResultPtr analyzed_result_ptr_, bool enable_parallel_reading) : ISourceStep(DataStream{.header = MergeTreeBaseSelectProcessor::transformHeader( - metadata_snapshot_->getSampleBlockForColumns(real_column_names_, data_.getVirtuals(), data_.getStorageID()), + storage_snapshot_->getSampleBlockForColumns(real_column_names_), getPrewhereInfo(query_info_), data_.getPartitionValueType(), virt_column_names_)}) @@ -91,8 +90,8 @@ ReadFromMergeTree::ReadFromMergeTree( , query_info(query_info_) , prewhere_info(getPrewhereInfo(query_info)) , actions_settings(ExpressionActionsSettings::fromContext(context_)) - , metadata_snapshot(std::move(metadata_snapshot_)) - , metadata_snapshot_base(std::move(metadata_snapshot_base_)) + , storage_snapshot(std::move(storage_snapshot_)) + , metadata_for_reading(storage_snapshot->getMetadataForQuery()) , context(std::move(context_)) , max_block_size(max_block_size_) , requested_num_streams(num_streams_) @@ -142,7 +141,7 @@ Pipe ReadFromMergeTree::readFromPool( min_marks_for_concurrent_read, std::move(parts_with_range), data, - metadata_snapshot, + storage_snapshot, prewhere_info, required_columns, backoff_settings, @@ -169,7 +168,7 @@ Pipe ReadFromMergeTree::readFromPool( auto source = std::make_shared( i, pool, min_marks_for_concurrent_read, max_block_size, settings.preferred_block_size_bytes, settings.preferred_max_column_in_block_size_bytes, - data, 
metadata_snapshot, use_uncompressed_cache, + data, storage_snapshot, use_uncompressed_cache, prewhere_info, actions_settings, reader_settings, virt_column_names, std::move(extension)); /// Set the approximate number of rows for the first source only @@ -205,7 +204,7 @@ ProcessorPtr ReadFromMergeTree::createSource( }; } return std::make_shared( - data, metadata_snapshot, part.data_part, max_block_size, preferred_block_size_bytes, + data, storage_snapshot, part.data_part, max_block_size, preferred_block_size_bytes, preferred_max_column_in_block_size_bytes, required_columns, part.ranges, use_uncompressed_cache, prewhere_info, actions_settings, reader_settings, virt_column_names, part.part_index_in_query, has_limit_below_one_block, std::move(extension)); } @@ -511,12 +510,12 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsWithOrder( size_t fixed_prefix_size = input_order_info->order_key_fixed_prefix_descr.size(); size_t prefix_size = fixed_prefix_size + input_order_info->order_key_prefix_descr.size(); - auto order_key_prefix_ast = metadata_snapshot->getSortingKey().expression_list_ast->clone(); + auto order_key_prefix_ast = metadata_for_reading->getSortingKey().expression_list_ast->clone(); order_key_prefix_ast->children.resize(prefix_size); - auto syntax_result = TreeRewriter(context).analyze(order_key_prefix_ast, metadata_snapshot->getColumns().getAllPhysical()); + auto syntax_result = TreeRewriter(context).analyze(order_key_prefix_ast, metadata_for_reading->getColumns().getAllPhysical()); auto sorting_key_prefix_expr = ExpressionAnalyzer(order_key_prefix_ast, syntax_result, context).getActionsDAG(false); - const auto & sorting_columns = metadata_snapshot->getSortingKey().column_names; + const auto & sorting_columns = metadata_for_reading->getSortingKey().column_names; SortDescription sort_description; for (size_t j = 0; j < prefix_size; ++j) @@ -745,7 +744,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( } auto sorting_expr = std::make_shared( - metadata_snapshot->getSortingKey().expression->getActionsDAG().clone()); + metadata_for_reading->getSortingKey().expression->getActionsDAG().clone()); pipe.addSimpleTransform([sorting_expr](const Block & header) { @@ -762,12 +761,12 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( continue; } - Names sort_columns = metadata_snapshot->getSortingKeyColumns(); + Names sort_columns = metadata_for_reading->getSortingKeyColumns(); SortDescription sort_description; size_t sort_columns_size = sort_columns.size(); sort_description.reserve(sort_columns_size); - Names partition_key_columns = metadata_snapshot->getPartitionKey().column_names; + Names partition_key_columns = metadata_for_reading->getPartitionKey().column_names; const auto & header = pipe.getHeader(); for (size_t i = 0; i < sort_columns_size; ++i) @@ -807,7 +806,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( out_projection = createProjection(pipe.getHeader()); auto sorting_expr = std::make_shared( - metadata_snapshot->getSortingKey().expression->getActionsDAG().clone()); + metadata_for_reading->getSortingKey().expression->getActionsDAG().clone()); pipe.addSimpleTransform([sorting_expr](const Block & header) { @@ -824,8 +823,8 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(Merge { return selectRangesToRead( std::move(parts), - metadata_snapshot_base, - metadata_snapshot, + storage_snapshot->metadata, + storage_snapshot->getMetadataForQuery(), query_info, context, requested_num_streams, @@ -867,7 +866,7 @@ 
MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead( result.column_names_to_read.push_back(ExpressionActions::getSmallestColumn(available_real_columns)); } - metadata_snapshot->check(result.column_names_to_read, data.getVirtuals(), data.getStorageID()); + // storage_snapshot->check(result.column_names_to_read); // Build and check if primary key is used when necessary const auto & primary_key = metadata_snapshot->getPrimaryKey(); @@ -1045,7 +1044,7 @@ void ReadFromMergeTree::initializePipeline(QueryPipelineBuilder & pipeline, cons if (select.final()) { /// Add columns needed to calculate the sorting expression and the sign. - std::vector add_columns = metadata_snapshot->getColumnsRequiredForSortingKey(); + std::vector add_columns = metadata_for_reading->getColumnsRequiredForSortingKey(); column_names_to_read.insert(column_names_to_read.end(), add_columns.begin(), add_columns.end()); if (!data.merging_params.sign_column.empty()) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.h b/src/Processors/QueryPlan/ReadFromMergeTree.h index 0d07a3e2ea2..685b99a7bdc 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.h +++ b/src/Processors/QueryPlan/ReadFromMergeTree.h @@ -89,8 +89,7 @@ public: Names virt_column_names_, const MergeTreeData & data_, const SelectQueryInfo & query_info_, - StorageMetadataPtr metadata_snapshot_, - StorageMetadataPtr metadata_snapshot_base_, + StorageSnapshotPtr storage_snapshot, ContextPtr context_, size_t max_block_size_, size_t num_streams_, @@ -141,8 +140,8 @@ private: PrewhereInfoPtr prewhere_info; ExpressionActionsSettings actions_settings; - StorageMetadataPtr metadata_snapshot; - StorageMetadataPtr metadata_snapshot_base; + StorageSnapshotPtr storage_snapshot; + StorageMetadataPtr metadata_for_reading; ContextPtr context; diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 3c96e12e869..b81ed099915 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -1999,7 +1999,7 @@ struct WindowFunctionLagLeadInFrame final : public WindowFunction return; } - const auto supertype = getLeastSupertype({argument_types[0], argument_types[2]}); + const auto supertype = getLeastSupertype(DataTypes{argument_types[0], argument_types[2]}); if (!supertype) { throw Exception(ErrorCodes::BAD_ARGUMENTS, diff --git a/src/QueryPipeline/RemoteQueryExecutor.cpp b/src/QueryPipeline/RemoteQueryExecutor.cpp index d1275444b84..110d4308236 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.cpp +++ b/src/QueryPipeline/RemoteQueryExecutor.cpp @@ -563,12 +563,13 @@ void RemoteQueryExecutor::sendExternalTables() { SelectQueryInfo query_info; auto metadata_snapshot = cur->getInMemoryMetadataPtr(); + auto storage_snapshot = cur->getStorageSnapshot(metadata_snapshot); QueryProcessingStage::Enum read_from_table_stage = cur->getQueryProcessingStage( - context, QueryProcessingStage::Complete, metadata_snapshot, query_info); + context, QueryProcessingStage::Complete, storage_snapshot, query_info); Pipe pipe = cur->read( metadata_snapshot->getColumns().getNamesOfPhysical(), - metadata_snapshot, query_info, context, + storage_snapshot, query_info, context, read_from_table_stage, DEFAULT_BLOCK_SIZE, 1); if (pipe.empty()) diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 8ca3c44bac2..88f38b54f2b 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -384,6 +384,56 @@ 
NamesAndTypesList ColumnsDescription::getAll() const return ret; } +NamesAndTypesList ColumnsDescription::getSubcolumns(const String & name_in_storage) const +{ + auto range = subcolumns.get<1>().equal_range(name_in_storage); + return NamesAndTypesList(range.first, range.second); +} + +void ColumnsDescription::addSubcolumnsToList(NamesAndTypesList & source_list) const +{ + NamesAndTypesList subcolumns_list; + for (const auto & col : source_list) + { + auto range = subcolumns.get<1>().equal_range(col.name); + if (range.first != range.second) + subcolumns_list.insert(subcolumns_list.end(), range.first, range.second); + } + + source_list.splice(source_list.end(), std::move(subcolumns_list)); +} + +NamesAndTypesList ColumnsDescription::get(const GetColumnsOptions & options) const +{ + NamesAndTypesList res; + switch (options.kind) + { + case GetColumnsOptions::All: + res = getAll(); + break; + case GetColumnsOptions::AllPhysical: + res = getAllPhysical(); + break; + case GetColumnsOptions::Ordinary: + res = getOrdinary(); + break; + case GetColumnsOptions::Materialized: + res = getMaterialized(); + break; + case GetColumnsOptions::Aliases: + res = getAliases(); + break; + case GetColumnsOptions::Ephemeral: + res = getEphemeral(); + break; + } + + if (options.with_subcolumns) + addSubcolumnsToList(res); + + return res; +} + bool ColumnsDescription::has(const String & column_name) const { return columns.get<1>().find(column_name) != columns.get<1>().end(); @@ -410,37 +460,37 @@ const ColumnDescription & ColumnsDescription::get(const String & column_name) co return *it; } -static ColumnsDescription::GetFlags defaultKindToGetFlag(ColumnDefaultKind kind) +static GetColumnsOptions::Kind defaultKindToGetKind(ColumnDefaultKind kind) { switch (kind) { case ColumnDefaultKind::Default: - return ColumnsDescription::Ordinary; + return GetColumnsOptions::Ordinary; case ColumnDefaultKind::Materialized: - return ColumnsDescription::Materialized; + return GetColumnsOptions::Materialized; case ColumnDefaultKind::Alias: - return ColumnsDescription::Aliases; + return GetColumnsOptions::Aliases; case ColumnDefaultKind::Ephemeral: - return ColumnsDescription::Ephemeral; + return GetColumnsOptions::Ephemeral; } __builtin_unreachable(); } -NamesAndTypesList ColumnsDescription::getByNames(GetFlags flags, const Names & names, bool with_subcolumns) const +NamesAndTypesList ColumnsDescription::getByNames(const GetColumnsOptions & options, const Names & names) const { NamesAndTypesList res; for (const auto & name : names) { if (auto it = columns.get<1>().find(name); it != columns.get<1>().end()) { - auto kind = defaultKindToGetFlag(it->default_desc.kind); - if (flags & kind) + auto kind = defaultKindToGetKind(it->default_desc.kind); + if (options.kind & kind) { res.emplace_back(name, it->type); continue; } } - else if (with_subcolumns) + else if (options.with_subcolumns) { auto jt = subcolumns.get<0>().find(name); if (jt != subcolumns.get<0>().end()) @@ -475,22 +525,40 @@ Names ColumnsDescription::getNamesOfPhysical() const return ret; } -std::optional ColumnsDescription::tryGetColumnOrSubcolumn(GetFlags flags, const String & column_name) const +std::optional ColumnsDescription::tryGetColumn(const GetColumnsOptions & options, const String & column_name) const { auto it = columns.get<1>().find(column_name); - if (it != columns.get<1>().end() && (defaultKindToGetFlag(it->default_desc.kind) & flags)) + if (it != columns.get<1>().end() && (defaultKindToGetKind(it->default_desc.kind) & options.kind)) return 
NameAndTypePair(it->name, it->type); - auto jt = subcolumns.get<0>().find(column_name); - if (jt != subcolumns.get<0>().end()) - return *jt; + if (options.with_subcolumns) + { + auto jt = subcolumns.get<0>().find(column_name); + if (jt != subcolumns.get<0>().end()) + return *jt; + } return {}; } -NameAndTypePair ColumnsDescription::getColumnOrSubcolumn(GetFlags flags, const String & column_name) const +NameAndTypePair ColumnsDescription::getColumn(const GetColumnsOptions & options, const String & column_name) const { - auto column = tryGetColumnOrSubcolumn(flags, column_name); + auto column = tryGetColumn(options, column_name); + if (!column) + throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, + "There is no column {} in table.", column_name); + + return *column; +} + +std::optional ColumnsDescription::tryGetColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const +{ + return tryGetColumn(GetColumnsOptions(kind).withSubcolumns(), column_name); +} + +NameAndTypePair ColumnsDescription::getColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const +{ + auto column = tryGetColumnOrSubcolumn(kind, column_name); if (!column) throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no column or subcolumn {} in table.", column_name); @@ -500,12 +568,7 @@ NameAndTypePair ColumnsDescription::getColumnOrSubcolumn(GetFlags flags, const S std::optional ColumnsDescription::tryGetPhysical(const String & column_name) const { - auto it = columns.get<1>().find(column_name); - if (it == columns.get<1>().end() || - it->default_desc.kind == ColumnDefaultKind::Alias || it->default_desc.kind == ColumnDefaultKind::Ephemeral) - return {}; - - return NameAndTypePair(it->name, it->type); + return tryGetColumn(GetColumnsOptions::AllPhysical, column_name); } NameAndTypePair ColumnsDescription::getPhysical(const String & column_name) const @@ -525,41 +588,14 @@ bool ColumnsDescription::hasPhysical(const String & column_name) const it->default_desc.kind != ColumnDefaultKind::Alias && it->default_desc.kind != ColumnDefaultKind::Ephemeral; } -bool ColumnsDescription::hasColumnOrSubcolumn(GetFlags flags, const String & column_name) const +bool ColumnsDescription::hasColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const { auto it = columns.get<1>().find(column_name); return (it != columns.get<1>().end() - && (defaultKindToGetFlag(it->default_desc.kind) & flags)) + && (defaultKindToGetKind(it->default_desc.kind) & kind)) || hasSubcolumn(column_name); } -void ColumnsDescription::addSubcolumnsToList(NamesAndTypesList & source_list) const -{ - NamesAndTypesList subcolumns_list; - for (const auto & col : source_list) - { - auto range = subcolumns.get<1>().equal_range(col.name); - if (range.first != range.second) - subcolumns_list.insert(subcolumns_list.end(), range.first, range.second); - } - - source_list.splice(source_list.end(), std::move(subcolumns_list)); -} - -NamesAndTypesList ColumnsDescription::getAllWithSubcolumns() const -{ - auto columns_list = getAll(); - addSubcolumnsToList(columns_list); - return columns_list; -} - -NamesAndTypesList ColumnsDescription::getAllPhysicalWithSubcolumns() const -{ - auto columns_list = getAllPhysical(); - addSubcolumnsToList(columns_list); - return columns_list; -} - bool ColumnsDescription::hasDefaults() const { for (const auto & column : columns) diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index 9fb03c70be9..4ae1dcfc2cd 100644 --- 
a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -28,6 +28,44 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +struct GetColumnsOptions +{ + enum Kind : UInt8 + { + Ordinary = 1, + Materialized = 2, + Aliases = 4, + Ephemeral = 8, + + AllPhysical = Ordinary | Materialized, + All = AllPhysical | Aliases | Ephemeral, + }; + + GetColumnsOptions(Kind kind_) : kind(kind_) {} + + GetColumnsOptions & withSubcolumns(bool value = true) + { + with_subcolumns = value; + return *this; + } + + GetColumnsOptions & withVirtuals(bool value = true) + { + with_virtuals = value; + return *this; + } + + GetColumnsOptions & withExtendedObjects(bool value = true) + { + with_extended_objects = value; + return *this; + } + + Kind kind; + bool with_subcolumns = false; + bool with_virtuals = false; + bool with_extended_objects = false; +}; /// Description of a single table column (in CREATE TABLE for example). struct ColumnDescription @@ -79,18 +117,8 @@ public: auto begin() const { return columns.begin(); } auto end() const { return columns.end(); } - enum GetFlags : UInt8 - { - Ordinary = 1, - Materialized = 2, - Aliases = 4, - Ephemeral = 8, - - AllPhysical = Ordinary | Materialized, - All = AllPhysical | Aliases | Ephemeral, - }; - - NamesAndTypesList getByNames(GetFlags flags, const Names & names, bool with_subcolumns) const; + NamesAndTypesList get(const GetColumnsOptions & options) const; + NamesAndTypesList getByNames(const GetColumnsOptions & options, const Names & names) const; NamesAndTypesList getOrdinary() const; NamesAndTypesList getMaterialized() const; @@ -99,8 +127,7 @@ public: NamesAndTypesList getEphemeral() const; NamesAndTypesList getAllPhysical() const; /// ordinary + materialized. NamesAndTypesList getAll() const; /// ordinary + materialized + aliases + ephemeral - NamesAndTypesList getAllWithSubcolumns() const; - NamesAndTypesList getAllPhysicalWithSubcolumns() const; + NamesAndTypesList getSubcolumns(const String & name_in_storage) const; using ColumnTTLs = std::unordered_map; ColumnTTLs getColumnTTLs() const; @@ -123,22 +150,27 @@ public: auto it = columns.get<1>().find(column_name); if (it == columns.get<1>().end()) throw Exception("Cannot find column " + column_name + " in ColumnsDescription", ErrorCodes::LOGICAL_ERROR); + + removeSubcolumns(it->name); if (!columns.get<1>().modify(it, std::forward(f))) throw Exception("Cannot modify ColumnDescription for column " + column_name + ": column name cannot be changed", ErrorCodes::LOGICAL_ERROR); + addSubcolumns(it->name, it->type); modifyColumnOrder(column_name, after_column, first); } Names getNamesOfPhysical() const; bool hasPhysical(const String & column_name) const; - bool hasColumnOrSubcolumn(GetFlags flags, const String & column_name) const; + bool hasColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const; NameAndTypePair getPhysical(const String & column_name) const; - NameAndTypePair getColumnOrSubcolumn(GetFlags flags, const String & column_name) const; + NameAndTypePair getColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const; + NameAndTypePair getColumn(const GetColumnsOptions & options, const String & column_name) const; std::optional tryGetPhysical(const String & column_name) const; - std::optional tryGetColumnOrSubcolumn(GetFlags flags, const String & column_name) const; + std::optional tryGetColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const; + std::optional tryGetColumn(const GetColumnsOptions & 
options, const String & column_name) const; ColumnDefaults getDefaults() const; /// TODO: remove bool hasDefault(const String & column_name) const; diff --git a/src/Storages/FileLog/FileLogSource.cpp b/src/Storages/FileLog/FileLogSource.cpp index 7d4b5ac6fec..be818b93a4c 100644 --- a/src/Storages/FileLog/FileLogSource.cpp +++ b/src/Storages/FileLog/FileLogSource.cpp @@ -12,25 +12,24 @@ static constexpr auto MAX_FAILED_POLL_ATTEMPTS = 10; FileLogSource::FileLogSource( StorageFileLog & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const ContextPtr & context_, const Names & columns, size_t max_block_size_, size_t poll_time_out_, size_t stream_number_, size_t max_streams_number_) - : SourceWithProgress(metadata_snapshot_->getSampleBlockForColumns(columns, storage_.getVirtuals(), storage_.getStorageID())) + : SourceWithProgress(storage_snapshot_->getSampleBlockForColumns(columns)) , storage(storage_) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , context(context_) , column_names(columns) , max_block_size(max_block_size_) , poll_time_out(poll_time_out_) , stream_number(stream_number_) , max_streams_number(max_streams_number_) - , non_virtual_header(metadata_snapshot_->getSampleBlockNonMaterialized()) - , virtual_header( - metadata_snapshot->getSampleBlockForColumns(storage.getVirtualColumnNames(), storage.getVirtuals(), storage.getStorageID())) + , non_virtual_header(storage_snapshot->metadata->getSampleBlockNonMaterialized()) + , virtual_header(storage_snapshot->getSampleBlockForColumns(storage.getVirtualColumnNames())) { buffer = std::make_unique(storage, max_block_size, poll_time_out, context, stream_number_, max_streams_number_); diff --git a/src/Storages/FileLog/FileLogSource.h b/src/Storages/FileLog/FileLogSource.h index f1cc83b4a06..831f4c907a5 100644 --- a/src/Storages/FileLog/FileLogSource.h +++ b/src/Storages/FileLog/FileLogSource.h @@ -15,7 +15,7 @@ class FileLogSource : public SourceWithProgress public: FileLogSource( StorageFileLog & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const ContextPtr & context_, const Names & columns, size_t max_block_size_, @@ -36,7 +36,7 @@ protected: private: StorageFileLog & storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; ContextPtr context; Names column_names; UInt64 max_block_size; diff --git a/src/Storages/FileLog/StorageFileLog.cpp b/src/Storages/FileLog/StorageFileLog.cpp index 700b35a5a48..32ca936f039 100644 --- a/src/Storages/FileLog/StorageFileLog.cpp +++ b/src/Storages/FileLog/StorageFileLog.cpp @@ -313,7 +313,7 @@ UInt64 StorageFileLog::getInode(const String & file_name) Pipe StorageFileLog::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /* query_info */, ContextPtr local_context, QueryProcessingStage::Enum /* processed_stage */, @@ -355,7 +355,7 @@ Pipe StorageFileLog::read( { pipes.emplace_back(std::make_shared( *this, - metadata_snapshot, + storage_snapshot, modified_context, column_names, getMaxBlockSize(), @@ -677,7 +677,9 @@ bool StorageFileLog::streamToViews() auto table = DatabaseCatalog::instance().getTable(table_id, getContext()); if (!table) throw Exception("Engine table " + table_id.getNameForLogs() + " doesn't exist", ErrorCodes::LOGICAL_ERROR); + auto metadata_snapshot = getInMemoryMetadataPtr(); + auto storage_snapshot = 
getStorageSnapshot(metadata_snapshot); auto max_streams_number = std::min(filelog_settings->max_threads.value, file_infos.file_names.size()); /// No files to parse @@ -705,7 +707,7 @@ bool StorageFileLog::streamToViews() { pipes.emplace_back(std::make_shared( *this, - metadata_snapshot, + storage_snapshot, new_context, block_io.pipeline.getHeader().getNames(), getPollMaxBatchSize(), diff --git a/src/Storages/FileLog/StorageFileLog.h b/src/Storages/FileLog/StorageFileLog.h index 98915f10a05..ded97ecbd8c 100644 --- a/src/Storages/FileLog/StorageFileLog.h +++ b/src/Storages/FileLog/StorageFileLog.h @@ -43,7 +43,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 13873aa21ab..74f6937dbae 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -292,16 +292,15 @@ Block HDFSSource::getHeader(const StorageMetadataPtr & metadata_snapshot, bool n Block HDFSSource::getBlockForSource( const StorageHDFSPtr & storage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const ColumnsDescription & columns_description, bool need_path_column, bool need_file_column) { if (storage->isColumnOriented()) - return metadata_snapshot->getSampleBlockForColumns( - columns_description.getNamesOfPhysical(), storage->getVirtuals(), storage->getStorageID()); + return storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); else - return getHeader(metadata_snapshot, need_path_column, need_file_column); + return getHeader(storage_snapshot->metadata, need_path_column, need_file_column); } HDFSSource::DisclosedGlobIterator::DisclosedGlobIterator(ContextPtr context_, const String & uri) @@ -324,17 +323,17 @@ String HDFSSource::URISIterator::next() HDFSSource::HDFSSource( StorageHDFSPtr storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, ContextPtr context_, UInt64 max_block_size_, bool need_path_column_, bool need_file_column_, std::shared_ptr file_iterator_, ColumnsDescription columns_description_) - : SourceWithProgress(getBlockForSource(storage_, metadata_snapshot_, columns_description_, need_path_column_, need_file_column_)) + : SourceWithProgress(getBlockForSource(storage_, storage_snapshot_, columns_description_, need_path_column_, need_file_column_)) , WithContext(context_) , storage(std::move(storage_)) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , max_block_size(max_block_size_) , need_path_column(need_path_column_) , need_file_column(need_file_column_) @@ -365,8 +364,8 @@ bool HDFSSource::initialize() auto get_block_for_format = [&]() -> Block { if (storage->isColumnOriented()) - return metadata_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); - return metadata_snapshot->getSampleBlock(); + return storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); + return storage_snapshot->metadata->getSampleBlock(); }; auto input_format = getContext()->getInputFormat(storage->format_name, *read_buf, get_block_for_format(), max_block_size); @@ -520,7 +519,7 @@ bool StorageHDFS::isColumnOriented() const Pipe StorageHDFS::read( const Names & column_names, - const StorageMetadataPtr & 
metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context_, QueryProcessingStage::Enum /*processed_stage*/, @@ -571,15 +570,14 @@ Pipe StorageHDFS::read( const auto get_columns_for_format = [&]() -> ColumnsDescription { if (isColumnOriented()) - return ColumnsDescription{ - metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()).getNamesAndTypesList()}; + return ColumnsDescription{storage_snapshot->getSampleBlockForColumns(column_names).getNamesAndTypesList()}; else - return metadata_snapshot->getColumns(); + return storage_snapshot->metadata->getColumns(); }; pipes.emplace_back(std::make_shared( this_ptr, - metadata_snapshot, + storage_snapshot, context_, max_block_size, need_path_column, diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h index 99b5ba95d25..e87564aef32 100644 --- a/src/Storages/HDFS/StorageHDFS.h +++ b/src/Storages/HDFS/StorageHDFS.h @@ -24,7 +24,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -117,14 +117,14 @@ public: static Block getBlockForSource( const StorageHDFSPtr & storage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot_, const ColumnsDescription & columns_description, bool need_path_column, bool need_file_column); HDFSSource( StorageHDFSPtr storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, ContextPtr context_, UInt64 max_block_size_, bool need_path_column_, @@ -140,7 +140,7 @@ public: private: StorageHDFSPtr storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; UInt64 max_block_size; bool need_path_column; bool need_file_column; diff --git a/src/Storages/HDFS/StorageHDFSCluster.cpp b/src/Storages/HDFS/StorageHDFSCluster.cpp index dde3329040b..b039caa4330 100644 --- a/src/Storages/HDFS/StorageHDFSCluster.cpp +++ b/src/Storages/HDFS/StorageHDFSCluster.cpp @@ -50,7 +50,7 @@ StorageHDFSCluster::StorageHDFSCluster( /// The code executes on initiator Pipe StorageHDFSCluster::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -106,12 +106,12 @@ Pipe StorageHDFSCluster::read( } } - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); return Pipe::unitePipes(std::move(pipes)); } QueryProcessingStage::Enum StorageHDFSCluster::getQueryProcessingStage( - ContextPtr context, QueryProcessingStage::Enum to_stage, const StorageMetadataPtr &, SelectQueryInfo &) const + ContextPtr context, QueryProcessingStage::Enum to_stage, const StorageSnapshotPtr &, SelectQueryInfo &) const { /// Initiator executes query on remote node. 
if (context->getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY) diff --git a/src/Storages/HDFS/StorageHDFSCluster.h b/src/Storages/HDFS/StorageHDFSCluster.h index 0e568a9faf8..953311de056 100644 --- a/src/Storages/HDFS/StorageHDFSCluster.h +++ b/src/Storages/HDFS/StorageHDFSCluster.h @@ -24,11 +24,11 @@ class StorageHDFSCluster : public shared_ptr_helper, public public: std::string getName() const override { return "HDFSCluster"; } - Pipe read(const Names &, const StorageMetadataPtr &, SelectQueryInfo &, + Pipe read(const Names &, const StorageSnapshotPtr &, SelectQueryInfo &, ContextPtr, QueryProcessingStage::Enum, size_t /*max_block_size*/, unsigned /*num_streams*/) override; QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override; + getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override; NamesAndTypesList getVirtuals() const override; diff --git a/src/Storages/Hive/StorageHive.cpp b/src/Storages/Hive/StorageHive.cpp index 2ae7c30fd5b..7b6a8db568f 100644 --- a/src/Storages/Hive/StorageHive.cpp +++ b/src/Storages/Hive/StorageHive.cpp @@ -590,7 +590,7 @@ void StorageHive::getActualColumnsToRead(Block & sample_block, const Block & hea } Pipe StorageHive::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context_, QueryProcessingStage::Enum /* processed_stage */, @@ -654,7 +654,7 @@ Pipe StorageHive::read( sources_info->hive_metastore_client = hive_metastore_client; sources_info->partition_name_types = partition_name_types; - const auto & header_block = metadata_snapshot->getSampleBlock(); + const auto & header_block = storage_snapshot->metadata->getSampleBlock(); Block sample_block; for (const auto & column : column_names) { diff --git a/src/Storages/Hive/StorageHive.h b/src/Storages/Hive/StorageHive.h index 323293cbbe0..376aab311d0 100644 --- a/src/Storages/Hive/StorageHive.h +++ b/src/Storages/Hive/StorageHive.h @@ -42,7 +42,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index a923258b111..89403a773b3 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -90,7 +90,7 @@ TableExclusiveLockHolder IStorage::lockExclusively(const String & query_id, cons Pipe IStorage::read( const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, @@ -103,18 +103,17 @@ Pipe IStorage::read( void IStorage::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned num_streams) { - auto pipe = read(column_names, metadata_snapshot, query_info, context, processed_stage, max_block_size, num_streams); + auto pipe = read(column_names, storage_snapshot, query_info, context, processed_stage, max_block_size, num_streams); if (pipe.empty()) { - auto 
header = (query_info.projection ? query_info.projection->desc->metadata : metadata_snapshot) - ->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()); + auto header = storage_snapshot->getSampleBlockForColumns(column_names); InterpreterSelectQuery::addEmptySourceToQueryPlan(query_plan, header, query_info, context); } else diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 1010164f71e..e9969859d5f 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -158,6 +159,10 @@ public: /// Returns true if the storage supports reading of subcolumns of complex types. virtual bool supportsSubcolumns() const { return false; } + /// Returns true if the storage supports storing of dynamic subcolumns. + /// For now it makes sense only for data type Object. + virtual bool supportsDynamicSubcolumns() const { return false; } + /// Requires squashing small blocks to large for optimal storage. /// This is true for most storages that store data on disk. virtual bool prefersLargeBlocks() const { return true; } @@ -269,8 +274,7 @@ public: * QueryProcessingStage::Enum required for Distributed over Distributed, * since it cannot return Complete for intermediate queries never. */ - virtual QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const + virtual QueryProcessingStage::Enum getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const { return QueryProcessingStage::FetchColumns; } @@ -331,7 +335,7 @@ public: */ virtual Pipe read( const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, @@ -343,7 +347,7 @@ public: virtual void read( QueryPlan & query_plan, const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, @@ -600,6 +604,18 @@ public: /// Does not takes underlying Storage (if any) into account. virtual std::optional lifetimeBytes() const { return {}; } + /// Creates a storage snapshot from given metadata. + virtual StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot) const + { + return std::make_shared(*this, metadata_snapshot); + } + + /// Creates a storage snapshot from given metadata and columns, which are used in query. + virtual StorageSnapshotPtr getStorageSnapshotForQuery(const StorageMetadataPtr & metadata_snapshot, const ASTPtr & /*query*/) const + { + return getStorageSnapshot(metadata_snapshot); + } + private: /// Lock required for alter queries (lockForAlter). /// Allows to execute only one simultaneous alter query. 
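Reviewer note (not part of the diff): the IStorage changes above thread a single StorageSnapshotPtr through the read path instead of handing a raw StorageMetadataPtr to every call. A minimal sketch of the resulting calling convention, modeled on the RemoteQueryExecutor::sendExternalTables hunk earlier in this diff; `storage`, `context` and `query_info` are assumed to exist in the caller:

``` cpp
// Sketch only: take the snapshot once per query, then pass it everywhere.
auto metadata_snapshot = storage->getInMemoryMetadataPtr();
auto storage_snapshot = storage->getStorageSnapshot(metadata_snapshot);

auto stage = storage->getQueryProcessingStage(
    context, QueryProcessingStage::Complete, storage_snapshot, query_info);

Pipe pipe = storage->read(
    metadata_snapshot->getColumns().getNamesOfPhysical(),
    storage_snapshot, query_info, context,
    stage, DEFAULT_BLOCK_SIZE, /*num_streams=*/ 1);
```

Engines that need state pinned at query start (for MergeTree, the active part set) override getStorageSnapshot; everything else inherits the default snapshot built from the metadata alone.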
diff --git a/src/Storages/Kafka/KafkaSource.cpp b/src/Storages/Kafka/KafkaSource.cpp index 3e24608a180..99130f615f5 100644 --- a/src/Storages/Kafka/KafkaSource.cpp +++ b/src/Storages/Kafka/KafkaSource.cpp @@ -19,22 +19,22 @@ const auto MAX_FAILED_POLL_ATTEMPTS = 10; KafkaSource::KafkaSource( StorageKafka & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const ContextPtr & context_, const Names & columns, Poco::Logger * log_, size_t max_block_size_, bool commit_in_suffix_) - : SourceWithProgress(metadata_snapshot_->getSampleBlockForColumns(columns, storage_.getVirtuals(), storage_.getStorageID())) + : SourceWithProgress(storage_snapshot_->getSampleBlockForColumns(columns)) , storage(storage_) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , context(context_) , column_names(columns) , log(log_) , max_block_size(max_block_size_) , commit_in_suffix(commit_in_suffix_) - , non_virtual_header(metadata_snapshot->getSampleBlockNonMaterialized()) - , virtual_header(metadata_snapshot->getSampleBlockForColumns(storage.getVirtualColumnNames(), storage.getVirtuals(), storage.getStorageID())) + , non_virtual_header(storage_snapshot->metadata->getSampleBlockNonMaterialized()) + , virtual_header(storage_snapshot->getSampleBlockForColumns(storage.getVirtualColumnNames())) , handle_error_mode(storage.getHandleKafkaErrorMode()) { } diff --git a/src/Storages/Kafka/KafkaSource.h b/src/Storages/Kafka/KafkaSource.h index e80edfb9ef4..59b6d370b71 100644 --- a/src/Storages/Kafka/KafkaSource.h +++ b/src/Storages/Kafka/KafkaSource.h @@ -18,7 +18,7 @@ class KafkaSource : public SourceWithProgress public: KafkaSource( StorageKafka & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const ContextPtr & context_, const Names & columns, Poco::Logger * log_, @@ -35,7 +35,7 @@ public: private: StorageKafka & storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; ContextPtr context; Names column_names; Poco::Logger * log; diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index ae470cdccc9..4c7465d587d 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -263,7 +263,7 @@ String StorageKafka::getDefaultClientId(const StorageID & table_id_) Pipe StorageKafka::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /* query_info */, ContextPtr local_context, QueryProcessingStage::Enum /* processed_stage */, @@ -291,7 +291,7 @@ Pipe StorageKafka::read( /// Use block size of 1, otherwise LIMIT won't work properly as it will buffer excess messages in the last block /// TODO: probably that leads to awful performance. /// FIXME: seems that doesn't help with extra reading and committing unprocessed messages. 
- pipes.emplace_back(std::make_shared(*this, metadata_snapshot, modified_context, column_names, log, 1, kafka_settings->kafka_commit_on_select)); + pipes.emplace_back(std::make_shared(*this, storage_snapshot, modified_context, column_names, log, 1, kafka_settings->kafka_commit_on_select)); } LOG_DEBUG(log, "Starting reading {} streams", pipes.size()); @@ -614,7 +614,8 @@ bool StorageKafka::streamToViews() auto table = DatabaseCatalog::instance().getTable(table_id, getContext()); if (!table) throw Exception("Engine table " + table_id.getNameForLogs() + " doesn't exist.", ErrorCodes::LOGICAL_ERROR); - auto metadata_snapshot = getInMemoryMetadataPtr(); + + auto storage_snapshot = getStorageSnapshot(getInMemoryMetadataPtr()); // Create an INSERT query for streaming data auto insert = std::make_shared(); @@ -640,7 +641,7 @@ bool StorageKafka::streamToViews() pipes.reserve(stream_count); for (size_t i = 0; i < stream_count; ++i) { - auto source = std::make_shared(*this, metadata_snapshot, kafka_context, block_io.pipeline.getHeader().getNames(), log, block_size, false); + auto source = std::make_shared(*this, storage_snapshot, kafka_context, block_io.pipeline.getHeader().getNames(), log, block_size, false); sources.emplace_back(source); pipes.emplace_back(source); diff --git a/src/Storages/Kafka/StorageKafka.h b/src/Storages/Kafka/StorageKafka.h index 03e90b1f6c3..707db7a798e 100644 --- a/src/Storages/Kafka/StorageKafka.h +++ b/src/Storages/Kafka/StorageKafka.h @@ -43,7 +43,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/LiveView/StorageBlocks.h b/src/Storages/LiveView/StorageBlocks.h index 01293a1e5d7..bc860a1fa3c 100644 --- a/src/Storages/LiveView/StorageBlocks.h +++ b/src/Storages/LiveView/StorageBlocks.h @@ -34,14 +34,14 @@ public: bool supportsFinal() const override { return true; } QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override + getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override { return to_stage; } Pipe read( const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, diff --git a/src/Storages/LiveView/StorageLiveView.cpp b/src/Storages/LiveView/StorageLiveView.cpp index 83578e3b5b9..8f80f8632cc 100644 --- a/src/Storages/LiveView/StorageLiveView.cpp +++ b/src/Storages/LiveView/StorageLiveView.cpp @@ -540,7 +540,7 @@ void StorageLiveView::refresh(bool grab_lock) Pipe StorageLiveView::read( const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, diff --git a/src/Storages/LiveView/StorageLiveView.h b/src/Storages/LiveView/StorageLiveView.h index 17e2f50e7ec..2fb2ec509fa 100644 --- a/src/Storages/LiveView/StorageLiveView.h +++ b/src/Storages/LiveView/StorageLiveView.h @@ -146,7 +146,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, 
SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/MergeTree/IMergeTreeReader.cpp b/src/Storages/MergeTree/IMergeTreeReader.cpp index 87cda721862..3a823345dda 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.cpp +++ b/src/Storages/MergeTree/IMergeTreeReader.cpp @@ -62,101 +62,13 @@ const IMergeTreeReader::ValueSizeMap & IMergeTreeReader::getAvgValueSizeHints() return avg_value_size_hints; } - -static bool arrayHasNoElementsRead(const IColumn & column) -{ - const auto * column_array = typeid_cast(&column); - - if (!column_array) - return false; - - size_t size = column_array->size(); - if (!size) - return false; - - size_t data_size = column_array->getData().size(); - if (data_size) - return false; - - size_t last_offset = column_array->getOffsets()[size - 1]; - return last_offset != 0; -} - -void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_evaluate_missing_defaults, size_t num_rows) +void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_evaluate_missing_defaults, size_t num_rows) const { try { - size_t num_columns = columns.size(); - - if (res_columns.size() != num_columns) - throw Exception("invalid number of columns passed to MergeTreeReader::fillMissingColumns. " - "Expected " + toString(num_columns) + ", " - "got " + toString(res_columns.size()), ErrorCodes::LOGICAL_ERROR); - - /// For a missing column of a nested data structure we must create not a column of empty - /// arrays, but a column of arrays of correct length. - - /// First, collect offset columns for all arrays in the block. - OffsetColumns offset_columns; - auto requested_column = columns.begin(); - for (size_t i = 0; i < num_columns; ++i, ++requested_column) - { - if (res_columns[i] == nullptr) - continue; - - if (const auto * array = typeid_cast(res_columns[i].get())) - { - String offsets_name = Nested::extractTableName(requested_column->name); - auto & offsets_column = offset_columns[offsets_name]; - - /// If for some reason multiple offsets columns are present for the same nested data structure, - /// choose the one that is not empty. 
- if (!offsets_column || offsets_column->empty()) - offsets_column = array->getOffsetsPtr(); - } - } - - should_evaluate_missing_defaults = false; - - /// insert default values only for columns without default expressions - requested_column = columns.begin(); - for (size_t i = 0; i < num_columns; ++i, ++requested_column) - { - auto & [name, type] = *requested_column; - - if (res_columns[i] && arrayHasNoElementsRead(*res_columns[i])) - res_columns[i] = nullptr; - - if (res_columns[i] == nullptr) - { - if (metadata_snapshot->getColumns().hasDefault(name)) - { - should_evaluate_missing_defaults = true; - continue; - } - - String offsets_name = Nested::extractTableName(name); - auto offset_it = offset_columns.find(offsets_name); - const auto * array_type = typeid_cast(type.get()); - if (offset_it != offset_columns.end() && array_type) - { - const auto & nested_type = array_type->getNestedType(); - ColumnPtr offsets_column = offset_it->second; - size_t nested_rows = typeid_cast(*offsets_column).getData().back(); - - ColumnPtr nested_column = - nested_type->createColumnConstWithDefaultValue(nested_rows)->convertToFullColumnIfConst(); - - res_columns[i] = ColumnArray::create(nested_column, offsets_column); - } - else - { - /// We must turn a constant column into a full column because the interpreter could infer - /// that it is constant everywhere but in some blocks (from other parts) it can be a full column. - res_columns[i] = type->createColumnConstWithDefaultValue(num_rows)->convertToFullColumnIfConst(); - } - } - } + DB::fillMissingColumns(res_columns, num_rows, columns, metadata_snapshot); + should_evaluate_missing_defaults = std::any_of( + res_columns.begin(), res_columns.end(), [](const auto & column) { return column == nullptr; }); } catch (Exception & e) { @@ -166,7 +78,7 @@ void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_e } } -void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns & res_columns) +void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns & res_columns) const { try { @@ -245,7 +157,7 @@ NameAndTypePair IMergeTreeReader::getColumnFromPart(const NameAndTypePair & requ return {String(it->getKey()), type}; } -void IMergeTreeReader::performRequiredConversions(Columns & res_columns) +void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const { try { diff --git a/src/Storages/MergeTree/IMergeTreeReader.h b/src/Storages/MergeTree/IMergeTreeReader.h index 28334b9a8bb..7c5977b5cb2 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.h +++ b/src/Storages/MergeTree/IMergeTreeReader.h @@ -44,13 +44,13 @@ public: /// Add columns from ordered_names that are not present in the block. /// Missing columns are added in the order specified by ordered_names. /// num_rows is needed in case all res_columns are nullptr. - void fillMissingColumns(Columns & res_columns, bool & should_evaluate_missing_defaults, size_t num_rows); + void fillMissingColumns(Columns & res_columns, bool & should_evaluate_missing_defaults, size_t num_rows) const; /// Evaluate defaulted columns if necessary. - void evaluateMissingDefaults(Block additional_columns, Columns & res_columns); + void evaluateMissingDefaults(Block additional_columns, Columns & res_columns) const; /// If part metadata is not equal to storage metadata, then /// try to perform conversions of columns.
- void performRequiredConversions(Columns & res_columns); + void performRequiredConversions(Columns & res_columns) const; const NamesAndTypesList & getColumns() const { return columns; } size_t numColumnsInResult() const { return columns.size(); } diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 8b5c2e0dc6e..935a11ec5fa 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -134,6 +135,10 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() global_ctx->all_column_names = global_ctx->metadata_snapshot->getColumns().getNamesOfPhysical(); global_ctx->storage_columns = global_ctx->metadata_snapshot->getColumns().getAllPhysical(); + auto object_columns = MergeTreeData::getObjectColumns(global_ctx->future_part->parts, global_ctx->metadata_snapshot->getColumns()); + global_ctx->storage_snapshot = std::make_shared(*global_ctx->data, global_ctx->metadata_snapshot, object_columns); + extendObjectColumns(global_ctx->storage_columns, object_columns, false); + extractMergingAndGatheringColumns( global_ctx->storage_columns, global_ctx->metadata_snapshot->getSortingKey().expression, @@ -414,7 +419,7 @@ void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const for (size_t part_num = 0; part_num < global_ctx->future_part->parts.size(); ++part_num) { auto column_part_source = std::make_shared( - *global_ctx->data, global_ctx->metadata_snapshot, global_ctx->future_part->parts[part_num], column_names, ctx->read_with_direct_io, true); + *global_ctx->data, global_ctx->storage_snapshot, global_ctx->future_part->parts[part_num], column_names, ctx->read_with_direct_io, true); /// Dereference unique_ptr column_part_source->setProgressCallback( @@ -748,7 +753,7 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() for (const auto & part : global_ctx->future_part->parts) { auto input = std::make_unique( - *global_ctx->data, global_ctx->metadata_snapshot, part, global_ctx->merging_column_names, ctx->read_with_direct_io, true); + *global_ctx->data, global_ctx->storage_snapshot, part, global_ctx->merging_column_names, ctx->read_with_direct_io, true); /// Dereference unique_ptr and pass horizontal_stage_progress by reference input->setProgressCallback( diff --git a/src/Storages/MergeTree/MergeTask.h b/src/Storages/MergeTree/MergeTask.h index d3fc01980ea..04da9ad77c4 100644 --- a/src/Storages/MergeTree/MergeTask.h +++ b/src/Storages/MergeTree/MergeTask.h @@ -127,6 +127,7 @@ private: MergeTreeDataMergerMutator * mutator{nullptr}; ActionBlocker * merges_blocker{nullptr}; ActionBlocker * ttl_merges_blocker{nullptr}; + StorageSnapshotPtr storage_snapshot{nullptr}; StorageMetadataPtr metadata_snapshot{nullptr}; FutureMergedMutatedPartPtr future_part{nullptr}; /// This will be either nullptr or new_data_part, so raw pointer is ok. 
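Reviewer note (not part of the diff): the ColumnsDescription hunks earlier in this diff replace the GetFlags bitmask plus a trailing with_subcolumns bool with a fluent GetColumnsOptions builder; getReadTaskColumns in the MergeTreeBlockReadUtils hunk just below is its first caller. A short sketch of the old and new call shapes, with illustrative variable names:

``` cpp
// Old shape: kind and subcolumn handling passed as separate arguments.
// auto list = columns.getByNames(ColumnsDescription::All, names, /*with_subcolumns=*/ true);

// New shape: one options object, extensible without touching every call site.
auto options = GetColumnsOptions(GetColumnsOptions::All)
                   .withSubcolumns()
                   .withExtendedObjects();
auto list = storage_snapshot->getColumnsByNames(options, names);
```

The builder is what lets the Object-type work below add withExtendedObjects (and withVirtuals) without yet another boolean parameter spreading through the codebase.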
diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index 5b69a4e68b6..c656de61bfd 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -29,7 +29,7 @@ namespace ErrorCodes MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( Block header, const MergeTreeData & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const PrewhereInfoPtr & prewhere_info_, ExpressionActionsSettings actions_settings, UInt64 max_block_size_rows_, @@ -41,7 +41,7 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( std::optional extension_) : SourceWithProgress(transformHeader(std::move(header), prewhere_info_, storage_.getPartitionValueType(), virt_column_names_)) , storage(storage_) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , prewhere_info(prewhere_info_) , max_block_size_rows(max_block_size_rows_) , preferred_block_size_bytes(preferred_block_size_bytes_) diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h index 2e906ecfce0..4b933175ba0 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h @@ -35,7 +35,7 @@ public: MergeTreeBaseSelectProcessor( Block header, const MergeTreeData & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const PrewhereInfoPtr & prewhere_info_, ExpressionActionsSettings actions_settings, UInt64 max_block_size_rows_, @@ -87,7 +87,7 @@ protected: void initializeRangeReaders(MergeTreeReadTask & task); const MergeTreeData & storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; PrewhereInfoPtr prewhere_info; std::unique_ptr prewhere_actions; diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp index dadccd2f9dc..6e72b843f10 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -35,7 +36,7 @@ bool injectRequiredColumnsRecursively( /// stages. 
checkStackSize(); - auto column_in_storage = storage_columns.tryGetColumnOrSubcolumn(ColumnsDescription::AllPhysical, column_name); + auto column_in_storage = storage_columns.tryGetColumnOrSubcolumn(GetColumnsOptions::AllPhysical, column_name); if (column_in_storage) { auto column_name_in_part = column_in_storage->getNameInStorage(); @@ -92,8 +93,15 @@ NameSet injectRequiredColumns(const MergeTreeData & storage, const StorageMetada alter_conversions = storage.getAlterConversionsForPart(part); for (size_t i = 0; i < columns.size(); ++i) { + auto name_in_storage = Nested::extractTableName(columns[i]); + if (storage_columns.has(name_in_storage) && isObject(storage_columns.get(name_in_storage).type)) + { + have_at_least_one_physical_column = true; + continue; + } + /// We are going to fetch only physical columns - if (!storage_columns.hasColumnOrSubcolumn(ColumnsDescription::AllPhysical, columns[i])) + if (!storage_columns.hasColumnOrSubcolumn(GetColumnsOptions::AllPhysical, columns[i])) throw Exception("There is no physical column or subcolumn " + columns[i] + " in table.", ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); have_at_least_one_physical_column |= injectRequiredColumnsRecursively( @@ -254,7 +262,7 @@ void MergeTreeBlockSizePredictor::update(const Block & sample_block, const Colum MergeTreeReadTaskColumns getReadTaskColumns( const MergeTreeData & storage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const MergeTreeData::DataPartPtr & data_part, const Names & required_columns, const PrewhereInfoPtr & prewhere_info) @@ -263,7 +271,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( Names pre_column_names; /// inject columns required for defaults evaluation - bool should_reorder = !injectRequiredColumns(storage, metadata_snapshot, data_part, column_names).empty(); + bool should_reorder = !injectRequiredColumns(storage, storage_snapshot->getMetadataForQuery(), data_part, column_names).empty(); if (prewhere_info) { @@ -288,7 +296,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( if (pre_column_names.empty()) pre_column_names.push_back(column_names[0]); - const auto injected_pre_columns = injectRequiredColumns(storage, metadata_snapshot, data_part, pre_column_names); + const auto injected_pre_columns = injectRequiredColumns(storage, storage_snapshot->getMetadataForQuery(), data_part, pre_column_names); if (!injected_pre_columns.empty()) should_reorder = true; @@ -303,12 +311,12 @@ MergeTreeReadTaskColumns getReadTaskColumns( } MergeTreeReadTaskColumns result; + NamesAndTypesList all_columns; - auto columns = metadata_snapshot->getColumns(); - result.pre_columns = columns.getByNames(ColumnsDescription::All, pre_column_names, true); - result.columns = columns.getByNames(ColumnsDescription::All, column_names, true); + auto options = GetColumnsOptions(GetColumnsOptions::All).withSubcolumns().withExtendedObjects(); + result.pre_columns = storage_snapshot->getColumnsByNames(options, pre_column_names); + result.columns = storage_snapshot->getColumnsByNames(options, column_names); result.should_reorder = should_reorder; - return result; } diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h index 1f70ca72f39..2373881f954 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h @@ -73,7 +73,7 @@ struct MergeTreeReadTaskColumns MergeTreeReadTaskColumns getReadTaskColumns( const MergeTreeData & storage, - const StorageMetadataPtr & 
metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const MergeTreeData::DataPartPtr & data_part, const Names & required_columns, const PrewhereInfoPtr & prewhere_info); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 7f407199e81..b789cefc40a 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -13,6 +13,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -1292,6 +1295,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) if (num_parts == 0 && parts_from_wal.empty()) { + resetObjectColumnsFromActiveParts(part_lock); LOG_DEBUG(log, "There are no data parts"); return; } @@ -1364,6 +1368,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) } } + resetObjectColumnsFromActiveParts(part_lock); calculateColumnAndSecondaryIndexSizesImpl(); LOG_DEBUG(log, "Loaded data parts ({} items)", data_parts_indexes.size()); @@ -2624,6 +2629,11 @@ bool MergeTreeData::renameTempPartAndReplace( modifyPartState(part_it, DataPartState::Active); addPartContributionToColumnAndSecondaryIndexSizes(part); + if (covered_parts.empty()) + updateObjectColumns(*part_it, lock); + else + resetObjectColumnsFromActiveParts(lock); + ssize_t diff_bytes = part->getBytesOnDisk() - reduce_bytes; ssize_t diff_rows = part->rows_count - reduce_rows; ssize_t diff_parts = 1 - reduce_parts; @@ -2664,9 +2674,10 @@ MergeTreeData::DataPartsVector MergeTreeData::renameTempPartAndReplace( return covered_parts; } -void MergeTreeData::removePartsFromWorkingSet(const MergeTreeData::DataPartsVector & remove, bool clear_without_timeout, DataPartsLock & /*acquired_lock*/) +void MergeTreeData::removePartsFromWorkingSet(const MergeTreeData::DataPartsVector & remove, bool clear_without_timeout, DataPartsLock & acquired_lock) { auto remove_time = clear_without_timeout ? 
0 : time(nullptr); + bool removed_active_part = false; for (const DataPartPtr & part : remove) { @@ -2674,6 +2685,7 @@ void MergeTreeData::removePartsFromWorkingSet(const MergeTreeData::DataPartsVect { removePartContributionToColumnAndSecondaryIndexSizes(part); removePartContributionToDataVolume(part); + removed_active_part = true; } if (part->getState() == IMergeTreeDataPart::State::Active || clear_without_timeout) @@ -2685,11 +2697,15 @@ void MergeTreeData::removePartsFromWorkingSet(const MergeTreeData::DataPartsVect if (isInMemoryPart(part) && getSettings()->in_memory_parts_enable_wal) getWriteAheadLog()->dropPart(part->name); } + + if (removed_active_part) + resetObjectColumnsFromActiveParts(acquired_lock); } void MergeTreeData::removePartsFromWorkingSetImmediatelyAndSetTemporaryState(const DataPartsVector & remove) { auto lock = lockParts(); + bool removed_active_part = false; for (const auto & part : remove) { @@ -2697,10 +2713,16 @@ void MergeTreeData::removePartsFromWorkingSetImmediatelyAndSetTemporaryState(con if (it_part == data_parts_by_info.end()) throw Exception("Part " + part->getNameWithState() + " not found in data_parts", ErrorCodes::LOGICAL_ERROR); + if (part->getState() == IMergeTreeDataPart::State::Active) + removed_active_part = true; + modifyPartState(part, IMergeTreeDataPart::State::Temporary); /// Erase immediately data_parts_indexes.erase(it_part); } + + if (removed_active_part) + resetObjectColumnsFromActiveParts(lock); } void MergeTreeData::removePartsFromWorkingSet(const DataPartsVector & remove, bool clear_without_timeout, DataPartsLock * acquired_lock) @@ -2793,8 +2815,7 @@ MergeTreeData::DataPartsVector MergeTreeData::removePartsInRangeFromWorkingSet(c return parts_to_remove; } -void MergeTreeData::forgetPartAndMoveToDetached(const MergeTreeData::DataPartPtr & part_to_detach, const String & prefix, bool -restore_covered) +void MergeTreeData::forgetPartAndMoveToDetached(const MergeTreeData::DataPartPtr & part_to_detach, const String & prefix, bool restore_covered) { if (prefix.empty()) LOG_INFO(log, "Renaming {} to {} and forgetting it.", part_to_detach->relative_path, part_to_detach->name); @@ -2802,6 +2823,8 @@ restore_covered) LOG_INFO(log, "Renaming {} to {}_{} and forgetting it.", part_to_detach->relative_path, prefix, part_to_detach->name); auto lock = lockParts(); + bool removed_active_part = false; + bool restored_active_part = false; auto it_part = data_parts_by_info.find(part_to_detach->info); if (it_part == data_parts_by_info.end()) @@ -2814,7 +2837,9 @@ restore_covered) { removePartContributionToDataVolume(part); removePartContributionToColumnAndSecondaryIndexSizes(part); + removed_active_part = true; } + modifyPartState(it_part, DataPartState::Deleting); part->renameToDetached(prefix); @@ -2864,6 +2889,7 @@ restore_covered) addPartContributionToColumnAndSecondaryIndexSizes(*it); addPartContributionToDataVolume(*it); modifyPartState(it, DataPartState::Active); // iterator is not invalidated here + restored_active_part = true; } pos = (*it)->info.max_block + 1; @@ -2895,6 +2921,7 @@ restore_covered) addPartContributionToColumnAndSecondaryIndexSizes(*it); addPartContributionToDataVolume(*it); modifyPartState(it, DataPartState::Active); + restored_active_part = true; } pos = (*it)->info.max_block + 1; @@ -2914,6 +2941,9 @@ restore_covered) LOG_ERROR(log, "The set of parts restored in place of {} looks incomplete. There might or might not be a data loss.{}", part->name, (error_parts.empty() ? 
"" : " Suspicious parts: " + error_parts)); } } + + if (removed_active_part || restored_active_part) + resetObjectColumnsFromActiveParts(lock); } @@ -3841,53 +3871,62 @@ std::set MergeTreeData::getPartitionIdsAffectedByCommands( } -MergeTreeData::DataPartsVector MergeTreeData::getDataPartsVector( - const DataPartStates & affordable_states, DataPartStateVector * out_states, bool require_projection_parts) const +MergeTreeData::DataPartsVector MergeTreeData::getDataPartsVectorUnlocked( + const DataPartStates & affordable_states, + const DataPartsLock & /*lock*/, + DataPartStateVector * out_states, + bool require_projection_parts) const { DataPartsVector res; DataPartsVector buf; + + for (auto state : affordable_states) { - auto lock = lockParts(); + auto range = getDataPartsStateRange(state); - for (auto state : affordable_states) + if (require_projection_parts) { - auto range = getDataPartsStateRange(state); - - if (require_projection_parts) + for (const auto & part : range) { - for (const auto & part : range) - { - for (const auto & [_, projection_part] : part->getProjectionParts()) - res.push_back(projection_part); - } - } - else - { - std::swap(buf, res); - res.clear(); - std::merge(range.begin(), range.end(), buf.begin(), buf.end(), std::back_inserter(res), LessDataPart()); //-V783 + for (const auto & [_, projection_part] : part->getProjectionParts()) + res.push_back(projection_part); } } - - if (out_states != nullptr) + else { - out_states->resize(res.size()); - if (require_projection_parts) - { - for (size_t i = 0; i < res.size(); ++i) - (*out_states)[i] = res[i]->getParentPart()->getState(); - } - else - { - for (size_t i = 0; i < res.size(); ++i) - (*out_states)[i] = res[i]->getState(); - } + std::swap(buf, res); + res.clear(); + std::merge(range.begin(), range.end(), buf.begin(), buf.end(), std::back_inserter(res), LessDataPart()); //-V783 + } + } + + if (out_states != nullptr) + { + out_states->resize(res.size()); + if (require_projection_parts) + { + for (size_t i = 0; i < res.size(); ++i) + (*out_states)[i] = res[i]->getParentPart()->getState(); + } + else + { + for (size_t i = 0; i < res.size(); ++i) + (*out_states)[i] = res[i]->getState(); } } return res; } +MergeTreeData::DataPartsVector MergeTreeData::getDataPartsVector( + const DataPartStates & affordable_states, + DataPartStateVector * out_states, + bool require_projection_parts) const +{ + auto lock = lockParts(); + return getDataPartsVectorUnlocked(affordable_states, lock, out_states, require_projection_parts); +} + MergeTreeData::DataPartsVector MergeTreeData::getAllDataPartsVector(MergeTreeData::DataPartStateVector * out_states, bool require_projection_parts) const { @@ -4337,7 +4376,7 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData: else { total_covered_parts.insert(total_covered_parts.end(), covered_parts.begin(), covered_parts.end()); - for (const DataPartPtr & covered_part : covered_parts) + for (const auto & covered_part : covered_parts) { covered_part->remove_time.store(current_time, std::memory_order_relaxed); @@ -4347,6 +4386,7 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData: data.modifyPartState(covered_part, DataPartState::Outdated); data.removePartContributionToColumnAndSecondaryIndexSizes(covered_part); } + reduce_parts += covered_parts.size(); add_bytes += part->getBytesOnDisk(); @@ -4358,6 +4398,14 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData: } } + if (reduce_parts == 0) + { + for (const auto & part 
: precommitted_parts) + data.updateObjectColumns(part, parts_lock); + } + else + data.resetObjectColumnsFromActiveParts(parts_lock); + ssize_t diff_bytes = add_bytes - reduce_bytes; ssize_t diff_rows = add_rows - reduce_rows; ssize_t diff_parts = add_parts - reduce_parts; @@ -4441,7 +4489,7 @@ using PartitionIdToMaxBlock = std::unordered_map; static void selectBestProjection( const MergeTreeDataSelectExecutor & reader, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, const Names & required_columns, ProjectionCandidate & candidate, @@ -4470,7 +4518,7 @@ static void selectBestProjection( auto projection_result_ptr = reader.estimateNumMarksToRead( projection_parts, candidate.required_columns, - metadata_snapshot, + storage_snapshot->metadata, candidate.desc->metadata, query_info, query_context, @@ -4492,9 +4540,9 @@ static void selectBestProjection( auto normal_result_ptr = reader.estimateNumMarksToRead( normal_parts, required_columns, - metadata_snapshot, - metadata_snapshot, - query_info, + storage_snapshot->metadata, + storage_snapshot->metadata, + query_info, // TODO syntax_analysis_result set in index query_context, settings.max_threads, max_added_blocks); @@ -4697,8 +4745,9 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( std::optional MergeTreeData::getQueryProcessingStageWithAggregateProjection( - ContextPtr query_context, const StorageMetadataPtr & metadata_snapshot, SelectQueryInfo & query_info) const + ContextPtr query_context, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info) const { + const auto & metadata_snapshot = storage_snapshot->metadata; const auto & settings = query_context->getSettingsRef(); if (!settings.allow_experimental_projection_optimization || query_info.ignore_projections || query_info.is_projection_query) return std::nullopt; @@ -5074,7 +5123,7 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg { selectBestProjection( reader, - metadata_snapshot, + storage_snapshot, query_info, analysis_result.required_columns, candidate, @@ -5094,7 +5143,7 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg { selectBestProjection( reader, - metadata_snapshot, + storage_snapshot, query_info, analysis_result.required_columns, candidate, @@ -5134,12 +5183,12 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg QueryProcessingStage::Enum MergeTreeData::getQueryProcessingStage( ContextPtr query_context, QueryProcessingStage::Enum to_stage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info) const { if (to_stage >= QueryProcessingStage::Enum::WithMergeableState) { - if (auto projection = getQueryProcessingStageWithAggregateProjection(query_context, metadata_snapshot, query_info)) + if (auto projection = getQueryProcessingStageWithAggregateProjection(query_context, storage_snapshot, query_info)) { query_info.projection = std::move(projection); if (query_info.projection->desc->type == ProjectionDescription::Type::Aggregate) @@ -6064,6 +6113,52 @@ ReservationPtr MergeTreeData::balancedReservation( return reserved_space; } +ColumnsDescription MergeTreeData::getObjectColumns( + const DataPartsVector & parts, const ColumnsDescription & storage_columns) +{ + return DB::getObjectColumns( + parts.begin(), parts.end(), + storage_columns, [](const auto & part) -> const auto & { return part->getColumns(); }); +} + +ColumnsDescription MergeTreeData::getObjectColumns( + 
boost::iterator_range<DataPartIteratorByStateAndInfo> range, const ColumnsDescription & storage_columns)
+{
+    return DB::getObjectColumns(
+        range.begin(), range.end(),
+        storage_columns, [](const auto & part) -> const auto & { return part->getColumns(); });
+}
+
+void MergeTreeData::resetObjectColumnsFromActiveParts(const DataPartsLock & /*lock*/)
+{
+    auto metadata_snapshot = getInMemoryMetadataPtr();
+    const auto & columns = metadata_snapshot->getColumns();
+    if (!hasObjectColumns(columns))
+        return;
+
+    auto range = getDataPartsStateRange(DataPartState::Active);
+    object_columns = getObjectColumns(range, columns);
+}
+
+void MergeTreeData::updateObjectColumns(const DataPartPtr & part, const DataPartsLock & /*lock*/)
+{
+    auto metadata_snapshot = getInMemoryMetadataPtr();
+    const auto & columns = metadata_snapshot->getColumns();
+    if (!hasObjectColumns(columns))
+        return;
+
+    DB::updateObjectColumns(object_columns, part->getColumns());
+}
+
+StorageSnapshotPtr MergeTreeData::getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot) const
+{
+    auto snapshot_data = std::make_unique<SnapshotData>();
+
+    auto lock = lockParts();
+    snapshot_data->parts = getDataPartsVectorUnlocked({DataPartState::Active}, lock);
+    return std::make_shared<StorageSnapshot>(*this, metadata_snapshot, object_columns, std::move(snapshot_data));
+}
+
 CurrentlySubmergingEmergingTagger::~CurrentlySubmergingEmergingTagger()
 {
     std::lock_guard lock(storage.currently_submerging_emerging_mutex);
diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h
index d1c48b19985..86f60fd428d 100644
--- a/src/Storages/MergeTree/MergeTreeData.h
+++ b/src/Storages/MergeTree/MergeTreeData.h
@@ -397,12 +397,12 @@ public:
         ContextPtr query_context) const;

     std::optional getQueryProcessingStageWithAggregateProjection(
-        ContextPtr query_context, const StorageMetadataPtr & metadata_snapshot, SelectQueryInfo & query_info) const;
+        ContextPtr query_context, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info) const;

     QueryProcessingStage::Enum getQueryProcessingStage(
         ContextPtr query_context,
         QueryProcessingStage::Enum to_stage,
-        const StorageMetadataPtr & metadata_snapshot,
+        const StorageSnapshotPtr & storage_snapshot,
         SelectQueryInfo & info) const override;

     ReservationPtr reserveSpace(UInt64 expected_size, VolumePtr & volume) const;
@@ -417,10 +417,21 @@ public:

     bool supportsSubcolumns() const override { return true; }

+    bool supportsDynamicSubcolumns() const override { return true; }
+
     NamesAndTypesList getVirtuals() const override;

     bool mayBenefitFromIndexForIn(const ASTPtr & left_in_operand, ContextPtr, const StorageMetadataPtr & metadata_snapshot) const override;

+    /// Snapshot for MergeTree contains the current set of data parts
+    /// at the moment of the start of the query.
+    struct SnapshotData : public StorageSnapshot::Data
+    {
+        DataPartsVector parts;
+    };
+
+    StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot) const override;
+
     /// Load the set of data parts from disk. Call once - immediately after the object is created.
     void loadDataParts(bool skip_sanity_checks);

@@ -431,13 +442,23 @@ public:

     /// Returns a copy of the list so that the caller shouldn't worry about locks.
     DataParts getDataParts(const DataPartStates & affordable_states) const;

+    DataPartsVector getDataPartsVectorUnlocked(
+        const DataPartStates & affordable_states,
+        const DataPartsLock & lock,
+        DataPartStateVector * out_states = nullptr,
+        bool require_projection_parts = false) const;
+
     /// Returns sorted list of the parts with specified states
     /// out_states will contain snapshot of each part state
     DataPartsVector getDataPartsVector(
-        const DataPartStates & affordable_states, DataPartStateVector * out_states = nullptr, bool require_projection_parts = false) const;
+        const DataPartStates & affordable_states,
+        DataPartStateVector * out_states = nullptr,
+        bool require_projection_parts = false) const;

     /// Returns absolutely all parts (and snapshot of their states)
-    DataPartsVector getAllDataPartsVector(DataPartStateVector * out_states = nullptr, bool require_projection_parts = false) const;
+    DataPartsVector getAllDataPartsVector(
+        DataPartStateVector * out_states = nullptr,
+        bool require_projection_parts = false) const;

     /// Returns all detached parts
     DetachedPartsInfo getDetachedParts() const;
@@ -689,6 +710,12 @@ public:
         return column_sizes;
     }

+    const ColumnsDescription & getObjectColumns() const { return object_columns; }
+
+    /// Creates a description of columns of data type Object from the range of data parts.
+    static ColumnsDescription getObjectColumns(
+        const DataPartsVector & parts, const ColumnsDescription & storage_columns);
+
     IndexSizeByName getSecondaryIndexSizes() const override
     {
         auto lock = lockParts();
@@ -978,6 +1005,11 @@ protected:
     DataPartsIndexes::index<TagByInfo>::type & data_parts_by_info;
     DataPartsIndexes::index<TagByStateAndInfo>::type & data_parts_by_state_and_info;

+    /// Current description of columns of data type Object.
+    /// It changes only when the set of parts is changed and is
+    /// protected by @data_parts_mutex.
+    ColumnsDescription object_columns;
+
     MergeTreePartsMover parts_mover;

     /// Executors are common for both ReplicatedMergeTree and plain MergeTree
@@ -1014,6 +1046,10 @@ protected:
         return {begin, end};
     }

+    /// Creates a description of columns of data type Object from the range of data parts.
+    static ColumnsDescription getObjectColumns(
+        boost::iterator_range<DataPartIteratorByStateAndInfo> range, const ColumnsDescription & storage_columns);
+
     std::optional<UInt64> totalRowsByPartitionPredicateImpl(
         const SelectQueryInfo & query_info, ContextPtr context, const DataPartsVector & parts) const;

@@ -1197,6 +1233,9 @@ private:
         MutableDataPartsVector & parts_from_wal,
         DataPartsLock & part_lock);

+    void resetObjectColumnsFromActiveParts(const DataPartsLock & lock);
+    void updateObjectColumns(const DataPartPtr & part, const DataPartsLock & lock);
+
     /// Create zero-copy exclusive lock for part and disk. Useful for coordination of
     /// distributed operations which can lead to data duplication. Implemented only in ReplicatedMergeTree.
virtual std::optional tryCreateZeroCopyExclusiveLock(const String &, const DiskPtr &) { return std::nullopt; } diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 092ca717964..63d2fe41e48 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -117,7 +117,7 @@ static RelativeSize convertAbsoluteSampleSizeToRelative(const ASTPtr & node, siz QueryPlanPtr MergeTreeDataSelectExecutor::read( const Names & column_names_to_return, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, ContextPtr context, const UInt64 max_block_size, @@ -130,13 +130,17 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( return std::make_unique(); const auto & settings = context->getSettingsRef(); + const auto & metadata_for_reading = storage_snapshot->getMetadataForQuery(); + + const auto & snapshot_data = assert_cast(*storage_snapshot->data); + const auto & parts = snapshot_data.parts; + if (!query_info.projection) { auto plan = readFromParts( - query_info.merge_tree_select_result_ptr ? MergeTreeData::DataPartsVector{} : data.getDataPartsVector(), + query_info.merge_tree_select_result_ptr ? MergeTreeData::DataPartsVector{} : parts, column_names_to_return, - metadata_snapshot, - metadata_snapshot, + storage_snapshot, query_info, context, max_block_size, @@ -146,7 +150,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( enable_parallel_reading); if (plan->isInitialized() && settings.allow_experimental_projection_optimization && settings.force_optimize_projection - && !metadata_snapshot->projections.empty()) + && !metadata_for_reading->projections.empty()) throw Exception( "No projection is used when allow_experimental_projection_optimization = 1 and force_optimize_projection = 1", ErrorCodes::PROJECTION_NOT_USED); @@ -178,8 +182,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( projection_plan = readFromParts( {}, query_info.projection->required_columns, - metadata_snapshot, - query_info.projection->desc->metadata, + storage_snapshot, query_info, context, max_block_size, @@ -1204,8 +1207,7 @@ MergeTreeDataSelectAnalysisResultPtr MergeTreeDataSelectExecutor::estimateNumMar QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( MergeTreeData::DataPartsVector parts, const Names & column_names_to_return, - const StorageMetadataPtr & metadata_snapshot_base, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, ContextPtr context, const UInt64 max_block_size, @@ -1237,8 +1239,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( virt_column_names, data, query_info, - metadata_snapshot, - metadata_snapshot_base, + storage_snapshot, context, max_block_size, num_streams, diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h index 3dde324ce22..e0647aa1ed2 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h @@ -28,7 +28,7 @@ public: QueryPlanPtr read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, ContextPtr context, UInt64 max_block_size, @@ -41,8 +41,7 @@ public: QueryPlanPtr readFromParts( MergeTreeData::DataPartsVector parts, const Names & column_names, - 
const StorageMetadataPtr & metadata_snapshot_base, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, ContextPtr context, UInt64 max_block_size, diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index d16b5274a45..4805a273c70 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -281,6 +282,16 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPart( { TemporaryPart temp_part; Block & block = block_with_partition.block; + auto columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames()); + auto storage_snapshot = data.getStorageSnapshot(metadata_snapshot); + + if (!storage_snapshot->object_columns.empty()) + { + auto extended_storage_columns = storage_snapshot->getColumns( + GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects()); + + convertObjectsToTuples(columns, block, extended_storage_columns); + } static const String TMP_PREFIX = "tmp_insert_"; @@ -357,7 +368,6 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPart( for (const auto & ttl_entry : move_ttl_entries) updateTTL(ttl_entry, move_ttl_infos, move_ttl_infos.moves_ttl[ttl_entry.result_column], block, false); - NamesAndTypesList columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames()); ReservationPtr reservation = data.reserveSpacePreferringTTLRules(metadata_snapshot, expected_size, move_ttl_infos, time(nullptr), 0, true); VolumePtr volume = data.getStoragePolicy()->getVolume(0); @@ -426,7 +436,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPart( auto compression_codec = data.getContext()->chooseCompressionCodec(0, 0); const auto & index_factory = MergeTreeIndexFactory::instance(); - auto out = std::make_unique(new_data_part, metadata_snapshot,columns, + auto out = std::make_unique(new_data_part, metadata_snapshot, columns, index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec); out->writeWithPermutation(block, perm_ptr); diff --git a/src/Storages/MergeTree/MergeTreePartition.cpp b/src/Storages/MergeTree/MergeTreePartition.cpp index 706d72771e9..4edf23bc0fb 100644 --- a/src/Storages/MergeTree/MergeTreePartition.cpp +++ b/src/Storages/MergeTree/MergeTreePartition.cpp @@ -124,6 +124,18 @@ namespace for (const auto & elem : x) applyVisitor(*this, elem); } + void operator() (const Object & x) const + { + UInt8 type = Field::Types::Object; + hash.update(type); + hash.update(x.size()); + + for (const auto & [key, value]: x) + { + hash.update(key); + applyVisitor(*this, value); + } + } void operator() (const DecimalField & x) const { UInt8 type = Field::Types::Decimal32; diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index 3c31ffa7c97..87839edc46f 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -23,7 +23,7 @@ MergeTreeReadPool::MergeTreeReadPool( size_t min_marks_for_concurrent_read_, RangesInDataParts && parts_, const MergeTreeData & data_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const PrewhereInfoPtr & prewhere_info_, const Names & column_names_, const BackoffSettings & backoff_settings_, @@ -32,7 +32,7 @@ 
MergeTreeReadPool::MergeTreeReadPool( : backoff_settings{backoff_settings_} , backoff_state{threads_} , data{data_} - , metadata_snapshot{metadata_snapshot_} + , storage_snapshot{storage_snapshot_} , column_names{column_names_} , do_not_steal_tasks{do_not_steal_tasks_} , predict_block_size_bytes{preferred_block_size_bytes_ > 0} @@ -146,7 +146,7 @@ MergeTreeReadTaskPtr MergeTreeReadPool::getTask(size_t min_marks_to_read, size_t Block MergeTreeReadPool::getHeader() const { - return metadata_snapshot->getSampleBlockForColumns(column_names, data.getVirtuals(), data.getStorageID()); + return storage_snapshot->getSampleBlockForColumns(column_names); } void MergeTreeReadPool::profileFeedback(ReadBufferFromFileBase::ProfileInfo info) @@ -192,7 +192,7 @@ void MergeTreeReadPool::profileFeedback(ReadBufferFromFileBase::ProfileInfo info std::vector MergeTreeReadPool::fillPerPartInfo(const RangesInDataParts & parts) { std::vector per_part_sum_marks; - Block sample_block = metadata_snapshot->getSampleBlock(); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); is_part_on_remote_disk.resize(parts.size()); for (const auto i : collections::range(0, parts.size())) @@ -209,7 +209,7 @@ std::vector MergeTreeReadPool::fillPerPartInfo(const RangesInDataParts & per_part_sum_marks.push_back(sum_marks); - auto task_columns = getReadTaskColumns(data, metadata_snapshot, part.data_part, column_names, prewhere_info); + auto task_columns = getReadTaskColumns(data, storage_snapshot, part.data_part, column_names, prewhere_info); auto size_predictor = !predict_block_size_bytes ? nullptr : MergeTreeBaseSelectProcessor::getSizePredictor(part.data_part, task_columns, sample_block); diff --git a/src/Storages/MergeTree/MergeTreeReadPool.h b/src/Storages/MergeTree/MergeTreeReadPool.h index 4ab4393ef5a..0c93c701448 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.h +++ b/src/Storages/MergeTree/MergeTreeReadPool.h @@ -71,7 +71,7 @@ private: public: MergeTreeReadPool( size_t threads_, size_t sum_marks_, size_t min_marks_for_concurrent_read_, - RangesInDataParts && parts_, const MergeTreeData & data_, const StorageMetadataPtr & metadata_snapshot_, + RangesInDataParts && parts_, const MergeTreeData & data_, const StorageSnapshotPtr & storage_snapshot_, const PrewhereInfoPtr & prewhere_info_, const Names & column_names_, const BackoffSettings & backoff_settings_, size_t preferred_block_size_bytes_, @@ -95,7 +95,7 @@ private: const RangesInDataParts & parts, size_t min_marks_for_concurrent_read); const MergeTreeData & data; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; const Names column_names; bool do_not_steal_tasks; bool predict_block_size_bytes; diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp index 2e17611cd93..b943c3c8718 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp @@ -52,6 +52,15 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( auto name_and_type = columns.begin(); for (size_t i = 0; i < columns_num; ++i, ++name_and_type) { + if (name_and_type->isSubcolumn()) + { + auto storage_column_from_part = getColumnFromPart( + {name_and_type->getNameInStorage(), name_and_type->getTypeInStorage()}); + + if (!storage_column_from_part.type->tryGetSubcolumnType(name_and_type->getSubcolumnName())) + continue; + } + auto column_from_part = getColumnFromPart(*name_and_type); auto position = 
data_part->getColumnPosition(column_from_part.getNameInStorage()); diff --git a/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp b/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp index 9599e3ee82c..c1b0067dbb0 100644 --- a/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index 2d4d3617cee..3245134c470 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -9,7 +9,7 @@ namespace DB MergeTreeSelectProcessor::MergeTreeSelectProcessor( const MergeTreeData & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const MergeTreeData::DataPartPtr & owned_data_part_, UInt64 max_block_size_rows_, size_t preferred_block_size_bytes_, @@ -25,13 +25,13 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor( bool has_limit_below_one_block_, std::optional extension_) : MergeTreeBaseSelectProcessor{ - metadata_snapshot_->getSampleBlockForColumns(required_columns_, storage_.getVirtuals(), storage_.getStorageID()), - storage_, metadata_snapshot_, prewhere_info_, std::move(actions_settings), max_block_size_rows_, + storage_snapshot_->getSampleBlockForColumns(required_columns_), + storage_, storage_snapshot_, prewhere_info_, std::move(actions_settings), max_block_size_rows_, preferred_block_size_bytes_, preferred_max_column_in_block_size_bytes_, reader_settings_, use_uncompressed_cache_, virt_column_names_, extension_}, required_columns{std::move(required_columns_)}, data_part{owned_data_part_}, - sample_block(metadata_snapshot_->getSampleBlock()), + sample_block(storage_snapshot_->metadata->getSampleBlock()), all_mark_ranges(std::move(mark_ranges_)), part_index_in_query(part_index_in_query_), has_limit_below_one_block(has_limit_below_one_block_), @@ -48,7 +48,7 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor( void MergeTreeSelectProcessor::initializeReaders() { task_columns = getReadTaskColumns( - storage, metadata_snapshot, data_part, + storage, storage_snapshot, data_part, required_columns, prewhere_info); /// Will be used to distinguish between PREWHERE and WHERE columns when applying filter @@ -60,12 +60,12 @@ void MergeTreeSelectProcessor::initializeReaders() owned_mark_cache = storage.getContext()->getMarkCache(); - reader = data_part->getReader(task_columns.columns, metadata_snapshot, all_mark_ranges, - owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings); + reader = data_part->getReader(task_columns.columns, storage_snapshot->getMetadataForQuery(), + all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings); if (prewhere_info) - pre_reader = data_part->getReader(task_columns.pre_columns, metadata_snapshot, all_mark_ranges, - owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings); + pre_reader = data_part->getReader(task_columns.pre_columns, storage_snapshot->getMetadataForQuery(), + all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings); } diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.h b/src/Storages/MergeTree/MergeTreeSelectProcessor.h index 2ecdc3b59a8..4b3a46fc53c 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.h @@ -18,7 +18,7 @@ class 
MergeTreeSelectProcessor : public MergeTreeBaseSelectProcessor public: MergeTreeSelectProcessor( const MergeTreeData & storage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot_, const MergeTreeData::DataPartPtr & owned_data_part, UInt64 max_block_size_rows, size_t preferred_block_size_bytes, diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp index 37012aa6570..5dbc59ba2d5 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp @@ -11,15 +11,15 @@ namespace ErrorCodes MergeTreeSequentialSource::MergeTreeSequentialSource( const MergeTreeData & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, MergeTreeData::DataPartPtr data_part_, Names columns_to_read_, bool read_with_direct_io_, bool take_column_types_from_storage, bool quiet) - : SourceWithProgress(metadata_snapshot_->getSampleBlockForColumns(columns_to_read_, storage_.getVirtuals(), storage_.getStorageID())) + : SourceWithProgress(storage_snapshot_->getSampleBlockForColumns(columns_to_read_)) , storage(storage_) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , data_part(std::move(data_part_)) , columns_to_read(std::move(columns_to_read_)) , read_with_direct_io(read_with_direct_io_) @@ -41,11 +41,12 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( addTotalRowsApprox(data_part->rows_count); /// Add columns because we don't want to read empty blocks - injectRequiredColumns(storage, metadata_snapshot, data_part, columns_to_read); + injectRequiredColumns(storage, storage_snapshot->metadata, data_part, columns_to_read); NamesAndTypesList columns_for_reader; if (take_column_types_from_storage) { - columns_for_reader = metadata_snapshot->getColumns().getByNames(ColumnsDescription::AllPhysical, columns_to_read, false); + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects(); + columns_for_reader = storage_snapshot->getColumnsByNames(options, columns_to_read); } else { @@ -63,7 +64,7 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( .save_marks_in_cache = false }; - reader = data_part->getReader(columns_for_reader, metadata_snapshot, + reader = data_part->getReader(columns_for_reader, storage_snapshot->metadata, MarkRanges{MarkRange(0, data_part->getMarksCount())}, /* uncompressed_cache = */ nullptr, mark_cache.get(), reader_settings); } diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.h b/src/Storages/MergeTree/MergeTreeSequentialSource.h index a7405140c6d..962b2035b16 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.h +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.h @@ -14,7 +14,7 @@ class MergeTreeSequentialSource : public SourceWithProgress public: MergeTreeSequentialSource( const MergeTreeData & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, MergeTreeData::DataPartPtr data_part_, Names columns_to_read_, bool read_with_direct_io_, @@ -35,7 +35,7 @@ protected: private: const MergeTreeData & storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; /// Data part will not be removed if the pointer owns it MergeTreeData::DataPartPtr data_part; diff --git a/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.cpp index 145d292138a..063f018b1a4 
100644 --- a/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.cpp @@ -20,7 +20,7 @@ MergeTreeThreadSelectProcessor::MergeTreeThreadSelectProcessor( size_t preferred_block_size_bytes_, size_t preferred_max_column_in_block_size_bytes_, const MergeTreeData & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, bool use_uncompressed_cache_, const PrewhereInfoPtr & prewhere_info_, ExpressionActionsSettings actions_settings, @@ -29,7 +29,7 @@ MergeTreeThreadSelectProcessor::MergeTreeThreadSelectProcessor( std::optional extension_) : MergeTreeBaseSelectProcessor{ - pool_->getHeader(), storage_, metadata_snapshot_, prewhere_info_, std::move(actions_settings), max_block_size_rows_, + pool_->getHeader(), storage_, storage_snapshot_, prewhere_info_, std::move(actions_settings), max_block_size_rows_, preferred_block_size_bytes_, preferred_max_column_in_block_size_bytes_, reader_settings_, use_uncompressed_cache_, virt_column_names_, extension_}, thread{thread_}, @@ -103,6 +103,7 @@ void MergeTreeThreadSelectProcessor::finalizeNewTask() /// Allows pool to reduce number of threads in case of too slow reads. auto profile_callback = [this](ReadBufferFromFileBase::ProfileInfo info_) { pool->profileFeedback(info_); }; + const auto & metadata_snapshot = storage_snapshot->metadata; if (!reader) { diff --git a/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.h b/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.h index ae25ca2a88a..3bba42bed28 100644 --- a/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.h @@ -22,7 +22,7 @@ public: size_t preferred_block_size_bytes_, size_t preferred_max_column_in_block_size_bytes_, const MergeTreeData & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, bool use_uncompressed_cache_, const PrewhereInfoPtr & prewhere_info_, ExpressionActionsSettings actions_settings, diff --git a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h index 1dc1bd1eca4..854f070d0e0 100644 --- a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h +++ b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h @@ -24,7 +24,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -35,8 +35,7 @@ public: .readFromParts( parts, column_names, - metadata_snapshot, - metadata_snapshot, + storage_snapshot, query_info, context, max_block_size, diff --git a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp index 582a568cb48..d12e91f62e4 100644 --- a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp +++ b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp @@ -270,7 +270,7 @@ bool StorageMaterializedPostgreSQL::needRewriteQueryWithFinal(const Names & colu Pipe StorageMaterializedPostgreSQL::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & query_info, ContextPtr context_, QueryProcessingStage::Enum processed_stage, @@ -279,7 +279,7 @@ Pipe StorageMaterializedPostgreSQL::read( { auto nested_table = getNested(); - auto pipe = 
readFinalFromNestedStorage(nested_table, column_names, metadata_snapshot, + auto pipe = readFinalFromNestedStorage(nested_table, column_names, query_info, context_, processed_stage, max_block_size, num_streams); auto lock = lockForShare(context_->getCurrentQueryId(), context_->getSettingsRef().lock_acquire_timeout); diff --git a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h index e6ce3bbdf65..e41eb8ee98f 100644 --- a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h +++ b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.h @@ -85,7 +85,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context_, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/RabbitMQ/RabbitMQSource.cpp b/src/Storages/RabbitMQ/RabbitMQSource.cpp index be2806eb42a..4946a3537f9 100644 --- a/src/Storages/RabbitMQ/RabbitMQSource.cpp +++ b/src/Storages/RabbitMQ/RabbitMQSource.cpp @@ -8,12 +8,11 @@ namespace DB { -static std::pair getHeaders(StorageRabbitMQ & storage, const StorageMetadataPtr & metadata_snapshot) +static std::pair getHeaders(const StorageSnapshotPtr & storage_snapshot) { - auto non_virtual_header = metadata_snapshot->getSampleBlockNonMaterialized(); - auto virtual_header = metadata_snapshot->getSampleBlockForColumns( - {"_exchange_name", "_channel_id", "_delivery_tag", "_redelivered", "_message_id", "_timestamp"}, - storage.getVirtuals(), storage.getStorageID()); + auto non_virtual_header = storage_snapshot->metadata->getSampleBlockNonMaterialized(); + auto virtual_header = storage_snapshot->getSampleBlockForColumns( + {"_exchange_name", "_channel_id", "_delivery_tag", "_redelivered", "_message_id", "_timestamp"}); return {non_virtual_header, virtual_header}; } @@ -29,15 +28,15 @@ static Block getSampleBlock(const Block & non_virtual_header, const Block & virt RabbitMQSource::RabbitMQSource( StorageRabbitMQ & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, ContextPtr context_, const Names & columns, size_t max_block_size_, bool ack_in_suffix_) : RabbitMQSource( storage_, - metadata_snapshot_, - getHeaders(storage_, metadata_snapshot_), + storage_snapshot_, + getHeaders(storage_snapshot_), context_, columns, max_block_size_, @@ -47,7 +46,7 @@ RabbitMQSource::RabbitMQSource( RabbitMQSource::RabbitMQSource( StorageRabbitMQ & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, std::pair headers, ContextPtr context_, const Names & columns, @@ -55,7 +54,7 @@ RabbitMQSource::RabbitMQSource( bool ack_in_suffix_) : SourceWithProgress(getSampleBlock(headers.first, headers.second)) , storage(storage_) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , context(context_) , column_names(columns) , max_block_size(max_block_size_) diff --git a/src/Storages/RabbitMQ/RabbitMQSource.h b/src/Storages/RabbitMQ/RabbitMQSource.h index f3ceac8e1e5..ff46408db42 100644 --- a/src/Storages/RabbitMQ/RabbitMQSource.h +++ b/src/Storages/RabbitMQ/RabbitMQSource.h @@ -14,7 +14,7 @@ class RabbitMQSource : public SourceWithProgress public: RabbitMQSource( StorageRabbitMQ & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, ContextPtr context_, const Names & columns, size_t max_block_size_, @@ -34,7 +34,7 @@ public: 
private: StorageRabbitMQ & storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; ContextPtr context; Names column_names; const size_t max_block_size; @@ -48,7 +48,7 @@ private: RabbitMQSource( StorageRabbitMQ & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, std::pair headers, ContextPtr context_, const Names & columns, diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp index ac299657ae6..cadfa85299c 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp @@ -648,7 +648,7 @@ void StorageRabbitMQ::unbindExchange() Pipe StorageRabbitMQ::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /* query_info */, ContextPtr local_context, QueryProcessingStage::Enum /* processed_stage */, @@ -669,7 +669,7 @@ Pipe StorageRabbitMQ::read( std::lock_guard lock(loop_mutex); - auto sample_block = metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()); + auto sample_block = storage_snapshot->getSampleBlockForColumns(column_names); auto modified_context = addSettings(local_context); if (!connection->isConnected()) @@ -688,7 +688,7 @@ Pipe StorageRabbitMQ::read( for (size_t i = 0; i < num_created_consumers; ++i) { auto rabbit_source = std::make_shared( - *this, metadata_snapshot, modified_context, column_names, 1, rabbitmq_settings->rabbitmq_commit_on_select); + *this, storage_snapshot, modified_context, column_names, 1, rabbitmq_settings->rabbitmq_commit_on_select); auto converting_dag = ActionsDAG::makeConvertingActions( rabbit_source->getPort().getHeader().getColumnsWithTypeAndName(), @@ -1024,9 +1024,9 @@ bool StorageRabbitMQ::streamToViews() InterpreterInsertQuery interpreter(insert, rabbitmq_context, false, true, true); auto block_io = interpreter.execute(); - auto metadata_snapshot = getInMemoryMetadataPtr(); + auto storage_snapshot = getStorageSnapshot(getInMemoryMetadataPtr()); auto column_names = block_io.pipeline.getHeader().getNames(); - auto sample_block = metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()); + auto sample_block = storage_snapshot->getSampleBlockForColumns(column_names); auto block_size = getMaxBlockSize(); @@ -1039,7 +1039,7 @@ bool StorageRabbitMQ::streamToViews() for (size_t i = 0; i < num_created_consumers; ++i) { auto source = std::make_shared( - *this, metadata_snapshot, rabbitmq_context, column_names, block_size, false); + *this, storage_snapshot, rabbitmq_context, column_names, block_size, false); sources.emplace_back(source); pipes.emplace_back(source); diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.h b/src/Storages/RabbitMQ/StorageRabbitMQ.h index 9633326366d..394845bbc2f 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.h +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.h @@ -40,7 +40,7 @@ public: /// Always return virtual columns in addition to required columns Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/ReadFinalForExternalReplicaStorage.cpp b/src/Storages/ReadFinalForExternalReplicaStorage.cpp index 58b98aaa4c6..cf1c5c35629 100644 --- a/src/Storages/ReadFinalForExternalReplicaStorage.cpp +++ 
b/src/Storages/ReadFinalForExternalReplicaStorage.cpp @@ -27,7 +27,6 @@ bool needRewriteQueryWithFinalForStorage(const Names & column_names, const Stora Pipe readFinalFromNestedStorage( StoragePtr nested_storage, const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -36,7 +35,7 @@ Pipe readFinalFromNestedStorage( { NameSet column_names_set = NameSet(column_names.begin(), column_names.end()); auto lock = nested_storage->lockForShare(context->getCurrentQueryId(), context->getSettingsRef().lock_acquire_timeout); - const StorageMetadataPtr & nested_metadata = nested_storage->getInMemoryMetadataPtr(); + const auto & nested_metadata = nested_storage->getInMemoryMetadataPtr(); Block nested_header = nested_metadata->getSampleBlock(); ColumnWithTypeAndName & sign_column = nested_header.getByPosition(nested_header.columns() - 2); @@ -55,7 +54,8 @@ Pipe readFinalFromNestedStorage( filter_column_name = expressions->children.back()->getColumnName(); } - Pipe pipe = nested_storage->read(require_columns_name, nested_metadata, query_info, context, processed_stage, max_block_size, num_streams); + auto nested_snapshot = nested_storage->getStorageSnapshot(nested_metadata); + Pipe pipe = nested_storage->read(require_columns_name, nested_snapshot, query_info, context, processed_stage, max_block_size, num_streams); pipe.addTableLock(lock); pipe.addStorageHolder(nested_storage); diff --git a/src/Storages/ReadFinalForExternalReplicaStorage.h b/src/Storages/ReadFinalForExternalReplicaStorage.h index 979945a38c7..f21b396513f 100644 --- a/src/Storages/ReadFinalForExternalReplicaStorage.h +++ b/src/Storages/ReadFinalForExternalReplicaStorage.h @@ -16,7 +16,6 @@ bool needRewriteQueryWithFinalForStorage(const Names & column_names, const Stora Pipe readFinalFromNestedStorage( StoragePtr nested_storage, const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index bd525ca9e5a..05b30bb014e 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -432,19 +432,19 @@ void StorageEmbeddedRocksDB::initDB() Pipe StorageEmbeddedRocksDB::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, unsigned num_streams) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); FieldVectorPtr keys; bool all_scan = false; - Block sample_block = metadata_snapshot->getSampleBlock(); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); auto primary_key_data_type = sample_block.getByName(primary_key).type; std::tie(keys, all_scan) = getFilterKeys(primary_key, primary_key_data_type, query_info); if (all_scan) diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h index d45d3dd320c..52a08cbefd4 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h @@ -28,7 +28,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + 
const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index ead0d6b1260..801e1b80a20 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -145,10 +146,10 @@ StorageBuffer::StorageBuffer( class BufferSource : public SourceWithProgress { public: - BufferSource(const Names & column_names_, StorageBuffer::Buffer & buffer_, const StorageBuffer & storage, const StorageMetadataPtr & metadata_snapshot) - : SourceWithProgress( - metadata_snapshot->getSampleBlockForColumns(column_names_, storage.getVirtuals(), storage.getStorageID())) - , column_names_and_types(metadata_snapshot->getColumns().getByNames(ColumnsDescription::All, column_names_, true)) + BufferSource(const Names & column_names_, StorageBuffer::Buffer & buffer_, const StorageSnapshotPtr & storage_snapshot) + : SourceWithProgress(storage_snapshot->getSampleBlockForColumns(column_names_)) + , column_names_and_types(storage_snapshot->getColumnsByNames( + GetColumnsOptions(GetColumnsOptions::All).withSubcolumns(), column_names_)) , buffer(buffer_) {} String getName() const override { return "Buffer"; } @@ -189,7 +190,7 @@ private: QueryProcessingStage::Enum StorageBuffer::getQueryProcessingStage( ContextPtr local_context, QueryProcessingStage::Enum to_stage, - const StorageMetadataPtr &, + const StorageSnapshotPtr &, SelectQueryInfo & query_info) const { if (destination_id) @@ -201,7 +202,8 @@ QueryProcessingStage::Enum StorageBuffer::getQueryProcessingStage( /// TODO: Find a way to support projections for StorageBuffer query_info.ignore_projections = true; - return destination->getQueryProcessingStage(local_context, to_stage, destination->getInMemoryMetadataPtr(), query_info); + const auto & destination_metadata = destination->getInMemoryMetadataPtr(); + return destination->getQueryProcessingStage(local_context, to_stage, destination->getStorageSnapshot(destination_metadata), query_info); } return QueryProcessingStage::FetchColumns; @@ -210,7 +212,7 @@ QueryProcessingStage::Enum StorageBuffer::getQueryProcessingStage( Pipe StorageBuffer::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -218,7 +220,7 @@ Pipe StorageBuffer::read( const unsigned num_streams) { QueryPlan plan; - read(plan, column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + read(plan, column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); return plan.convertToPipe( QueryPlanOptimizationSettings::fromContext(local_context), BuildQueryPipelineSettings::fromContext(local_context)); @@ -227,13 +229,15 @@ Pipe StorageBuffer::read( void StorageBuffer::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned num_streams) { + const auto & metadata_snapshot = storage_snapshot->metadata; + if (destination_id) { auto destination = DatabaseCatalog::instance().getTable(destination_id, local_context); @@ -244,13 
+248,14 @@ void StorageBuffer::read(
         auto destination_lock = destination->lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout);

         auto destination_metadata_snapshot = destination->getInMemoryMetadataPtr();
+        auto destination_snapshot = destination->getStorageSnapshot(destination_metadata_snapshot);

         const bool dst_has_same_structure = std::all_of(column_names.begin(), column_names.end(), [metadata_snapshot, destination_metadata_snapshot](const String& column_name)
         {
             const auto & dest_columns = destination_metadata_snapshot->getColumns();
             const auto & our_columns = metadata_snapshot->getColumns();
-            auto dest_columm = dest_columns.tryGetColumnOrSubcolumn(ColumnsDescription::AllPhysical, column_name);
-            return dest_columm && dest_columm->type->equals(*our_columns.getColumnOrSubcolumn(ColumnsDescription::AllPhysical, column_name).type);
+            auto dest_column = dest_columns.tryGetColumnOrSubcolumn(GetColumnsOptions::AllPhysical, column_name);
+            return dest_column && dest_column->type->equals(*our_columns.getColumnOrSubcolumn(GetColumnsOptions::AllPhysical, column_name).type);
         });

         if (dst_has_same_structure)
@@ -260,7 +265,7 @@ void StorageBuffer::read(
                 /// The destination table has the same structure of the requested columns and we can simply read blocks from there.
                 destination->read(
-                    query_plan, column_names, destination_metadata_snapshot, query_info,
+                    query_plan, column_names, destination_snapshot, query_info,
                     local_context, processed_stage, max_block_size, num_streams);
             }
             else
@@ -295,7 +300,7 @@ void StorageBuffer::read(
                 else
                 {
                     destination->read(
-                        query_plan, columns_intersection, destination_metadata_snapshot, query_info,
+                        query_plan, columns_intersection, destination_snapshot, query_info,
                         local_context, processed_stage, max_block_size, num_streams);

                     if (query_plan.isInitialized())
@@ -352,7 +357,7 @@ void StorageBuffer::read(
         Pipes pipes_from_buffers;
         pipes_from_buffers.reserve(num_shards);
         for (auto & buf : buffers)
-            pipes_from_buffers.emplace_back(std::make_shared<BufferSource>(column_names, buf, *this, metadata_snapshot));
+            pipes_from_buffers.emplace_back(std::make_shared<BufferSource>(column_names, buf, storage_snapshot));

         pipe_from_buffers = Pipe::unitePipes(std::move(pipes_from_buffers));
     }
diff --git a/src/Storages/StorageBuffer.h b/src/Storages/StorageBuffer.h
index f04619a1d21..f589560008a 100644
--- a/src/Storages/StorageBuffer.h
+++ b/src/Storages/StorageBuffer.h
@@ -58,11 +58,11 @@ public:
     std::string getName() const override { return "Buffer"; }

     QueryProcessingStage::Enum
-    getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override;
+    getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override;

     Pipe read(
         const Names & column_names,
-        const StorageMetadataPtr & /*metadata_snapshot*/,
+        const StorageSnapshotPtr & storage_snapshot,
         SelectQueryInfo & query_info,
         ContextPtr context,
         QueryProcessingStage::Enum processed_stage,
@@ -72,7 +72,7 @@ public:
     void read(
         QueryPlan & query_plan,
         const Names & column_names,
-        const StorageMetadataPtr & metadata_snapshot,
+        const StorageSnapshotPtr & storage_snapshot,
         SelectQueryInfo & query_info,
         ContextPtr context,
         QueryProcessingStage::Enum processed_stage,
diff --git a/src/Storages/StorageDictionary.cpp b/src/Storages/StorageDictionary.cpp
index e6d856b80fc..b3d0c1f17de 100644
--- a/src/Storages/StorageDictionary.cpp
+++ b/src/Storages/StorageDictionary.cpp
@@ -163,7 +163,7 @@ void
StorageDictionary::checkTableCanBeDetached() const Pipe StorageDictionary::read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & /*query_info*/, ContextPtr local_context, QueryProcessingStage::Enum /*processed_stage*/, diff --git a/src/Storages/StorageDictionary.h b/src/Storages/StorageDictionary.h index 855d02b0947..bf9e6853233 100644 --- a/src/Storages/StorageDictionary.h +++ b/src/Storages/StorageDictionary.h @@ -26,7 +26,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 5bfb3b4ce45..1a390f784a2 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include @@ -55,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -277,9 +280,9 @@ void replaceConstantExpressions( ContextPtr context, const NamesAndTypesList & columns, ConstStoragePtr storage, - const StorageMetadataPtr & metadata_snapshot) + const StorageSnapshotPtr & storage_snapshot) { - auto syntax_result = TreeRewriter(context).analyze(node, columns, storage, metadata_snapshot); + auto syntax_result = TreeRewriter(context).analyze(node, columns, storage, storage_snapshot); Block block_with_constants = KeyCondition::getBlockWithConstants(node, syntax_result, context); InDepthNodeVisitor visitor(block_with_constants); @@ -423,7 +426,7 @@ StorageDistributed::StorageDistributed( QueryProcessingStage::Enum StorageDistributed::getQueryProcessingStage( ContextPtr local_context, QueryProcessingStage::Enum to_stage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info) const { const auto & settings = local_context->getSettingsRef(); @@ -437,7 +440,7 @@ QueryProcessingStage::Enum StorageDistributed::getQueryProcessingStage( /// (Anyway it will be calculated in the read()) if (nodes > 1 && settings.optimize_skip_unused_shards) { - ClusterPtr optimized_cluster = getOptimizedCluster(local_context, metadata_snapshot, query_info.query); + ClusterPtr optimized_cluster = getOptimizedCluster(local_context, storage_snapshot, query_info.query); if (optimized_cluster) { LOG_DEBUG(log, "Skipping irrelevant shards - the query will be sent to the following shards of the cluster (shard numbers): {}", @@ -590,9 +593,62 @@ std::optional StorageDistributed::getOptimizedQueryP return QueryProcessingStage::Complete; } +static bool requiresObjectColumns(const ColumnsDescription & all_columns, ASTPtr query) +{ + if (!hasObjectColumns(all_columns)) + return false; + + if (!query) + return true; + + RequiredSourceColumnsVisitor::Data columns_context; + RequiredSourceColumnsVisitor(columns_context).visit(query); + + auto required_columns = columns_context.requiredColumns(); + for (const auto & required_column : required_columns) + { + auto name_in_storage = Nested::splitName(required_column).first; + auto column_in_storage = all_columns.tryGetPhysical(name_in_storage); + + if (column_in_storage && isObject(column_in_storage->type)) + return true; + } + + return false; +} + +StorageSnapshotPtr StorageDistributed::getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot) const 
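+/// No query to inspect here, so Object columns are deduced whenever the table declares any (see requiresObjectColumns above).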
+{
+    return getStorageSnapshotForQuery(metadata_snapshot, nullptr);
+}
+
+StorageSnapshotPtr StorageDistributed::getStorageSnapshotForQuery(
+    const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query) const
+{
+    /// If the query doesn't use columns of type Object, don't deduce
+    /// concrete types for them, because it requires an extra round trip.
+    auto snapshot_data = std::make_unique<SnapshotData>();
+    if (!requiresObjectColumns(metadata_snapshot->getColumns(), query))
+        return std::make_shared<StorageSnapshot>(*this, metadata_snapshot, ColumnsDescription{}, std::move(snapshot_data));
+
+    snapshot_data->objects_by_shard = getExtendedObjectsOfRemoteTables(
+        *getCluster(),
+        StorageID{remote_database, remote_table},
+        metadata_snapshot->getColumns(),
+        getContext());
+
+    auto object_columns = DB::getObjectColumns(
+        snapshot_data->objects_by_shard.begin(),
+        snapshot_data->objects_by_shard.end(),
+        metadata_snapshot->getColumns(),
+        [](const auto & shard_num_and_columns) -> const auto & { return shard_num_and_columns.second; });
+
+    return std::make_shared<StorageSnapshot>(*this, metadata_snapshot, object_columns, std::move(snapshot_data));
+}
+
 Pipe StorageDistributed::read(
     const Names & column_names,
-    const StorageMetadataPtr & metadata_snapshot,
+    const StorageSnapshotPtr & storage_snapshot,
     SelectQueryInfo & query_info,
     ContextPtr local_context,
     QueryProcessingStage::Enum processed_stage,
@@ -600,7 +656,7 @@ Pipe StorageDistributed::read(
     const unsigned num_streams)
 {
     QueryPlan plan;
-    read(plan, column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams);
+    read(plan, column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams);
     return plan.convertToPipe(
         QueryPlanOptimizationSettings::fromContext(local_context), BuildQueryPipelineSettings::fromContext(local_context));
@@ -609,7 +665,7 @@ Pipe StorageDistributed::read(
 void StorageDistributed::read(
     QueryPlan & query_plan,
     const Names &,
-    const StorageMetadataPtr &,
+    const StorageSnapshotPtr & storage_snapshot,
     SelectQueryInfo & query_info,
     ContextPtr local_context,
     QueryProcessingStage::Enum processed_stage,
@@ -642,9 +698,12 @@ void StorageDistributed::read(
     if (!remote_table_function_ptr)
         main_table = StorageID{remote_database, remote_table};

+    const auto & snapshot_data = assert_cast<const SnapshotData &>(*storage_snapshot->data);
     ClusterProxy::SelectStreamFactory select_stream_factory = ClusterProxy::SelectStreamFactory(
         header,
+        snapshot_data.objects_by_shard,
+        storage_snapshot,
         processed_stage);

     ClusterProxy::executeQuery(
@@ -1077,7 +1136,7 @@ ClusterPtr StorageDistributed::getCluster() const
 }

 ClusterPtr StorageDistributed::getOptimizedCluster(
-    ContextPtr local_context, const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query_ptr) const
+    ContextPtr local_context, const StorageSnapshotPtr & storage_snapshot, const ASTPtr & query_ptr) const
 {
     ClusterPtr cluster = getCluster();
     const Settings & settings = local_context->getSettingsRef();
@@ -1086,7 +1145,7 @@ ClusterPtr StorageDistributed::getOptimizedCluster(
     if (has_sharding_key && sharding_key_is_usable)
     {
-        ClusterPtr optimized = skipUnusedShards(cluster, query_ptr, metadata_snapshot, local_context);
+        ClusterPtr optimized = skipUnusedShards(cluster, query_ptr, storage_snapshot, local_context);
         if (optimized)
             return optimized;
     }
@@ -1142,7 +1201,7 @@ IColumn::Selector StorageDistributed::createSelector(const ClusterPtr cluster, c
 ClusterPtr StorageDistributed::skipUnusedShards(
     ClusterPtr cluster,
     const ASTPtr & query_ptr,
-    const
StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, ContextPtr local_context) const { const auto & select = query_ptr->as(); @@ -1162,7 +1221,7 @@ ClusterPtr StorageDistributed::skipUnusedShards( condition_ast = select.prewhere() ? select.prewhere()->clone() : select.where()->clone(); } - replaceConstantExpressions(condition_ast, local_context, metadata_snapshot->getColumns().getAll(), shared_from_this(), metadata_snapshot); + replaceConstantExpressions(condition_ast, local_context, storage_snapshot->metadata->getColumns().getAll(), shared_from_this(), storage_snapshot); size_t limit = local_context->getSettingsRef().optimize_skip_unused_shards_limit; if (!limit || limit > SSIZE_MAX) @@ -1463,3 +1522,4 @@ void registerStorageDistributed(StorageFactory & factory) } } + diff --git a/src/Storages/StorageDistributed.h b/src/Storages/StorageDistributed.h index 45b1cd640ee..317463783ee 100644 --- a/src/Storages/StorageDistributed.h +++ b/src/Storages/StorageDistributed.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,7 @@ public: bool supportsFinal() const override { return true; } bool supportsPrewhere() const override { return true; } bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } StoragePolicyPtr getStoragePolicy() const override; /// Do not apply moving to PREWHERE optimization for distributed tables, @@ -59,12 +61,24 @@ public: bool isRemote() const override { return true; } + /// Snapshot for StorageDistributed contains descriptions + /// of columns of type Object for each shard at the moment + /// of the start of query. + struct SnapshotData : public StorageSnapshot::Data + { + ColumnsDescriptionByShardNum objects_by_shard; + }; + + StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot) const override; + StorageSnapshotPtr getStorageSnapshotForQuery( + const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query) const override; + QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override; + getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override; Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -74,7 +88,7 @@ public: void read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -175,10 +189,10 @@ private: /// Apply the following settings: /// - optimize_skip_unused_shards /// - force_optimize_skip_unused_shards - ClusterPtr getOptimizedCluster(ContextPtr, const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query_ptr) const; + ClusterPtr getOptimizedCluster(ContextPtr, const StorageSnapshotPtr & storage_snapshot, const ASTPtr & query_ptr) const; ClusterPtr skipUnusedShards( - ClusterPtr cluster, const ASTPtr & query_ptr, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) const; + ClusterPtr cluster, const ASTPtr & query_ptr, const StorageSnapshotPtr & storage_snapshot, ContextPtr context) const; /// This method returns optimal query 
processing stage. /// diff --git a/src/Storages/StorageExecutable.cpp b/src/Storages/StorageExecutable.cpp index 21143438725..d9e97f98d56 100644 --- a/src/Storages/StorageExecutable.cpp +++ b/src/Storages/StorageExecutable.cpp @@ -105,7 +105,7 @@ StorageExecutable::StorageExecutable( Pipe StorageExecutable::read( const Names & /*column_names*/, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -142,7 +142,7 @@ Pipe StorageExecutable::read( if (settings.is_executable_pool) transformToSingleBlockSources(inputs); - auto sample_block = metadata_snapshot->getSampleBlock(); + auto sample_block = storage_snapshot->metadata->getSampleBlock(); ShellCommandSourceConfiguration configuration; configuration.max_block_size = max_block_size; diff --git a/src/Storages/StorageExecutable.h b/src/Storages/StorageExecutable.h index b6248abae97..ede98ea5b47 100644 --- a/src/Storages/StorageExecutable.h +++ b/src/Storages/StorageExecutable.h @@ -31,7 +31,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageExternalDistributed.cpp b/src/Storages/StorageExternalDistributed.cpp index 40a2ad0b85e..18b8d4c037a 100644 --- a/src/Storages/StorageExternalDistributed.cpp +++ b/src/Storages/StorageExternalDistributed.cpp @@ -172,7 +172,7 @@ StorageExternalDistributed::StorageExternalDistributed( Pipe StorageExternalDistributed::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -184,7 +184,7 @@ Pipe StorageExternalDistributed::read( { pipes.emplace_back(shard->read( column_names, - metadata_snapshot, + storage_snapshot, query_info, context, processed_stage, diff --git a/src/Storages/StorageExternalDistributed.h b/src/Storages/StorageExternalDistributed.h index 33a58d324da..57767db10b0 100644 --- a/src/Storages/StorageExternalDistributed.h +++ b/src/Storages/StorageExternalDistributed.h @@ -32,7 +32,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 452eb16ebc5..93cceadaf93 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -447,28 +447,27 @@ public: static Block getBlockForSource( const StorageFilePtr & storage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const ColumnsDescription & columns_description, const FilesInfoPtr & files_info) { if (storage->isColumnOriented()) - return metadata_snapshot->getSampleBlockForColumns( - columns_description.getNamesOfPhysical(), storage->getVirtuals(), storage->getStorageID()); + return storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); else - return getHeader(metadata_snapshot, files_info->need_path_column, files_info->need_file_column); + return getHeader(storage_snapshot->metadata, files_info->need_path_column, files_info->need_file_column); } 
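For orientation, a minimal sketch of the call pattern this refactoring establishes on every read path: the header is now resolved through a storage snapshot rather than through raw metadata. The snippet is illustrative only and assumes the StorageSnapshot API introduced in this patch; `storage` and `column_names` are hypothetical caller-side variables.

// Hypothetical caller of the new interface; any IStorage works the same way.
auto metadata_snapshot = storage->getInMemoryMetadataPtr();
auto storage_snapshot = storage->getStorageSnapshot(metadata_snapshot);

// check() validates the names (including dynamic Object subcolumns and
// virtual columns) and rejects empty or duplicated lists.
storage_snapshot->check(column_names);

// The sample block for the source is built from the same snapshot.
Block header = storage_snapshot->getSampleBlockForColumns(column_names);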
StorageFileSource( std::shared_ptr storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, ContextPtr context_, UInt64 max_block_size_, FilesInfoPtr files_info_, ColumnsDescription columns_description_, std::unique_ptr read_buf_) - : SourceWithProgress(getBlockForSource(storage_, metadata_snapshot_, columns_description_, files_info_)) + : SourceWithProgress(getBlockForSource(storage_, storage_snapshot_, columns_description_, files_info_)) , storage(std::move(storage_)) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , files_info(std::move(files_info_)) , read_buf(std::move(read_buf_)) , columns_description(std::move(columns_description_)) @@ -518,8 +517,8 @@ public: auto get_block_for_format = [&]() -> Block { if (storage->isColumnOriented()) - return metadata_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); - return metadata_snapshot->getSampleBlock(); + return storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); + return storage_snapshot->metadata->getSampleBlock(); }; auto format = context->getInputFormat( @@ -582,7 +581,7 @@ public: private: std::shared_ptr storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; FilesInfoPtr files_info; String current_path; Block sample_block; @@ -603,7 +602,7 @@ private: Pipe StorageFile::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -617,7 +616,7 @@ Pipe StorageFile::read( if (paths.size() == 1 && !fs::exists(paths[0])) { if (context->getSettingsRef().engine_file_empty_if_not_exists) - return Pipe(std::make_shared(metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()))); + return Pipe(std::make_shared(storage_snapshot->getSampleBlockForColumns(column_names))); else throw Exception("File " + paths[0] + " doesn't exist", ErrorCodes::FILE_DOESNT_EXIST); } @@ -653,9 +652,9 @@ Pipe StorageFile::read( { if (isColumnOriented()) return ColumnsDescription{ - metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()).getNamesAndTypesList()}; + storage_snapshot->getSampleBlockForColumns(column_names).getNamesAndTypesList()}; else - return metadata_snapshot->getColumns(); + return storage_snapshot->metadata->getColumns(); }; /// In case of reading from fd we have to check whether we have already created @@ -667,7 +666,7 @@ Pipe StorageFile::read( read_buffer = std::move(peekable_read_buffer_from_fd); pipes.emplace_back(std::make_shared( - this_ptr, metadata_snapshot, context, max_block_size, files_info, get_columns_for_format(), std::move(read_buffer))); + this_ptr, storage_snapshot, context, max_block_size, files_info, get_columns_for_format(), std::move(read_buffer))); } return Pipe::unitePipes(std::move(pipes)); diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index bc2bd3bc933..86e75588e14 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -22,7 +22,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageGenerateRandom.cpp b/src/Storages/StorageGenerateRandom.cpp index 
8934fd0ccbf..aa7b17191b6 100644 --- a/src/Storages/StorageGenerateRandom.cpp +++ b/src/Storages/StorageGenerateRandom.cpp @@ -486,19 +486,19 @@ void registerStorageGenerateRandom(StorageFactory & factory) Pipe StorageGenerateRandom::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, unsigned num_streams) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); Pipes pipes; pipes.reserve(num_streams); - const ColumnsDescription & our_columns = metadata_snapshot->getColumns(); + const ColumnsDescription & our_columns = storage_snapshot->metadata->getColumns(); Block block_header; for (const auto & name : column_names) { diff --git a/src/Storages/StorageGenerateRandom.h b/src/Storages/StorageGenerateRandom.h index d11a43b1dd6..2894b17d409 100644 --- a/src/Storages/StorageGenerateRandom.h +++ b/src/Storages/StorageGenerateRandom.h @@ -17,7 +17,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageInMemoryMetadata.cpp b/src/Storages/StorageInMemoryMetadata.cpp index 8439c721c1f..15a761a5b84 100644 --- a/src/Storages/StorageInMemoryMetadata.cpp +++ b/src/Storages/StorageInMemoryMetadata.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -342,39 +343,6 @@ Block StorageInMemoryMetadata::getSampleBlock() const return res; } -Block StorageInMemoryMetadata::getSampleBlockForColumns( - const Names & column_names, const NamesAndTypesList & virtuals, const StorageID & storage_id) const -{ - Block res; - - HashMapWithSavedHash virtuals_map; - - /// Virtual columns must be appended after ordinary, because user can - /// override them. - for (const auto & column : virtuals) - virtuals_map[column.name] = &column.type; - - for (const auto & name : column_names) - { - auto column = getColumns().tryGetColumnOrSubcolumn(ColumnsDescription::All, name); - if (column) - { - res.insert({column->type->createColumn(), column->type, column->name}); - } - else if (auto * it = virtuals_map.find(name); it != virtuals_map.end()) - { - const auto & type = *it->getMapped(); - res.insert({type->createColumn(), type, name}); - } - else - throw Exception( - "Column " + backQuote(name) + " not found in table " + (storage_id.empty() ? 
"" : storage_id.getNameForLogs()), - ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); - } - - return res; -} - const KeyDescription & StorageInMemoryMetadata::getPartitionKey() const { return partition_key; @@ -499,18 +467,6 @@ namespace using NamesAndTypesMap = HashMapWithSavedHash; using UniqueStrings = HashSetWithSavedHash; - String listOfColumns(const NamesAndTypesList & available_columns) - { - WriteBufferFromOwnString ss; - for (auto it = available_columns.begin(); it != available_columns.end(); ++it) - { - if (it != available_columns.begin()) - ss << ", "; - ss << it->name; - } - return ss.str(); - } - NamesAndTypesMap getColumnsMap(const NamesAndTypesList & columns) { NamesAndTypesMap res; @@ -539,36 +495,16 @@ namespace } } -void StorageInMemoryMetadata::check(const Names & column_names, const NamesAndTypesList & virtuals, const StorageID & storage_id) const +String listOfColumns(const NamesAndTypesList & available_columns) { - if (column_names.empty()) + WriteBufferFromOwnString ss; + for (auto it = available_columns.begin(); it != available_columns.end(); ++it) { - auto list_of_columns = listOfColumns(getColumns().getAllPhysicalWithSubcolumns()); - throw Exception(ErrorCodes::EMPTY_LIST_OF_COLUMNS_QUERIED, - "Empty list of columns queried. There are columns: {}", list_of_columns); - } - - const auto virtuals_map = getColumnsMap(virtuals); - UniqueStrings unique_names; - - for (const auto & name : column_names) - { - bool has_column = getColumns().hasColumnOrSubcolumn(ColumnsDescription::AllPhysical, name) - || virtuals_map.find(name) != nullptr; - - if (!has_column) - { - auto list_of_columns = listOfColumns(getColumns().getAllPhysicalWithSubcolumns()); - throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, - "There is no column with name {} in table {}. There are columns: {}", - backQuote(name), storage_id.getNameForLogs(), list_of_columns); - } - - if (unique_names.end() != unique_names.find(name)) - throw Exception(ErrorCodes::COLUMN_QUERIED_MORE_THAN_ONCE, "Column {} queried more than once", name); - - unique_names.insert(name); + if (it != available_columns.begin()) + ss << ", "; + ss << it->name; } + return ss.str(); } void StorageInMemoryMetadata::check(const NamesAndTypesList & provided_columns) const @@ -589,7 +525,10 @@ void StorageInMemoryMetadata::check(const NamesAndTypesList & provided_columns) listOfColumns(available_columns)); const auto * available_type = it->getMapped(); - if (!column.type->equals(*available_type) && !isCompatibleEnumTypes(available_type, column.type.get())) + + if (!isObject(*available_type) + && !column.type->equals(*available_type) + && !isCompatibleEnumTypes(available_type, column.type.get())) throw Exception( ErrorCodes::TYPE_MISMATCH, "Type mismatch for column {}. Column has type {}, got type {}", @@ -636,7 +575,9 @@ void StorageInMemoryMetadata::check(const NamesAndTypesList & provided_columns, const auto * provided_column_type = it->getMapped(); const auto * available_column_type = jt->getMapped(); - if (!provided_column_type->equals(*available_column_type) && !isCompatibleEnumTypes(available_column_type, provided_column_type)) + if (!isObject(*provided_column_type) + && !provided_column_type->equals(*available_column_type) + && !isCompatibleEnumTypes(available_column_type, provided_column_type)) throw Exception( ErrorCodes::TYPE_MISMATCH, "Type mismatch for column {}. 
Column has type {}, got type {}", @@ -678,7 +619,9 @@ void StorageInMemoryMetadata::check(const Block & block, bool need_all) const listOfColumns(available_columns)); const auto * available_type = it->getMapped(); - if (!column.type->equals(*available_type) && !isCompatibleEnumTypes(available_type, column.type.get())) + if (!isObject(*available_type) + && !column.type->equals(*available_type) + && !isCompatibleEnumTypes(available_type, column.type.get())) throw Exception( ErrorCodes::TYPE_MISMATCH, "Type mismatch for column {}. Column has type {}, got type {}", diff --git a/src/Storages/StorageInMemoryMetadata.h b/src/Storages/StorageInMemoryMetadata.h index bdaed8b2624..a9ab96909f4 100644 --- a/src/Storages/StorageInMemoryMetadata.h +++ b/src/Storages/StorageInMemoryMetadata.h @@ -165,13 +165,6 @@ struct StorageInMemoryMetadata /// Storage metadata. Block getSampleBlockWithVirtuals(const NamesAndTypesList & virtuals) const; - - /// Block with ordinary + materialized + aliases + virtuals. Virtuals have - /// to be explicitly specified, because they are part of Storage type, not - /// Storage metadata. StorageID required only for more clear exception - /// message. - Block getSampleBlockForColumns( - const Names & column_names, const NamesAndTypesList & virtuals = {}, const StorageID & storage_id = StorageID::createEmpty()) const; /// Returns structure with partition key. const KeyDescription & getPartitionKey() const; /// Returns ASTExpressionList of partition key expression for storage or nullptr if there is none. @@ -234,10 +227,6 @@ struct StorageInMemoryMetadata const SelectQueryDescription & getSelectQuery() const; bool hasSelectQuery() const; - /// Verify that all the requested names are in the table and are set correctly: - /// list of names is not empty and the names do not repeat. - void check(const Names & column_names, const NamesAndTypesList & virtuals, const StorageID & storage_id) const; - /// Check that all the requested names are in the table and have the correct types. void check(const NamesAndTypesList & columns) const; @@ -253,4 +242,6 @@ struct StorageInMemoryMetadata using StorageMetadataPtr = std::shared_ptr; using MultiVersionStorageMetadataPtr = MultiVersion; +String listOfColumns(const NamesAndTypesList & available_columns); + } diff --git a/src/Storages/StorageInput.cpp b/src/Storages/StorageInput.cpp index 2ed7a77b59d..a21a14cc240 100644 --- a/src/Storages/StorageInput.cpp +++ b/src/Storages/StorageInput.cpp @@ -52,7 +52,7 @@ void StorageInput::setPipe(Pipe pipe_) Pipe StorageInput::read( const Names & /*column_names*/, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -66,7 +66,7 @@ Pipe StorageInput::read( { /// Send structure to the client. 
query_context->initializeInput(shared_from_this()); - return Pipe(std::make_shared(query_context, metadata_snapshot->getSampleBlock())); + return Pipe(std::make_shared(query_context, storage_snapshot->metadata->getSampleBlock())); } if (pipe.empty()) diff --git a/src/Storages/StorageInput.h b/src/Storages/StorageInput.h index b28bc143bb0..4c44213a06b 100644 --- a/src/Storages/StorageInput.h +++ b/src/Storages/StorageInput.h @@ -20,7 +20,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageJoin.cpp b/src/Storages/StorageJoin.cpp index 9b3ec6617c0..ecd182457e2 100644 --- a/src/Storages/StorageJoin.cpp +++ b/src/Storages/StorageJoin.cpp @@ -579,16 +579,16 @@ private: // TODO: multiple stream read and index read Pipe StorageJoin::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); - Block source_sample_block = metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()); + Block source_sample_block = storage_snapshot->getSampleBlockForColumns(column_names); RWLockImpl::LockHolder holder = tryLockTimedWithContext(rwlock, RWLockImpl::Read, context); return Pipe(std::make_shared(join, std::move(holder), max_block_size, source_sample_block)); } diff --git a/src/Storages/StorageJoin.h b/src/Storages/StorageJoin.h index ba59bc06334..ea71ff4be8f 100644 --- a/src/Storages/StorageJoin.h +++ b/src/Storages/StorageJoin.h @@ -51,7 +51,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index d5d1f312bec..d3923a190a1 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -765,14 +765,14 @@ void StorageLog::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr Pipe StorageLog::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, unsigned num_streams) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); auto lock_timeout = getLockTimeout(context); loadMarks(lock_timeout); @@ -782,7 +782,7 @@ Pipe StorageLog::read( throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED); if (!num_data_files || !file_checker.getFileSize(data_files[INDEX_WITH_REAL_ROW_COUNT].path)) - return Pipe(std::make_shared(metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()))); + return Pipe(std::make_shared(storage_snapshot->getSampleBlockForColumns(column_names))); const Marks & marks_with_real_row_count = data_files[INDEX_WITH_REAL_ROW_COUNT].marks; size_t num_marks = marks_with_real_row_count.size(); @@ -791,7 +791,8 @@ Pipe StorageLog::read( if 
(num_streams > max_streams) num_streams = max_streams; - auto all_columns = metadata_snapshot->getColumns().getByNames(ColumnsDescription::All, column_names, true); + auto options = GetColumnsOptions(GetColumnsOptions::All).withSubcolumns(); + auto all_columns = storage_snapshot->getColumnsByNames(options, column_names); all_columns = Nested::convertToSubcolumns(all_columns); std::vector offsets; diff --git a/src/Storages/StorageLog.h b/src/Storages/StorageLog.h index 8b2ef0ccac1..d6f3208f693 100644 --- a/src/Storages/StorageLog.h +++ b/src/Storages/StorageLog.h @@ -31,7 +31,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageMaterializedMySQL.cpp b/src/Storages/StorageMaterializedMySQL.cpp index 07817aa634e..dbc0dd9ae92 100644 --- a/src/Storages/StorageMaterializedMySQL.cpp +++ b/src/Storages/StorageMaterializedMySQL.cpp @@ -34,7 +34,7 @@ bool StorageMaterializedMySQL::needRewriteQueryWithFinal(const Names & column_na Pipe StorageMaterializedMySQL::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & /*storage_snapshot*/, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -44,7 +44,7 @@ Pipe StorageMaterializedMySQL::read( if (const auto * db = typeid_cast(database)) db->rethrowExceptionIfNeeded(); - return readFinalFromNestedStorage(nested_storage, column_names, metadata_snapshot, + return readFinalFromNestedStorage(nested_storage, column_names, query_info, context, processed_stage, max_block_size, num_streams); } diff --git a/src/Storages/StorageMaterializedMySQL.h b/src/Storages/StorageMaterializedMySQL.h index ae874649b40..953d83360fd 100644 --- a/src/Storages/StorageMaterializedMySQL.h +++ b/src/Storages/StorageMaterializedMySQL.h @@ -25,7 +25,7 @@ public: bool needRewriteQueryWithFinal(const Names & column_names) const override; Pipe read( - const Names & column_names, const StorageMetadataPtr & metadata_snapshot, SelectQueryInfo & query_info, + const Names & column_names, const StorageSnapshotPtr & metadata_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned num_streams) override; SinkToStoragePtr write(const ASTPtr &, const StorageMetadataPtr &, ContextPtr) override { throwNotAllowed(); } diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index 7c5ef5ac04c..008b42e3299 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -132,19 +132,20 @@ StorageMaterializedView::StorageMaterializedView( QueryProcessingStage::Enum StorageMaterializedView::getQueryProcessingStage( ContextPtr local_context, QueryProcessingStage::Enum to_stage, - const StorageMetadataPtr &, + const StorageSnapshotPtr &, SelectQueryInfo & query_info) const { /// TODO: Find a way to support projections for StorageMaterializedView. Why do we use different /// metadata for materialized view and target table? If they are the same, we can get rid of all /// converting and use it just like a normal view. 
query_info.ignore_projections = true; - return getTargetTable()->getQueryProcessingStage(local_context, to_stage, getTargetTable()->getInMemoryMetadataPtr(), query_info); + const auto & target_metadata = getTargetTable()->getInMemoryMetadataPtr(); + return getTargetTable()->getQueryProcessingStage(local_context, to_stage, getTargetTable()->getStorageSnapshot(target_metadata), query_info); } Pipe StorageMaterializedView::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -152,7 +153,7 @@ Pipe StorageMaterializedView::read( const unsigned num_streams) { QueryPlan plan; - read(plan, column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + read(plan, column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); return plan.convertToPipe( QueryPlanOptimizationSettings::fromContext(local_context), BuildQueryPipelineSettings::fromContext(local_context)); @@ -161,7 +162,7 @@ Pipe StorageMaterializedView::read( void StorageMaterializedView::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -171,15 +172,16 @@ void StorageMaterializedView::read( auto storage = getTargetTable(); auto lock = storage->lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout); auto target_metadata_snapshot = storage->getInMemoryMetadataPtr(); + auto target_storage_snapshot = storage->getStorageSnapshot(target_metadata_snapshot); if (query_info.order_optimizer) query_info.input_order_info = query_info.order_optimizer->getInputOrder(target_metadata_snapshot, local_context); - storage->read(query_plan, column_names, target_metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + storage->read(query_plan, column_names, target_storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); if (query_plan.isInitialized()) { - auto mv_header = getHeaderForProcessingStage(*this, column_names, metadata_snapshot, query_info, local_context, processed_stage); + auto mv_header = getHeaderForProcessingStage(column_names, storage_snapshot, query_info, local_context, processed_stage); auto target_header = query_plan.getCurrentDataStream().header; /// No need to convert columns that do not exist in the MV diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index 395560c1ca7..838f5278aa9 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -66,7 +66,7 @@ public: void shutdown() override; QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override; + getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override; StoragePtr getTargetTable() const; StoragePtr tryGetTargetTable() const; @@ -78,7 +78,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context,
QueryProcessingStage::Enum processed_stage, @@ -88,7 +88,7 @@ public: void read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 72851472b79..c3601b33a04 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -2,9 +2,13 @@ #include #include +#include +#include #include #include #include +#include +#include #include #include @@ -30,13 +34,13 @@ public: MemorySource( Names column_names_, - const StorageMemory & storage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, std::shared_ptr data_, std::shared_ptr> parallel_execution_index_, InitializerFunc initializer_func_ = {}) - : SourceWithProgress(metadata_snapshot->getSampleBlockForColumns(column_names_, storage.getVirtuals(), storage.getStorageID())) - , column_names_and_types(metadata_snapshot->getColumns().getByNames(ColumnsDescription::All, column_names_, true)) + : SourceWithProgress(storage_snapshot->getSampleBlockForColumns(column_names_)) + , column_names_and_types(storage_snapshot->getColumnsByNames( + GetColumnsOptions(GetColumnsOptions::All).withSubcolumns().withExtendedObjects(), column_names_)) , data(data_) , parallel_execution_index(parallel_execution_index_) , initializer_func(std::move(initializer_func_)) @@ -62,12 +66,20 @@ protected: } const Block & src = (*data)[current_index]; - Columns columns; - columns.reserve(column_names_and_types.size()); - /// Add only required columns to `res`. - for (const auto & elem : column_names_and_types) - columns.emplace_back(getColumnFromBlock(src, elem)); + Columns columns; + size_t num_columns = column_names_and_types.size(); + columns.reserve(num_columns); + + auto name_and_type = column_names_and_types.begin(); + for (size_t i = 0; i < num_columns; ++i) + { + columns.emplace_back(tryGetColumnFromBlock(src, *name_and_type)); + ++name_and_type; + } + + fillMissingColumns(columns, src.rows(), column_names_and_types, /*metadata_snapshot=*/ nullptr); + assert(std::all_of(columns.begin(), columns.end(), [](const auto & column) { return column != nullptr; })); return Chunk(std::move(columns), src.rows()); } @@ -101,7 +113,7 @@ public: const StorageMetadataPtr & metadata_snapshot_) : SinkToStorage(metadata_snapshot_->getSampleBlock()) , storage(storage_) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_.getStorageSnapshot(metadata_snapshot_)) { } @@ -110,7 +122,15 @@ public: void consume(Chunk chunk) override { auto block = getHeader().cloneWithColumns(chunk.getColumns()); - metadata_snapshot->check(block, true); + storage_snapshot->metadata->check(block, true); + if (!storage_snapshot->object_columns.empty()) + { + auto columns = storage_snapshot->metadata->getColumns().getAllPhysical().filter(block.getNames()); + auto extended_storage_columns = storage_snapshot->getColumns( + GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects()); + + convertObjectsToTuples(columns, block, extended_storage_columns); + } if (storage.compress) { @@ -151,7 +171,7 @@ private: Blocks new_blocks; StorageMemory & storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; }; @@ -170,17 +190,36 @@ StorageMemory::StorageMemory( setInMemoryMetadata(storage_metadata); } +StorageSnapshotPtr 
StorageMemory::getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot) const +{ + auto snapshot_data = std::make_unique<SnapshotData>(); + snapshot_data->blocks = data.get(); + + if (!hasObjectColumns(metadata_snapshot->getColumns())) + return std::make_shared<StorageSnapshot>(*this, metadata_snapshot, ColumnsDescription{}, std::move(snapshot_data)); + + auto object_columns = getObjectColumns( + snapshot_data->blocks->begin(), + snapshot_data->blocks->end(), + metadata_snapshot->getColumns(), + [](const auto & block) -> const auto & { return block.getColumnsWithTypeAndName(); }); + + return std::make_shared<StorageSnapshot>(*this, metadata_snapshot, object_columns, std::move(snapshot_data)); +} Pipe StorageMemory::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, size_t /*max_block_size*/, unsigned num_streams) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); + + const auto & snapshot_data = assert_cast<const SnapshotData &>(*storage_snapshot->data); + auto current_data = snapshot_data.blocks; if (delay_read_for_global_subqueries) { @@ -194,17 +233,15 @@ Pipe StorageMemory::read( return Pipe(std::make_shared<MemorySource>( column_names, - *this, - metadata_snapshot, + storage_snapshot, nullptr /* data */, nullptr /* parallel execution index */, - [this](std::shared_ptr<const Blocks> & data_to_initialize) + [current_data](std::shared_ptr<const Blocks> & data_to_initialize) { - data_to_initialize = data.get(); + data_to_initialize = current_data; })); } - auto current_data = data.get(); size_t size = current_data->size(); if (num_streams > size) @@ -216,7 +253,7 @@ Pipe StorageMemory::read( for (size_t stream = 0; stream < num_streams; ++stream) { - pipes.emplace_back(std::make_shared<MemorySource>(column_names, *this, metadata_snapshot, current_data, parallel_execution_index)); + pipes.emplace_back(std::make_shared<MemorySource>(column_names, storage_snapshot, current_data, parallel_execution_index)); } return Pipe::unitePipes(std::move(pipes)); diff --git a/src/Storages/StorageMemory.h b/src/Storages/StorageMemory.h index 063802faf1a..1c4421e51a6 100644 --- a/src/Storages/StorageMemory.h +++ b/src/Storages/StorageMemory.h @@ -29,9 +29,18 @@ public: size_t getSize() const { return data.get()->size(); } + /// Snapshot for StorageMemory contains the current set of blocks + /// at the moment the query starts. + struct SnapshotData : public StorageSnapshot::Data + { + std::shared_ptr<const Blocks> blocks; + }; + + StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot) const override; + Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -40,6 +49,7 @@ public: bool supportsParallelInsert() const override { return true; } bool supportsSubcolumns() const override { return true; } + bool supportsDynamicSubcolumns() const override { return true; } /// Smaller blocks (e.g. 64K rows) are better for CPU cache.
bool prefersLargeBlocks() const override { return false; } diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 15e499c6e6c..96e6070e09e 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -169,7 +169,7 @@ bool StorageMerge::mayBenefitFromIndexForIn(const ASTPtr & left_in_operand, Cont QueryProcessingStage::Enum StorageMerge::getQueryProcessingStage( ContextPtr local_context, QueryProcessingStage::Enum to_stage, - const StorageMetadataPtr &, + const StorageSnapshotPtr &, SelectQueryInfo & query_info) const { /// In case of JOIN the first stage (which includes JOIN) @@ -200,7 +200,8 @@ QueryProcessingStage::Enum StorageMerge::getQueryProcessingStage( ++selected_table_size; stage_in_source_tables = std::max( stage_in_source_tables, - table->getQueryProcessingStage(local_context, to_stage, table->getInMemoryMetadataPtr(), query_info)); + table->getQueryProcessingStage(local_context, to_stage, + table->getStorageSnapshot(table->getInMemoryMetadataPtr()), query_info)); } iterator->next(); @@ -235,7 +236,7 @@ SelectQueryInfo StorageMerge::getModifiedQueryInfo( Pipe StorageMerge::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -251,9 +252,9 @@ Pipe StorageMerge::read( for (const auto & column_name : column_names) { - if (column_name == "_database" && isVirtualColumn(column_name, metadata_snapshot)) + if (column_name == "_database" && isVirtualColumn(column_name, storage_snapshot->metadata)) has_database_virtual_column = true; - else if (column_name == "_table" && isVirtualColumn(column_name, metadata_snapshot)) + else if (column_name == "_table" && isVirtualColumn(column_name, storage_snapshot->metadata)) has_table_virtual_column = true; else real_column_names.push_back(column_name); @@ -266,7 +267,7 @@ Pipe StorageMerge::read( modified_context->setSetting("optimize_move_to_prewhere", false); /// What will the result structure be, depending on the query processing stage in the source tables? - Block header = getHeaderForProcessingStage(*this, column_names, metadata_snapshot, query_info, local_context, processed_stage); + Block header = getHeaderForProcessingStage(column_names, storage_snapshot, query_info, local_context, processed_stage); /** First we make a list of selected tables to find out its size. * This is necessary to correctly pass the recommended number of threads to each table.
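The StorageMerge hunks before and after this point repeat one wrapping step wherever a nested table is consulted; a condensed sketch of that step, mirroring the calls in this diff (variable names illustrative):

// For each nested table of the Merge storage, wrap its metadata into a
// snapshot before asking for the processing stage or reading from it.
auto nested_metadata = table->getInMemoryMetadataPtr();
auto nested_snapshot = table->getStorageSnapshot(nested_metadata);
auto stage = table->getQueryProcessingStage(local_context, to_stage, nested_snapshot, query_info);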
@@ -337,9 +338,11 @@ Pipe StorageMerge::read( Aliases aliases; auto storage_metadata_snapshot = storage->getInMemoryMetadataPtr(); auto storage_columns = storage_metadata_snapshot->getColumns(); + auto nested_storage_snapshot = storage->getStorageSnapshot(storage_metadata_snapshot); auto modified_query_info = getModifiedQueryInfo(query_info, modified_context, storage->getStorageID(), storage->as()); - auto syntax_result = TreeRewriter(local_context).analyzeSelect(modified_query_info.query, TreeRewriterResult({}, storage, storage_metadata_snapshot)); + auto syntax_result = TreeRewriter(local_context).analyzeSelect( + modified_query_info.query, TreeRewriterResult({}, storage, nested_storage_snapshot)); Names column_names_as_aliases; bool with_aliases = processed_stage == QueryProcessingStage::FetchColumns && !storage_columns.getAliases().empty(); @@ -374,7 +377,8 @@ Pipe StorageMerge::read( } syntax_result = TreeRewriter(local_context).analyze( - required_columns_expr_list, storage_columns.getAllPhysical(), storage, storage_metadata_snapshot); + required_columns_expr_list, storage_columns.getAllPhysical(), storage, storage->getStorageSnapshot(storage_metadata_snapshot)); + auto alias_actions = ExpressionAnalyzer(required_columns_expr_list, syntax_result, local_context).getActionsDAG(true); column_names_as_aliases = alias_actions->getRequiredColumns().getNames(); @@ -383,7 +387,7 @@ Pipe StorageMerge::read( } auto source_pipe = createSources( - storage_metadata_snapshot, + nested_storage_snapshot, modified_query_info, processed_stage, max_block_size, @@ -411,7 +415,7 @@ Pipe StorageMerge::read( } Pipe StorageMerge::createSources( - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & modified_query_info, const QueryProcessingStage::Enum & processed_stage, const UInt64 max_block_size, @@ -449,16 +453,16 @@ Pipe StorageMerge::createSources( } auto storage_stage - = storage->getQueryProcessingStage(modified_context, QueryProcessingStage::Complete, metadata_snapshot, modified_query_info); + = storage->getQueryProcessingStage(modified_context, QueryProcessingStage::Complete, storage_snapshot, modified_query_info); if (processed_stage <= storage_stage) { /// If there are only virtual columns in query, you must request at least one other column. if (real_column_names.empty()) - real_column_names.push_back(ExpressionActions::getSmallestColumn(metadata_snapshot->getColumns().getAllPhysical())); + real_column_names.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical())); pipe = storage->read( real_column_names, - metadata_snapshot, + storage_snapshot, modified_query_info, modified_context, processed_stage, @@ -538,7 +542,7 @@ Pipe StorageMerge::createSources( /// Subordinate tables could have different but convertible types, like numeric types of different width. /// We must return streams with a structure equal to the structure of the Merge table.
- convertingSourceStream(header, metadata_snapshot, aliases, modified_context, modified_query_info.query, pipe, processed_stage); + convertingSourceStream(header, storage_snapshot->metadata, aliases, modified_context, modified_query_info.query, pipe, processed_stage); pipe.addTableLock(struct_lock); pipe.addStorageHolder(storage); diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index e0d81531325..b7bdd957164 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -30,11 +30,11 @@ public: bool canMoveConditionsToPrewhere() const override; QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override; + getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override; Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -116,7 +116,7 @@ protected: using Aliases = std::vector; Pipe createSources( - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, const QueryProcessingStage::Enum & processed_stage, UInt64 max_block_size, diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index a05ed04a66c..812e2264adb 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -193,7 +193,7 @@ StorageMergeTree::~StorageMergeTree() void StorageMergeTree::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -207,13 +207,13 @@ void StorageMergeTree::read( LOG_TRACE(log, "Parallel reading from replicas enabled {}", enable_parallel_reading); if (auto plan = reader.read( - column_names, metadata_snapshot, query_info, local_context, max_block_size, num_streams, processed_stage, nullptr, enable_parallel_reading)) + column_names, storage_snapshot, query_info, local_context, max_block_size, num_streams, processed_stage, nullptr, enable_parallel_reading)) query_plan = std::move(*plan); } Pipe StorageMergeTree::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -221,7 +221,7 @@ Pipe StorageMergeTree::read( const unsigned num_streams) { QueryPlan plan; - read(plan, column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + read(plan, column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); return plan.convertToPipe( QueryPlanOptimizationSettings::fromContext(local_context), BuildQueryPipelineSettings::fromContext(local_context)); diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index d3970449ceb..abdaf6934d6 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -43,7 +43,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, 
QueryProcessingStage::Enum processed_stage, @@ -53,7 +53,7 @@ public: void read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageMongoDB.cpp b/src/Storages/StorageMongoDB.cpp index 9b25b44c0e7..dcb3eaa77db 100644 --- a/src/Storages/StorageMongoDB.cpp +++ b/src/Storages/StorageMongoDB.cpp @@ -90,7 +90,7 @@ void StorageMongoDB::connectIfNotConnected() Pipe StorageMongoDB::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, @@ -99,12 +99,12 @@ Pipe StorageMongoDB::read( { connectIfNotConnected(); - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); Block sample_block; for (const String & column_name : column_names) { - auto column_data = metadata_snapshot->getColumns().getPhysical(column_name); + auto column_data = storage_snapshot->metadata->getColumns().getPhysical(column_name); sample_block.insert({ column_data.type, column_data.name }); } diff --git a/src/Storages/StorageMongoDB.h b/src/Storages/StorageMongoDB.h index 0edfb558759..549d444d7bb 100644 --- a/src/Storages/StorageMongoDB.h +++ b/src/Storages/StorageMongoDB.h @@ -36,7 +36,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageMySQL.cpp b/src/Storages/StorageMySQL.cpp index 9dcbec0caae..5e7c2ae95ae 100644 --- a/src/Storages/StorageMySQL.cpp +++ b/src/Storages/StorageMySQL.cpp @@ -75,17 +75,17 @@ StorageMySQL::StorageMySQL( Pipe StorageMySQL::read( const Names & column_names_, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info_, ContextPtr context_, QueryProcessingStage::Enum /*processed_stage*/, size_t /*max_block_size*/, unsigned) { - metadata_snapshot->check(column_names_, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names_); String query = transformQueryForExternalDatabase( query_info_, - metadata_snapshot->getColumns().getOrdinary(), + storage_snapshot->metadata->getColumns().getOrdinary(), IdentifierQuotingStyle::BackticksMySQL, remote_database_name, remote_table_name, @@ -95,7 +95,7 @@ Pipe StorageMySQL::read( Block sample_block; for (const String & column_name : column_names_) { - auto column_data = metadata_snapshot->getColumns().getPhysical(column_name); + auto column_data = storage_snapshot->metadata->getColumns().getPhysical(column_name); WhichDataType which(column_data.type); /// Convert enum to string. 
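StorageMongoDB and StorageMySQL above, and StoragePostgreSQL and StorageSQLite below, now share the same shape; a condensed sketch of the common part, assuming the snapshot API from this patch (names taken from the surrounding hunks):

// Validate the requested names through the snapshot, then build the sample
// block from the physical columns recorded in the snapshot's metadata.
storage_snapshot->check(column_names);

Block sample_block;
for (const String & column_name : column_names)
{
    auto column_data = storage_snapshot->metadata->getColumns().getPhysical(column_name);
    sample_block.insert({column_data.type, column_data.name});
}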
diff --git a/src/Storages/StorageMySQL.h b/src/Storages/StorageMySQL.h index fe2ee8439bc..03ebaaf87d7 100644 --- a/src/Storages/StorageMySQL.h +++ b/src/Storages/StorageMySQL.h @@ -44,7 +44,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageNull.h b/src/Storages/StorageNull.h index 82baa98834d..c5b2e2bf161 100644 --- a/src/Storages/StorageNull.h +++ b/src/Storages/StorageNull.h @@ -23,7 +23,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo &, ContextPtr /*context*/, QueryProcessingStage::Enum /*processing_stage*/, @@ -31,7 +31,7 @@ public: unsigned) override { return Pipe( - std::make_shared(metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()))); + std::make_shared(storage_snapshot->getSampleBlockForColumns(column_names))); } bool supportsParallelInsert() const override { return true; } diff --git a/src/Storages/StoragePostgreSQL.cpp b/src/Storages/StoragePostgreSQL.cpp index aa54663ca10..c50086d2c52 100644 --- a/src/Storages/StoragePostgreSQL.cpp +++ b/src/Storages/StoragePostgreSQL.cpp @@ -76,26 +76,26 @@ StoragePostgreSQL::StoragePostgreSQL( Pipe StoragePostgreSQL::read( const Names & column_names_, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info_, ContextPtr context_, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size_, unsigned) { - metadata_snapshot->check(column_names_, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names_); /// Connection is already made to the needed database, so it should not be present in the query; /// remote_table_schema is empty if it is not specified, will access only table_name. 
String query = transformQueryForExternalDatabase( - query_info_, metadata_snapshot->getColumns().getOrdinary(), + query_info_, storage_snapshot->metadata->getColumns().getOrdinary(), IdentifierQuotingStyle::DoubleQuotes, remote_table_schema, remote_table_name, context_); LOG_TRACE(log, "Query: {}", query); Block sample_block; for (const String & column_name : column_names_) { - auto column_data = metadata_snapshot->getColumns().getPhysical(column_name); + auto column_data = storage_snapshot->metadata->getColumns().getPhysical(column_name); WhichDataType which(column_data.type); if (which.isEnum()) column_data.type = std::make_shared(); diff --git a/src/Storages/StoragePostgreSQL.h b/src/Storages/StoragePostgreSQL.h index 7d8752c91b9..ae41a713285 100644 --- a/src/Storages/StoragePostgreSQL.h +++ b/src/Storages/StoragePostgreSQL.h @@ -35,7 +35,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageProxy.h b/src/Storages/StorageProxy.h index 894b470ef22..d5af81ced3d 100644 --- a/src/Storages/StorageProxy.h +++ b/src/Storages/StorageProxy.h @@ -35,12 +35,13 @@ public: QueryProcessingStage::Enum getQueryProcessingStage( ContextPtr context, QueryProcessingStage::Enum to_stage, - const StorageMetadataPtr &, + const StorageSnapshotPtr &, SelectQueryInfo & info) const override { /// TODO: Find a way to support projections for StorageProxy info.ignore_projections = true; - return getNested()->getQueryProcessingStage(context, to_stage, getNested()->getInMemoryMetadataPtr(), info); + const auto & nested_metadata = getNested()->getInMemoryMetadataPtr(); + return getNested()->getQueryProcessingStage(context, to_stage, getNested()->getStorageSnapshot(nested_metadata), info); } Pipe watch( @@ -56,14 +57,14 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned num_streams) override { - return getNested()->read(column_names, metadata_snapshot, query_info, context, processed_stage, max_block_size, num_streams); + return getNested()->read(column_names, storage_snapshot, query_info, context, processed_stage, max_block_size, num_streams); } SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) override diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 9a5e1cfbabd..508b023bb79 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4216,7 +4216,7 @@ ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock StorageReplicatedMerg void StorageReplicatedMergeTree::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -4235,14 +4235,14 @@ void StorageReplicatedMergeTree::read( { auto max_added_blocks = std::make_shared(getMaxAddedBlocks()); if (auto plan = reader.read( - column_names, metadata_snapshot, query_info, local_context, + column_names, storage_snapshot, query_info, local_context, max_block_size, num_streams, 
processed_stage, std::move(max_added_blocks), enable_parallel_reading)) query_plan = std::move(*plan); return; } if (auto plan = reader.read( - column_names, metadata_snapshot, query_info, local_context, + column_names, storage_snapshot, query_info, local_context, max_block_size, num_streams, processed_stage, nullptr, enable_parallel_reading)) { query_plan = std::move(*plan); @@ -4251,7 +4251,7 @@ void StorageReplicatedMergeTree::read( Pipe StorageReplicatedMergeTree::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -4259,7 +4259,7 @@ Pipe StorageReplicatedMergeTree::read( const unsigned num_streams) { QueryPlan plan; - read(plan, column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + read(plan, column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); return plan.convertToPipe( QueryPlanOptimizationSettings::fromContext(local_context), BuildQueryPipelineSettings::fromContext(local_context)); diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 935bd048603..ff806d7a9b2 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -98,7 +98,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -108,7 +108,7 @@ public: void read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index ec506ad0cd0..f319bd1097b 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -622,7 +622,7 @@ bool StorageS3::isColumnOriented() const Pipe StorageS3::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr local_context, QueryProcessingStage::Enum /*processed_stage*/, @@ -649,13 +649,13 @@ Pipe StorageS3::read( if (isColumnOriented()) { columns_description = ColumnsDescription{ - metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()).getNamesAndTypesList()}; - block_for_format = metadata_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); + storage_snapshot->getSampleBlockForColumns(column_names).getNamesAndTypesList()}; + block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); } else { - columns_description = metadata_snapshot->getColumns(); - block_for_format = metadata_snapshot->getSampleBlock(); + columns_description = storage_snapshot->metadata->getColumns(); + block_for_format = storage_snapshot->metadata->getSampleBlock(); } for (size_t i = 0; i < num_streams; ++i) diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index b2283687e2b..300b7becb93 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -146,7 +146,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & 
/*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageS3Cluster.cpp b/src/Storages/StorageS3Cluster.cpp index 57220c68347..b5549b32554 100644 --- a/src/Storages/StorageS3Cluster.cpp +++ b/src/Storages/StorageS3Cluster.cpp @@ -73,7 +73,7 @@ StorageS3Cluster::StorageS3Cluster( /// The code executes on initiator Pipe StorageS3Cluster::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -132,12 +132,12 @@ Pipe StorageS3Cluster::read( } } - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); return Pipe::unitePipes(std::move(pipes)); } QueryProcessingStage::Enum StorageS3Cluster::getQueryProcessingStage( - ContextPtr context, QueryProcessingStage::Enum to_stage, const StorageMetadataPtr &, SelectQueryInfo &) const + ContextPtr context, QueryProcessingStage::Enum to_stage, const StorageSnapshotPtr &, SelectQueryInfo &) const { /// Initiator executes query on remote node. if (context->getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY) diff --git a/src/Storages/StorageS3Cluster.h b/src/Storages/StorageS3Cluster.h index d1e02c5a730..6d64c56020f 100644 --- a/src/Storages/StorageS3Cluster.h +++ b/src/Storages/StorageS3Cluster.h @@ -25,11 +25,11 @@ class StorageS3Cluster : public shared_ptr_helper, public ISto public: std::string getName() const override { return "S3Cluster"; } - Pipe read(const Names &, const StorageMetadataPtr &, SelectQueryInfo &, + Pipe read(const Names &, const StorageSnapshotPtr &, SelectQueryInfo &, ContextPtr, QueryProcessingStage::Enum, size_t /*max_block_size*/, unsigned /*num_streams*/) override; QueryProcessingStage::Enum - getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageMetadataPtr &, SelectQueryInfo &) const override; + getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override; NamesAndTypesList getVirtuals() const override; diff --git a/src/Storages/StorageSQLite.cpp b/src/Storages/StorageSQLite.cpp index f93584ab374..bc4e2b1dfe8 100644 --- a/src/Storages/StorageSQLite.cpp +++ b/src/Storages/StorageSQLite.cpp @@ -52,7 +52,7 @@ StorageSQLite::StorageSQLite( Pipe StorageSQLite::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context_, QueryProcessingStage::Enum, @@ -62,11 +62,11 @@ Pipe StorageSQLite::read( if (!sqlite_db) sqlite_db = openSQLiteDB(database_path, getContext(), /* throw_on_error */true); - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); String query = transformQueryForExternalDatabase( query_info, - metadata_snapshot->getColumns().getOrdinary(), + storage_snapshot->metadata->getColumns().getOrdinary(), IdentifierQuotingStyle::DoubleQuotes, "", remote_table_name, @@ -76,7 +76,7 @@ Pipe StorageSQLite::read( Block sample_block; for (const String & column_name : column_names) { - auto column_data = metadata_snapshot->getColumns().getPhysical(column_name); + auto column_data = storage_snapshot->metadata->getColumns().getPhysical(column_name); sample_block.insert({column_data.type, 
column_data.name}); } diff --git a/src/Storages/StorageSQLite.h b/src/Storages/StorageSQLite.h index e8fd0771ff4..367e6ee9e80 100644 --- a/src/Storages/StorageSQLite.h +++ b/src/Storages/StorageSQLite.h @@ -36,7 +36,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageSnapshot.cpp b/src/Storages/StorageSnapshot.cpp new file mode 100644 index 00000000000..e214afc6a90 --- /dev/null +++ b/src/Storages/StorageSnapshot.cpp @@ -0,0 +1,175 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_FOUND_COLUMN_IN_BLOCK; + extern const int EMPTY_LIST_OF_COLUMNS_QUERIED; + extern const int NO_SUCH_COLUMN_IN_TABLE; + extern const int COLUMN_QUERIED_MORE_THAN_ONCE; +} + +void StorageSnapshot::init() +{ + for (const auto & [name, type] : storage.getVirtuals()) + virtual_columns[name] = type; +} + +NamesAndTypesList StorageSnapshot::getColumns(const GetColumnsOptions & options) const +{ + auto all_columns = getMetadataForQuery()->getColumns().get(options); + + if (options.with_extended_objects) + extendObjectColumns(all_columns, object_columns, options.with_subcolumns); + + if (options.with_virtuals) + { + /// Virtual columns must be appended after ordinary, + /// because user can override them. + if (!virtual_columns.empty()) + { + NameSet column_names; + for (const auto & column : all_columns) + column_names.insert(column.name); + + for (const auto & [name, type] : virtual_columns) + if (!column_names.count(name)) + all_columns.emplace_back(name, type); + } + } + + return all_columns; +} + +NamesAndTypesList StorageSnapshot::getColumnsByNames(const GetColumnsOptions & options, const Names & names) const +{ + NamesAndTypesList res; + const auto & columns = getMetadataForQuery()->getColumns(); + for (const auto & name : names) + { + auto column = columns.tryGetColumn(options, name); + if (column && !isObject(column->type)) + { + res.emplace_back(std::move(*column)); + continue; + } + + if (options.with_extended_objects) + { + auto object_column = object_columns.tryGetColumn(options, name); + if (object_column) + { + res.emplace_back(std::move(*object_column)); + continue; + } + } + + if (options.with_virtuals) + { + auto it = virtual_columns.find(name); + if (it != virtual_columns.end()) + { + res.emplace_back(name, it->second); + continue; + } + } + + throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no column {} in table", name); + } + + return res; +} + +Block StorageSnapshot::getSampleBlockForColumns(const Names & column_names) const +{ + Block res; + + const auto & columns = getMetadataForQuery()->getColumns(); + for (const auto & name : column_names) + { + auto column = columns.tryGetColumnOrSubcolumn(GetColumnsOptions::All, name); + auto object_column = object_columns.tryGetColumnOrSubcolumn(GetColumnsOptions::All, name); + + if (column && !object_column) + { + res.insert({column->type->createColumn(), column->type, column->name}); + } + else if (object_column) + { + res.insert({object_column->type->createColumn(), object_column->type, object_column->name}); + } + else if (auto it = virtual_columns.find(name); it != virtual_columns.end()) + { + /// Virtual columns must be appended after ordinary, because user can + /// override them. 
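+            /// For example (hypothetical): a table declaring its own physical
+            /// column `_part` shadows the MergeTree virtual column of the same
+            /// name, so the physical lookup above wins and this branch is only
+            /// reached when no such override exists.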
+ const auto & type = it->second; + res.insert({type->createColumn(), type, name}); + } + else + { + throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK, + "Column {} not found in table {}", backQuote(name), storage.getStorageID().getNameForLogs()); + } + } + + return res; +} + +namespace +{ + using DenseHashSet = google::dense_hash_set<StringRef, StringRefHash>; +} + +void StorageSnapshot::check(const Names & column_names) const +{ + const auto & columns = getMetadataForQuery()->getColumns(); + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withSubcolumns(); + + if (column_names.empty()) + { + auto list_of_columns = listOfColumns(columns.get(options)); + throw Exception(ErrorCodes::EMPTY_LIST_OF_COLUMNS_QUERIED, + "Empty list of columns queried. There are columns: {}", list_of_columns); + } + + DenseHashSet unique_names; + unique_names.set_empty_key(StringRef()); + + for (const auto & name : column_names) + { + bool has_column = columns.hasColumnOrSubcolumn(GetColumnsOptions::AllPhysical, name) + || object_columns.hasColumnOrSubcolumn(GetColumnsOptions::AllPhysical, name) + || virtual_columns.count(name); + + if (!has_column) + { + auto list_of_columns = listOfColumns(columns.get(options)); + throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, + "There is no column with name {} in table {}. There are columns: {}", + backQuote(name), storage.getStorageID().getNameForLogs(), list_of_columns); + } + + if (unique_names.count(name)) + throw Exception(ErrorCodes::COLUMN_QUERIED_MORE_THAN_ONCE, "Column {} queried more than once", name); + + unique_names.insert(name); + } +} + +DataTypePtr StorageSnapshot::getConcreteType(const String & column_name) const +{ + auto object_column = object_columns.tryGetColumnOrSubcolumn(GetColumnsOptions::All, column_name); + if (object_column) + return object_column->type; + + return metadata->getColumns().get(column_name).type; +} + +} diff --git a/src/Storages/StorageSnapshot.h b/src/Storages/StorageSnapshot.h new file mode 100644 index 00000000000..46244827f6c --- /dev/null +++ b/src/Storages/StorageSnapshot.h @@ -0,0 +1,86 @@ +#pragma once +#include + +namespace DB +{ + +class IStorage; + +/// Snapshot of storage that fixes the set of columns that can be read in query. +/// There are 3 sources of columns: regular columns from metadata, +/// dynamic columns from object Types, virtual columns. +struct StorageSnapshot +{ + const IStorage & storage; + const StorageMetadataPtr metadata; + const ColumnsDescription object_columns; + + /// Additional data, on which set of columns may depend. + /// E.g. data parts in MergeTree, list of blocks in Memory, etc. + struct Data + { + virtual ~Data() = default; + }; + + using DataPtr = std::unique_ptr<Data>; + const DataPtr data; + + /// Projection that is used in query. + mutable const ProjectionDescription * projection = nullptr; + + StorageSnapshot( + const IStorage & storage_, + const StorageMetadataPtr & metadata_) + : storage(storage_), metadata(metadata_) + { + init(); + } + + StorageSnapshot( + const IStorage & storage_, + const StorageMetadataPtr & metadata_, + const ColumnsDescription & object_columns_) + : storage(storage_), metadata(metadata_), object_columns(object_columns_) + { + init(); + } + + StorageSnapshot( + const IStorage & storage_, + const StorageMetadataPtr & metadata_, + const ColumnsDescription & object_columns_, + DataPtr data_) + : storage(storage_), metadata(metadata_), object_columns(object_columns_), data(std::move(data_)) + { + init(); + } + + /// Get all available columns with types according to options. + NamesAndTypesList getColumns(const GetColumnsOptions & options) const; + + /// Get columns with types according to options only for requested names. + NamesAndTypesList getColumnsByNames(const GetColumnsOptions & options, const Names & names) const; + + /// Block with ordinary + materialized + aliases + virtuals + subcolumns. + Block getSampleBlockForColumns(const Names & column_names) const; + + /// Verify that all the requested names are in the table and are set correctly: + /// list of names is not empty and the names do not repeat. + void check(const Names & column_names) const; + + DataTypePtr getConcreteType(const String & column_name) const; + + void addProjection(const ProjectionDescription * projection_) const { projection = projection_; } + + /// If we have a projection then we should use its metadata. + StorageMetadataPtr getMetadataForQuery() const { return projection ? projection->metadata : metadata; } + +private: + void init(); + + std::unordered_map<String, DataTypePtr> virtual_columns; +}; + +using StorageSnapshotPtr = std::shared_ptr<StorageSnapshot>; + +} diff --git a/src/Storages/StorageStripeLog.cpp b/src/Storages/StorageStripeLog.cpp index c401d27a8fc..f1f84a88c36 100644 --- a/src/Storages/StorageStripeLog.cpp +++ b/src/Storages/StorageStripeLog.cpp @@ -63,14 +63,13 @@ class StripeLogSource final : public SourceWithProgress { public: static Block getHeader( - const StorageStripeLog & storage, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const Names & column_names, IndexForNativeFormat::Blocks::const_iterator index_begin, IndexForNativeFormat::Blocks::const_iterator index_end) { if (index_begin == index_end) - return metadata_snapshot->getSampleBlockForColumns(column_names, storage.getVirtuals(), storage.getStorageID()); + return storage_snapshot->getSampleBlockForColumns(column_names); /// TODO: check if possible to always return storage.getSampleBlock() @@ -87,16 +86,16 @@ public: StripeLogSource( const StorageStripeLog & storage_, - const StorageMetadataPtr & metadata_snapshot_, + const StorageSnapshotPtr & storage_snapshot_, const Names & column_names, ReadSettings read_settings_, std::shared_ptr<const IndexForNativeFormat> indices_, IndexForNativeFormat::Blocks::const_iterator index_begin_, IndexForNativeFormat::Blocks::const_iterator index_end_, size_t file_size_) - : SourceWithProgress(getHeader(storage_, metadata_snapshot_, column_names, index_begin_, index_end_)) + : SourceWithProgress(getHeader(storage_snapshot_, column_names, index_begin_, index_end_)) , storage(storage_) - , metadata_snapshot(metadata_snapshot_) + , storage_snapshot(storage_snapshot_) , read_settings(std::move(read_settings_)) , indices(indices_) , index_begin(index_begin_) @@ -131,7 +130,7 @@ protected: private: const StorageStripeLog & storage; - StorageMetadataPtr metadata_snapshot; + StorageSnapshotPtr storage_snapshot; ReadSettings read_settings; std::shared_ptr<const IndexForNativeFormat> indices; @@ -343,14 +342,14 @@ static std::chrono::seconds getLockTimeout(ContextPtr context) Pipe StorageStripeLog::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t /*max_block_size*/, unsigned num_streams) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); auto lock_timeout = getLockTimeout(context); loadIndices(lock_timeout); @@ -361,7 +360,7 @@ Pipe StorageStripeLog::read( size_t
data_file_size = file_checker.getFileSize(data_file_path); if (!data_file_size) - return Pipe(std::make_shared<NullSource>(metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()))); + return Pipe(std::make_shared<NullSource>(storage_snapshot->getSampleBlockForColumns(column_names))); auto indices_for_selected_columns = std::make_shared<IndexForNativeFormat>(indices.extractIndexForColumns(NameSet{column_names.begin(), column_names.end()})); @@ -382,7 +381,7 @@ Pipe StorageStripeLog::read( std::advance(end, (stream + 1) * size / num_streams); pipes.emplace_back(std::make_shared<StripeLogSource>( - *this, metadata_snapshot, column_names, read_settings, indices_for_selected_columns, begin, end, data_file_size)); + *this, storage_snapshot, column_names, read_settings, indices_for_selected_columns, begin, end, data_file_size)); } /// We do not keep read lock directly at the time of reading, because we read ranges of data that do not change. diff --git a/src/Storages/StorageStripeLog.h b/src/Storages/StorageStripeLog.h index 579e2f991e7..bab5116cfc1 100644 --- a/src/Storages/StorageStripeLog.h +++ b/src/Storages/StorageStripeLog.h @@ -32,7 +32,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageTableFunction.h b/src/Storages/StorageTableFunction.h index 8054762d389..4616421b24a 100644 --- a/src/Storages/StorageTableFunction.h +++ b/src/Storages/StorageTableFunction.h @@ -93,7 +93,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -104,12 +104,12 @@ public: for (const auto & c : column_names) cnames += c + " "; auto storage = getNested(); - auto nested_metadata = storage->getInMemoryMetadataPtr(); - auto pipe = storage->read(column_names, nested_metadata, query_info, context, + auto nested_snapshot = storage->getStorageSnapshot(storage->getInMemoryMetadataPtr()); + auto pipe = storage->read(column_names, nested_snapshot, query_info, context, processed_stage, max_block_size, num_streams); if (!pipe.empty() && add_conversion) { - auto to_header = getHeaderForProcessingStage(*this, column_names, metadata_snapshot, + auto to_header = getHeaderForProcessingStage(column_names, storage_snapshot, query_info, context, processed_stage); auto convert_actions_dag = ActionsDAG::makeConvertingActions( diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index bb8c583e329..5c8a7ea2be5 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -531,7 +531,7 @@ std::string IStorageURLBase::getReadMethod() const std::vector<std::pair<std::string, std::string>> IStorageURLBase::getReadURIParams( const Names & /*column_names*/, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & /*storage_snapshot*/, const SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum & /*processed_stage*/, @@ -630,27 +630,27 @@ bool IStorageURLBase::isColumnOriented() const Pipe IStorageURLBase::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned num_streams) { - auto params = getReadURIParams(column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size); + auto params = getReadURIParams(column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size); ColumnsDescription columns_description; Block block_for_format; if (isColumnOriented()) { columns_description = ColumnsDescription{ - metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()).getNamesAndTypesList()}; - block_for_format = metadata_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); + storage_snapshot->getSampleBlockForColumns(column_names).getNamesAndTypesList()}; + block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); } else { - columns_description = metadata_snapshot->getColumns(); - block_for_format = metadata_snapshot->getSampleBlock(); + columns_description = storage_snapshot->metadata->getColumns(); + block_for_format = storage_snapshot->metadata->getSampleBlock(); } size_t max_download_threads = local_context->getSettingsRef().max_download_threads; @@ -720,7 +720,7 @@ Pipe IStorageURLBase::read( Pipe StorageURLWithFailover::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, @@ -732,16 +732,16 @@ Pipe StorageURLWithFailover::read( if (isColumnOriented()) { columns_description = ColumnsDescription{ - metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()).getNamesAndTypesList()}; - block_for_format = metadata_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); + storage_snapshot->getSampleBlockForColumns(column_names).getNamesAndTypesList()}; + block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical()); } else { - columns_description = metadata_snapshot->getColumns(); - block_for_format = metadata_snapshot->getSampleBlock(); + columns_description = storage_snapshot->metadata->getColumns(); + block_for_format = storage_snapshot->metadata->getSampleBlock(); } - auto params = getReadURIParams(column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size); + auto params = getReadURIParams(column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size); auto uri_info = std::make_shared<StorageURLSource::URIInfo>(); uri_info->uri_list_to_read.emplace_back(uri_options); diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index 79d2489f241..a035b1bb93d 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -30,7 +30,7 @@ class IStorageURLBase : public IStorage public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -80,7 +80,7 @@ protected: virtual std::vector<std::pair<std::string, std::string>> getReadURIParams( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum & processed_stage, @@ -97,7 +97,7 @@ protected: bool isColumnOriented() const override; private: - virtual Block
getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const = 0; }; class StorageURLSink : public SinkToStorage @@ -145,9 +145,9 @@ public: return "URL"; } - Block getHeaderBlock(const Names & /*column_names*/, const StorageMetadataPtr & metadata_snapshot) const override + Block getHeaderBlock(const Names & /*column_names*/, const StorageSnapshotPtr & storage_snapshot) const override { - return metadata_snapshot->getSampleBlock(); + return storage_snapshot->metadata->getSampleBlock(); } static FormatSettings getFormatSettingsFromArgs(const StorageFactory::Arguments & args); @@ -172,7 +172,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageValues.cpp b/src/Storages/StorageValues.cpp index 650782afbba..2a3e1743983 100644 --- a/src/Storages/StorageValues.cpp +++ b/src/Storages/StorageValues.cpp @@ -22,14 +22,14 @@ StorageValues::StorageValues( Pipe StorageValues::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, size_t /*max_block_size*/, unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); /// Get only required columns. Block block; diff --git a/src/Storages/StorageValues.h b/src/Storages/StorageValues.h index 69b2f757046..21156ec27cc 100644 --- a/src/Storages/StorageValues.h +++ b/src/Storages/StorageValues.h @@ -17,7 +17,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageView.cpp b/src/Storages/StorageView.cpp index bcf7d7856cf..68b16de5a80 100644 --- a/src/Storages/StorageView.cpp +++ b/src/Storages/StorageView.cpp @@ -107,7 +107,7 @@ StorageView::StorageView( Pipe StorageView::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -115,7 +115,7 @@ Pipe StorageView::read( const unsigned num_streams) { QueryPlan plan; - read(plan, column_names, metadata_snapshot, query_info, context, processed_stage, max_block_size, num_streams); + read(plan, column_names, storage_snapshot, query_info, context, processed_stage, max_block_size, num_streams); return plan.convertToPipe( QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); @@ -124,14 +124,14 @@ Pipe StorageView::read( void StorageView::read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t /*max_block_size*/, const unsigned /*num_streams*/) { - ASTPtr current_inner_query = metadata_snapshot->getSelectQuery().inner_query; + ASTPtr current_inner_query = storage_snapshot->metadata->getSelectQuery().inner_query; if (query_info.view_query) { @@ -154,7 +154,7 @@ void 
StorageView::read( query_plan.addStep(std::move(materializing)); /// And also convert to expected structure. - const auto & expected_header = metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()); + const auto & expected_header = storage_snapshot->getSampleBlockForColumns(column_names); const auto & header = query_plan.getCurrentDataStream().header; const auto * select_with_union = current_inner_query->as<ASTSelectWithUnionQuery>(); diff --git a/src/Storages/StorageView.h b/src/Storages/StorageView.h index cd36a10aae7..f49736afe4a 100644 --- a/src/Storages/StorageView.h +++ b/src/Storages/StorageView.h @@ -23,7 +23,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -33,7 +33,7 @@ public: void read( QueryPlan & query_plan, const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/StorageXDBC.cpp b/src/Storages/StorageXDBC.cpp index 3cb6c9d0359..d9a2a77515e 100644 --- a/src/Storages/StorageXDBC.cpp +++ b/src/Storages/StorageXDBC.cpp @@ -57,7 +57,7 @@ std::string StorageXDBC::getReadMethod() const std::vector<std::pair<std::string, std::string>> StorageXDBC::getReadURIParams( const Names & /* column_names */, - const StorageMetadataPtr & /* metadata_snapshot */, + const StorageSnapshotPtr & /*storage_snapshot*/, const SelectQueryInfo & /*query_info*/, ContextPtr /*context*/, QueryProcessingStage::Enum & /*processed_stage*/, @@ -101,17 +101,17 @@ std::function<void(std::ostream &)> StorageXDBC::getReadPOSTDataCallback( Pipe StorageXDBC::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr local_context, QueryProcessingStage::Enum processed_stage, size_t max_block_size, unsigned num_streams) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); bridge_helper->startBridgeSync(); - return IStorageURLBase::read(column_names, metadata_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); + return IStorageURLBase::read(column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size, num_streams); } SinkToStoragePtr StorageXDBC::write(const ASTPtr & /* query */, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context) @@ -145,9 +145,9 @@ bool StorageXDBC::isColumnOriented() const return true; } -Block StorageXDBC::getHeaderBlock(const Names & column_names, const StorageMetadataPtr & metadata_snapshot) const +Block StorageXDBC::getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const { - return metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()); + return storage_snapshot->getSampleBlockForColumns(column_names); } std::string StorageXDBC::getName() const diff --git a/src/Storages/StorageXDBC.h b/src/Storages/StorageXDBC.h index d8771c4ed83..514171026fc 100644 --- a/src/Storages/StorageXDBC.h +++ b/src/Storages/StorageXDBC.h @@ -21,7 +21,7 @@ class StorageXDBC : public IStorageURLBase public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -51,7 +51,7 @@ private: std::vector<std::pair<std::string, std::string>> getReadURIParams( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum & processed_stage, @@ -65,7 +65,7 @@ private: QueryProcessingStage::Enum & processed_stage, size_t max_block_size) const override; - Block getHeaderBlock(const Names & column_names, const StorageMetadataPtr & metadata_snapshot) const override; + Block getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const override; bool isColumnOriented() const override; }; diff --git a/src/Storages/System/IStorageSystemOneBlock.h b/src/Storages/System/IStorageSystemOneBlock.h index d78c8179a71..7cc94a3f2f6 100644 --- a/src/Storages/System/IStorageSystemOneBlock.h +++ b/src/Storages/System/IStorageSystemOneBlock.h @@ -41,17 +41,16 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, size_t /*max_block_size*/, unsigned /*num_streams*/) override { - auto virtuals_names_and_types = getVirtuals(); - metadata_snapshot->check(column_names, virtuals_names_and_types, getStorageID()); + storage_snapshot->check(column_names); - Block sample_block = metadata_snapshot->getSampleBlockWithVirtuals(virtuals_names_and_types); + Block sample_block = storage_snapshot->metadata->getSampleBlockWithVirtuals(getVirtuals()); MutableColumns res_columns = sample_block.cloneEmptyColumns(); fillData(res_columns, context, query_info); diff --git a/src/Storages/System/StorageSystemColumns.cpp b/src/Storages/System/StorageSystemColumns.cpp index d847c00846c..082b46f5a7e 100644 --- a/src/Storages/System/StorageSystemColumns.cpp +++ b/src/Storages/System/StorageSystemColumns.cpp @@ -304,20 +304,20 @@ private: Pipe StorageSystemColumns::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t max_block_size, const unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); /// Create a mask of what columns are needed in the result.
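/// The mask-building step below follows a common pattern; a minimal sketch,
/// assuming the surrounding names:
///
///     std::vector<UInt8> columns_mask(sample_block.columns());
///     for (size_t i = 0; i < columns_mask.size(); ++i)
///         if (names_set.count(sample_block.getByPosition(i).name))
///         {
///             columns_mask[i] = 1;
///             header.insert(sample_block.getByPosition(i));
///         }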
NameSet names_set(column_names.begin(), column_names.end()); - Block sample_block = metadata_snapshot->getSampleBlock(); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); Block header; std::vector<UInt8> columns_mask(sample_block.columns()); diff --git a/src/Storages/System/StorageSystemColumns.h b/src/Storages/System/StorageSystemColumns.h index dc184b1ae42..126deef1921 100644 --- a/src/Storages/System/StorageSystemColumns.h +++ b/src/Storages/System/StorageSystemColumns.h @@ -19,7 +19,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemDataSkippingIndices.cpp b/src/Storages/System/StorageSystemDataSkippingIndices.cpp index d7fc06da953..42b214bf101 100644 --- a/src/Storages/System/StorageSystemDataSkippingIndices.cpp +++ b/src/Storages/System/StorageSystemDataSkippingIndices.cpp @@ -165,18 +165,18 @@ private: Pipe StorageSystemDataSkippingIndices::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /* processed_stage */, size_t max_block_size, unsigned int /* num_streams */) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); NameSet names_set(column_names.begin(), column_names.end()); - Block sample_block = metadata_snapshot->getSampleBlock(); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); Block header; std::vector<UInt8> columns_mask(sample_block.columns()); diff --git a/src/Storages/System/StorageSystemDataSkippingIndices.h b/src/Storages/System/StorageSystemDataSkippingIndices.h index 4af2398a04b..93511d0d591 100644 --- a/src/Storages/System/StorageSystemDataSkippingIndices.h +++ b/src/Storages/System/StorageSystemDataSkippingIndices.h @@ -16,7 +16,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemDetachedParts.cpp b/src/Storages/System/StorageSystemDetachedParts.cpp index 5a24809d05a..4797dff2fd1 100644 --- a/src/Storages/System/StorageSystemDetachedParts.cpp +++ b/src/Storages/System/StorageSystemDetachedParts.cpp @@ -31,7 +31,7 @@ StorageSystemDetachedParts::StorageSystemDetachedParts(const StorageID & table_i Pipe StorageSystemDetachedParts::read( const Names & /* column_names */, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, @@ -41,7 +41,7 @@ Pipe StorageSystemDetachedParts::read( StoragesInfoStream stream(query_info, context); /// Create the result.
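/// The result structure comes from the snapshot's metadata sample block;
/// the loop below appends one row per detached part yielded by the stream.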
- Block block = metadata_snapshot->getSampleBlock(); + Block block = storage_snapshot->metadata->getSampleBlock(); MutableColumns new_columns = block.cloneEmptyColumns(); while (StoragesInfo info = stream.next()) diff --git a/src/Storages/System/StorageSystemDetachedParts.h b/src/Storages/System/StorageSystemDetachedParts.h index 51ee93a2f15..8ed11eb306c 100644 --- a/src/Storages/System/StorageSystemDetachedParts.h +++ b/src/Storages/System/StorageSystemDetachedParts.h @@ -25,7 +25,7 @@ protected: Pipe read( const Names & /* column_names */, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, diff --git a/src/Storages/System/StorageSystemDisks.cpp b/src/Storages/System/StorageSystemDisks.cpp index 749717922da..3841abc2f2d 100644 --- a/src/Storages/System/StorageSystemDisks.cpp +++ b/src/Storages/System/StorageSystemDisks.cpp @@ -28,14 +28,14 @@ StorageSystemDisks::StorageSystemDisks(const StorageID & table_id_) Pipe StorageSystemDisks::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t /*max_block_size*/, const unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); MutableColumnPtr col_name = ColumnString::create(); MutableColumnPtr col_path = ColumnString::create(); @@ -65,7 +65,7 @@ Pipe StorageSystemDisks::read( UInt64 num_rows = res_columns.at(0)->size(); Chunk chunk(std::move(res_columns), num_rows); - return Pipe(std::make_shared<SourceFromSingleChunk>(metadata_snapshot->getSampleBlock(), std::move(chunk))); + return Pipe(std::make_shared<SourceFromSingleChunk>(storage_snapshot->metadata->getSampleBlock(), std::move(chunk))); } } diff --git a/src/Storages/System/StorageSystemDisks.h b/src/Storages/System/StorageSystemDisks.h index 1404d6023d4..2640ab7149b 100644 --- a/src/Storages/System/StorageSystemDisks.h +++ b/src/Storages/System/StorageSystemDisks.h @@ -22,7 +22,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemNumbers.cpp b/src/Storages/System/StorageSystemNumbers.cpp index c09279e65ac..2e48bb857ce 100644 --- a/src/Storages/System/StorageSystemNumbers.cpp +++ b/src/Storages/System/StorageSystemNumbers.cpp @@ -124,14 +124,14 @@ StorageSystemNumbers::StorageSystemNumbers(const StorageID & table_id, bool mult Pipe StorageSystemNumbers::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo &, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, unsigned num_streams) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); if (limit && *limit < max_block_size) { diff --git a/src/Storages/System/StorageSystemNumbers.h b/src/Storages/System/StorageSystemNumbers.h index 32105bb055d..5f3a12c530d 100644 --- a/src/Storages/System/StorageSystemNumbers.h +++ b/src/Storages/System/StorageSystemNumbers.h @@ -31,7 +31,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr &
/*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemOne.cpp b/src/Storages/System/StorageSystemOne.cpp index 7558ae0ae92..f262c981b83 100644 --- a/src/Storages/System/StorageSystemOne.cpp +++ b/src/Storages/System/StorageSystemOne.cpp @@ -22,14 +22,14 @@ StorageSystemOne::StorageSystemOne(const StorageID & table_id_) Pipe StorageSystemOne::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo &, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, const size_t /*max_block_size*/, const unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); Block header{ColumnWithTypeAndName( DataTypeUInt8().createColumn(), diff --git a/src/Storages/System/StorageSystemOne.h b/src/Storages/System/StorageSystemOne.h index cc1d5e05b75..b0ca389b76f 100644 --- a/src/Storages/System/StorageSystemOne.h +++ b/src/Storages/System/StorageSystemOne.h @@ -23,7 +23,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemPartsBase.cpp b/src/Storages/System/StorageSystemPartsBase.cpp index f4dd9cbd45d..dc2a353de27 100644 --- a/src/Storages/System/StorageSystemPartsBase.cpp +++ b/src/Storages/System/StorageSystemPartsBase.cpp @@ -26,7 +26,7 @@ namespace ErrorCodes extern const int TABLE_IS_DROPPED; } -bool StorageSystemPartsBase::hasStateColumn(const Names & column_names, const StorageMetadataPtr & metadata_snapshot) const +bool StorageSystemPartsBase::hasStateColumn(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const { bool has_state_column = false; Names real_column_names; @@ -41,7 +41,7 @@ bool StorageSystemPartsBase::hasStateColumn(const Names & column_names, const St /// Do not check if only _state column is requested if (!(has_state_column && real_column_names.empty())) - metadata_snapshot->check(real_column_names, {}, getStorageID()); + storage_snapshot->check(real_column_names); return has_state_column; } @@ -235,14 +235,14 @@ StoragesInfo StoragesInfoStream::next() Pipe StorageSystemPartsBase::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t /*max_block_size*/, const unsigned /*num_streams*/) { - bool has_state_column = hasStateColumn(column_names, metadata_snapshot); + bool has_state_column = hasStateColumn(column_names, storage_snapshot); StoragesInfoStream stream(query_info, context); @@ -250,7 +250,7 @@ Pipe StorageSystemPartsBase::read( NameSet names_set(column_names.begin(), column_names.end()); - Block sample = metadata_snapshot->getSampleBlock(); + Block sample = storage_snapshot->metadata->getSampleBlock(); Block header; std::vector<UInt8> columns_mask(sample.columns()); diff --git a/src/Storages/System/StorageSystemPartsBase.h b/src/Storages/System/StorageSystemPartsBase.h index bf19771c940..39b6d0f033a 100644 --- a/src/Storages/System/StorageSystemPartsBase.h +++ b/src/Storages/System/StorageSystemPartsBase.h @@
-58,7 +58,7 @@ class StorageSystemPartsBase : public IStorage public: Pipe read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, @@ -70,7 +70,7 @@ public: bool isSystemStorage() const override { return true; } private: - bool hasStateColumn(const Names & column_names, const StorageMetadataPtr & metadata_snapshot) const; + bool hasStateColumn(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const; protected: const FormatSettings format_settings; diff --git a/src/Storages/System/StorageSystemReplicas.cpp b/src/Storages/System/StorageSystemReplicas.cpp index 467226c3b7a..e018ccc0733 100644 --- a/src/Storages/System/StorageSystemReplicas.cpp +++ b/src/Storages/System/StorageSystemReplicas.cpp @@ -61,14 +61,14 @@ StorageSystemReplicas::StorageSystemReplicas(const StorageID & table_id_) Pipe StorageSystemReplicas::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t /*max_block_size*/, const unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); const auto access = context->getAccess(); const bool check_access_for_databases = !access->isGranted(AccessType::SHOW_TABLES); @@ -149,7 +149,7 @@ Pipe StorageSystemReplicas::read( col_engine = filtered_block.getByName("engine").column; } - MutableColumns res_columns = metadata_snapshot->getSampleBlock().cloneEmptyColumns(); + MutableColumns res_columns = storage_snapshot->metadata->getSampleBlock().cloneEmptyColumns(); for (size_t i = 0, size = col_database->size(); i < size; ++i) { @@ -203,8 +203,6 @@ Pipe StorageSystemReplicas::read( res_columns[col_num++]->insert(std::move(replica_is_active_values)); } - Block header = metadata_snapshot->getSampleBlock(); - Columns fin_columns; fin_columns.reserve(res_columns.size()); @@ -218,7 +216,7 @@ Pipe StorageSystemReplicas::read( UInt64 num_rows = fin_columns.at(0)->size(); Chunk chunk(std::move(fin_columns), num_rows); - return Pipe(std::make_shared<SourceFromSingleChunk>(metadata_snapshot->getSampleBlock(), std::move(chunk))); + return Pipe(std::make_shared<SourceFromSingleChunk>(storage_snapshot->metadata->getSampleBlock(), std::move(chunk))); } diff --git a/src/Storages/System/StorageSystemReplicas.h b/src/Storages/System/StorageSystemReplicas.h index 500b4e97546..1b93d10367b 100644 --- a/src/Storages/System/StorageSystemReplicas.h +++ b/src/Storages/System/StorageSystemReplicas.h @@ -20,7 +20,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemStoragePolicies.cpp b/src/Storages/System/StorageSystemStoragePolicies.cpp index 036e4748e65..04c98e6be9c 100644 --- a/src/Storages/System/StorageSystemStoragePolicies.cpp +++ b/src/Storages/System/StorageSystemStoragePolicies.cpp @@ -38,14 +38,14 @@ StorageSystemStoragePolicies::StorageSystemStoragePolicies(const StorageID & tab Pipe StorageSystemStoragePolicies::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & /*query_info*/, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t /*max_block_size*/, const unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); MutableColumnPtr col_policy_name = ColumnString::create(); MutableColumnPtr col_volume_name = ColumnString::create(); @@ -89,7 +89,7 @@ Pipe StorageSystemStoragePolicies::read( UInt64 num_rows = res_columns.at(0)->size(); Chunk chunk(std::move(res_columns), num_rows); - return Pipe(std::make_shared<SourceFromSingleChunk>(metadata_snapshot->getSampleBlock(), std::move(chunk))); + return Pipe(std::make_shared<SourceFromSingleChunk>(storage_snapshot->metadata->getSampleBlock(), std::move(chunk))); } } diff --git a/src/Storages/System/StorageSystemStoragePolicies.h b/src/Storages/System/StorageSystemStoragePolicies.h index 28730ce33c4..e2890c42897 100644 --- a/src/Storages/System/StorageSystemStoragePolicies.h +++ b/src/Storages/System/StorageSystemStoragePolicies.h @@ -22,7 +22,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemTables.cpp b/src/Storages/System/StorageSystemTables.cpp index 9332bc6a004..98a07d0f4c3 100644 --- a/src/Storages/System/StorageSystemTables.cpp +++ b/src/Storages/System/StorageSystemTables.cpp @@ -556,20 +556,20 @@ private: Pipe StorageSystemTables::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum /*processed_stage*/, const size_t max_block_size, const unsigned /*num_streams*/) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); /// Create a mask of what columns are needed in the result.
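/// Same column-mask pattern as in StorageSystemColumns::read above: only the
/// requested subset of the sample block ends up in res_block.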
NameSet names_set(column_names.begin(), column_names.end()); - Block sample_block = metadata_snapshot->getSampleBlock(); + Block sample_block = storage_snapshot->metadata->getSampleBlock(); Block res_block; std::vector<UInt8> columns_mask(sample_block.columns()); diff --git a/src/Storages/System/StorageSystemTables.h b/src/Storages/System/StorageSystemTables.h index 23f3aedb164..7f6a099a824 100644 --- a/src/Storages/System/StorageSystemTables.h +++ b/src/Storages/System/StorageSystemTables.h @@ -20,7 +20,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/System/StorageSystemZeros.cpp b/src/Storages/System/StorageSystemZeros.cpp index 624fc54998c..b6a623c3071 100644 --- a/src/Storages/System/StorageSystemZeros.cpp +++ b/src/Storages/System/StorageSystemZeros.cpp @@ -92,14 +92,14 @@ StorageSystemZeros::StorageSystemZeros(const StorageID & table_id_, bool multith Pipe StorageSystemZeros::read( const Names & column_names, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo &, ContextPtr /*context*/, QueryProcessingStage::Enum /*processed_stage*/, size_t max_block_size, unsigned num_streams) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + storage_snapshot->check(column_names); bool use_multiple_streams = multithreaded; diff --git a/src/Storages/System/StorageSystemZeros.h b/src/Storages/System/StorageSystemZeros.h index f5b2bb43117..bf72352b7be 100644 --- a/src/Storages/System/StorageSystemZeros.h +++ b/src/Storages/System/StorageSystemZeros.h @@ -22,7 +22,7 @@ public: Pipe read( const Names & column_names, - const StorageMetadataPtr & /*metadata_snapshot*/, + const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, diff --git a/src/Storages/getStructureOfRemoteTable.cpp b/src/Storages/getStructureOfRemoteTable.cpp index 532abb8e2f3..8fa4d02e8e1 100644 --- a/src/Storages/getStructureOfRemoteTable.cpp +++ b/src/Storages/getStructureOfRemoteTable.cpp @@ -58,7 +58,6 @@ } ColumnsDescription res; - auto new_context = ClusterProxy::updateSettingsForCluster(cluster, context, context->getSettingsRef()); /// Expect only needed columns from the result of DESC TABLE. NOTE 'comment' column is ignored for compatibility reasons. @@ -150,4 +149,69 @@ ColumnsDescription getStructureOfRemoteTable( ErrorCodes::NO_REMOTE_SHARD_AVAILABLE); } +ColumnsDescriptionByShardNum getExtendedObjectsOfRemoteTables( + const Cluster & cluster, + const StorageID & remote_table_id, + const ColumnsDescription & storage_columns, + ContextPtr context) +{ + const auto & shards_info = cluster.getShardsInfo(); + auto query = "DESC TABLE " + remote_table_id.getFullTableName(); + + auto new_context = ClusterProxy::updateSettingsForCluster(cluster, context, context->getSettingsRef()); + new_context->setSetting("describe_extend_object_types", true); + + /// Expect only needed columns from the result of DESC TABLE.
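+    /// For illustration (values hypothetical): with describe_extend_object_types
+    /// enabled, a column declared as `data Object('json')` is described by the
+    /// shard with its concrete inferred type, e.g. `data Tuple(k1 Int8, k2 String)`.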
+ Block sample_block + { + { ColumnString::create(), std::make_shared<DataTypeString>(), "name" }, + { ColumnString::create(), std::make_shared<DataTypeString>(), "type" }, + }; + + auto execute_query_on_shard = [&](const auto & shard_info) + { + /// Execute remote query without restrictions (because it's not real user query, but part of implementation) + RemoteQueryExecutor executor(shard_info.pool, query, sample_block, new_context); + + executor.setPoolMode(PoolMode::GET_ONE); + executor.setMainTable(remote_table_id); + + ColumnsDescription res; + while (auto block = executor.read()) + { + const auto & name_col = *block.getByName("name").column; + const auto & type_col = *block.getByName("type").column; + + size_t size = name_col.size(); + for (size_t i = 0; i < size; ++i) + { + auto name = get<String>(name_col[i]); + auto type_name = get<String>(type_col[i]); + + auto storage_column = storage_columns.tryGetPhysical(name); + if (storage_column && isObject(storage_column->type)) + res.add(ColumnDescription(std::move(name), DataTypeFactory::instance().get(type_name))); + } + } + + return res; + }; + + ColumnsDescriptionByShardNum columns; + for (const auto & shard_info : shards_info) + { + auto res = execute_query_on_shard(shard_info); + + /// Expect at least some columns. + /// This is a hack to handle the empty block case returned by Connection when skip_unavailable_shards is set. + if (!res.empty()) + columns.emplace(shard_info.shard_num, std::move(res)); + } + + if (columns.empty()) + throw NetException("All attempts to get table structure failed", ErrorCodes::NO_REMOTE_SHARD_AVAILABLE); + + return columns; +} + } diff --git a/src/Storages/getStructureOfRemoteTable.h b/src/Storages/getStructureOfRemoteTable.h index 3f77236c756..62f93dccf1a 100644 --- a/src/Storages/getStructureOfRemoteTable.h +++ b/src/Storages/getStructureOfRemoteTable.h @@ -8,6 +8,7 @@ namespace DB { + class Context; struct StorageID; @@ -19,4 +20,14 @@ ColumnsDescription getStructureOfRemoteTable( ContextPtr context, const ASTPtr & table_func_ptr = nullptr); + +using ColumnsDescriptionByShardNum = std::unordered_map<UInt32, ColumnsDescription>; + +/// Returns descriptions of columns of type Object for each shard.
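+/// A minimal usage sketch (names hypothetical), e.g. from a Distributed table:
+///
+///     auto objects_by_shard = getExtendedObjectsOfRemoteTables(
+///         *cluster, remote_table_id, metadata->getColumns(), context);
+///
+/// Shards may report different concrete types for the same Object column,
+/// so callers are expected to unify the per-shard descriptions before use.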
+ColumnsDescriptionByShardNum getExtendedObjectsOfRemoteTables( + const Cluster & cluster, + const StorageID & remote_table_id, + const ColumnsDescription & storage_columns, + ContextPtr context); + } diff --git a/src/Storages/tests/gtest_storage_log.cpp b/src/Storages/tests/gtest_storage_log.cpp index a48b764b62c..4cda9d6c9f5 100644 --- a/src/Storages/tests/gtest_storage_log.cpp +++ b/src/Storages/tests/gtest_storage_log.cpp @@ -117,15 +117,16 @@ std::string readData(DB::StoragePtr & table, const DB::ContextPtr context) { using namespace DB; auto metadata_snapshot = table->getInMemoryMetadataPtr(); + auto storage_snapshot = table->getStorageSnapshot(metadata_snapshot); Names column_names; column_names.push_back("a"); SelectQueryInfo query_info; QueryProcessingStage::Enum stage = table->getQueryProcessingStage( - context, QueryProcessingStage::Complete, metadata_snapshot, query_info); + context, QueryProcessingStage::Complete, storage_snapshot, query_info); - QueryPipeline pipeline(table->read(column_names, metadata_snapshot, query_info, context, stage, 8192, 1)); + QueryPipeline pipeline(table->read(column_names, storage_snapshot, query_info, context, stage, 8192, 1)); Block sample; { diff --git a/src/TableFunctions/TableFunctionValues.cpp b/src/TableFunctions/TableFunctionValues.cpp index 07019d26067..595e8f9cf41 100644 --- a/src/TableFunctions/TableFunctionValues.cpp +++ b/src/TableFunctions/TableFunctionValues.cpp @@ -109,7 +109,7 @@ void TableFunctionValues::parseArguments(const ASTPtr & ast_function, ContextPtr "Cannot determine common structure for {} function arguments: the amount of columns is differ for different arguments", getName()); for (size_t j = 0; j != arg_types.size(); ++j) - data_types[j] = getLeastSupertype({data_types[j], arg_types[j]}); + data_types[j] = getLeastSupertype(DataTypes{data_types[j], arg_types[j]}); } NamesAndTypesList names_and_types; diff --git a/tests/integration/helpers/0_common_instance_config.xml b/tests/integration/helpers/0_common_instance_config.xml index 71a2f8f4b13..493366b1209 100644 --- a/tests/integration/helpers/0_common_instance_config.xml +++ b/tests/integration/helpers/0_common_instance_config.xml @@ -1,5 +1,5 @@ <clickhouse> - <timezone>Europe/Moscow</timezone> + <timezone>Etc/UTC</timezone> <listen_host>0.0.0.0</listen_host> <custom_settings_prefixes>custom_</custom_settings_prefixes> <path>/var/lib/clickhouse/</path> diff --git a/tests/integration/helpers/external_sources.py b/tests/integration/helpers/external_sources.py index 32ebdfa58c6..93247e7b617 100644 --- a/tests/integration/helpers/external_sources.py +++ b/tests/integration/helpers/external_sources.py @@ -10,8 +10,6 @@ import pymongo import pymysql.cursors import redis import logging -from tzlocal import get_localzone - class ExternalSource(object): def __init__(self, name, internal_hostname, internal_port, @@ -166,8 +164,9 @@ class SourceMongo(ExternalSource): if field.field_type == "Date": self.converters[field.name] = lambda x: datetime.datetime.strptime(x, "%Y-%m-%d") elif field.field_type == "DateTime": - self.converters[field.name] = lambda x: get_localzone().localize( - datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S")) + def converter(x): + return datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') + self.converters[field.name] = converter else: self.converters[field.name] = lambda x: x @@ -482,8 +481,7 @@ class SourceCassandra(ExternalSource): if type == 'UUID': return uuid.UUID(value) elif type == 'DateTime': - local_datetime = datetime.datetime.strptime(value, '%Y-%m-%d %H:%M:%S') - return get_localzone().localize(local_datetime) + return datetime.datetime.strptime(value, '%Y-%m-%d %H:%M:%S') return value
def load_data(self, data, table_name): diff --git a/tests/integration/test_config_corresponding_root/configs/config.xml b/tests/integration/test_config_corresponding_root/configs/config.xml index 5a0179fa25f..e1a1c1c75df 100644 --- a/tests/integration/test_config_corresponding_root/configs/config.xml +++ b/tests/integration/test_config_corresponding_root/configs/config.xml @@ -129,20 +129,6 @@ default - - - diff --git a/tests/integration/test_config_xml_full/configs/config.xml b/tests/integration/test_config_xml_full/configs/config.xml index 76eceedbcea..4c0b8275869 100644 --- a/tests/integration/test_config_xml_full/configs/config.xml +++ b/tests/integration/test_config_xml_full/configs/config.xml @@ -490,20 +490,6 @@ default - - - diff --git a/tests/integration/test_config_xml_yaml_mix/configs/config.d/0_common_instance_config.yaml b/tests/integration/test_config_xml_yaml_mix/configs/config.d/0_common_instance_config.yaml index 62e4ba8c744..8603c85e940 100644 --- a/tests/integration/test_config_xml_yaml_mix/configs/config.d/0_common_instance_config.yaml +++ b/tests/integration/test_config_xml_yaml_mix/configs/config.d/0_common_instance_config.yaml @@ -1,4 +1,4 @@ -timezone: Europe/Moscow +timezone: Etc/UTC listen_host: 0.0.0.0 custom_settings_prefixes: custom_ path: /var/lib/clickhouse/ diff --git a/tests/integration/test_config_yaml_full/configs/config.d/0_common_instance_config.yaml b/tests/integration/test_config_yaml_full/configs/config.d/0_common_instance_config.yaml index 62e4ba8c744..8603c85e940 100644 --- a/tests/integration/test_config_yaml_full/configs/config.d/0_common_instance_config.yaml +++ b/tests/integration/test_config_yaml_full/configs/config.d/0_common_instance_config.yaml @@ -1,4 +1,4 @@ -timezone: Europe/Moscow +timezone: Etc/UTC listen_host: 0.0.0.0 custom_settings_prefixes: custom_ path: /var/lib/clickhouse/ diff --git a/tests/integration/test_config_yaml_main/configs/config.d/0_common_instance_config.yaml b/tests/integration/test_config_yaml_main/configs/config.d/0_common_instance_config.yaml index 62e4ba8c744..8603c85e940 100644 --- a/tests/integration/test_config_yaml_main/configs/config.d/0_common_instance_config.yaml +++ b/tests/integration/test_config_yaml_main/configs/config.d/0_common_instance_config.yaml @@ -1,4 +1,4 @@ -timezone: Europe/Moscow +timezone: Etc/UTC listen_host: 0.0.0.0 custom_settings_prefixes: custom_ path: /var/lib/clickhouse/ diff --git a/tests/integration/test_dictionaries_all_layouts_separate_sources/common.py b/tests/integration/test_dictionaries_all_layouts_separate_sources/common.py index a3d0e8a019b..20d086afe8c 100644 --- a/tests/integration/test_dictionaries_all_layouts_separate_sources/common.py +++ b/tests/integration/test_dictionaries_all_layouts_separate_sources/common.py @@ -175,7 +175,9 @@ class SimpleLayoutTester(BaseLayoutTester): # print query if isinstance(answer, list): answer = str(answer).replace(' ', '') - assert node.query(query) == str(answer) + '\n' + answer = str(answer) + '\n' + node_answer = node.query(query) + assert str(node_answer).strip() == answer.strip(), f"Expected '{answer.strip()}', got '{node_answer.strip()}' in query '{query}'" class ComplexLayoutTester(BaseLayoutTester): @@ -210,7 +212,9 @@ class ComplexLayoutTester(BaseLayoutTester): for query, answer in queries_with_answers: # print query - assert node.query(query) == str(answer) + '\n' + node_answer = node.query(query) + answer = str(answer) + '\n' + assert node_answer == answer, f"Expected '{answer.strip()}', got '{node_answer.strip()}' in query 
'{query}'" class RangedLayoutTester(BaseLayoutTester): @@ -240,5 +244,6 @@ class RangedLayoutTester(BaseLayoutTester): for query, answer in queries_with_answers: # print query - assert node.query(query) == str(answer) + '\n' - + node_answer = node.query(query) + answer = str(answer) + '\n' + assert node_answer == answer, f"Expected '{answer.strip()}', got '{node_answer.strip()}' in query '{query}'" diff --git a/tests/integration/test_distributed_type_object/__init__.py b/tests/integration/test_distributed_type_object/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_distributed_type_object/configs/remote_servers.xml b/tests/integration/test_distributed_type_object/configs/remote_servers.xml new file mode 100644 index 00000000000..ebce4697529 --- /dev/null +++ b/tests/integration/test_distributed_type_object/configs/remote_servers.xml @@ -0,0 +1,18 @@ + + + + + + node1 + 9000 + + + + + node2 + 9000 + + + + + diff --git a/tests/integration/test_distributed_type_object/test.py b/tests/integration/test_distributed_type_object/test.py new file mode 100644 index 00000000000..faf509c46cd --- /dev/null +++ b/tests/integration/test_distributed_type_object/test.py @@ -0,0 +1,57 @@ +import pytest + +from helpers.cluster import ClickHouseCluster +from helpers.test_tools import TSV + +cluster = ClickHouseCluster(__file__) + +node1 = cluster.add_instance('node1', main_configs=['configs/remote_servers.xml']) +node2 = cluster.add_instance('node2', main_configs=['configs/remote_servers.xml']) + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + for node in (node1, node2): + node.query("CREATE TABLE local_table(id UInt32, data JSON) ENGINE = MergeTree ORDER BY id", settings={"allow_experimental_object_type": 1}) + node.query("CREATE TABLE dist_table AS local_table ENGINE = Distributed(test_cluster, default, local_table)", settings={"allow_experimental_object_type": 1}) + + yield cluster + + finally: + cluster.shutdown() + + +def test_distributed_type_object(started_cluster): + node1.query('INSERT INTO local_table FORMAT JSONEachRow {"id": 1, "data": {"k1": 10}}') + node2.query('INSERT INTO local_table FORMAT JSONEachRow {"id": 2, "data": {"k1": 20}}') + + expected = TSV("10\n20\n") + assert TSV(node1.query("SELECT data.k1 FROM dist_table ORDER BY id")) == expected + + node1.query('INSERT INTO local_table FORMAT JSONEachRow {"id": 3, "data": {"k1": "str1"}}') + + expected = TSV("10\n20\nstr1\n") + assert TSV(node1.query("SELECT data.k1 FROM dist_table ORDER BY id")) == expected + + node1.query('INSERT INTO local_table FORMAT JSONEachRow {"id": 4, "data": {"k2": 30}}') + + expected = TSV("10\t0\n20\t0\nstr1\t0\n\t30") + assert TSV(node1.query("SELECT data.k1, data.k2 FROM dist_table ORDER BY id")) == expected + + expected = TSV("120\n") + assert TSV(node1.query("SELECT sum(data.k2 * id) FROM dist_table")) == expected + + node1.query("TRUNCATE TABLE local_table") + node2.query("TRUNCATE TABLE local_table") + + node1.query('INSERT INTO local_table FORMAT JSONEachRow {"id": 1, "data": {"k1": "aa", "k2": {"k3": "bb", "k4": "c"}}} {"id": 2, "data": {"k1": "ee", "k5": "ff"}};') + node2.query('INSERT INTO local_table FORMAT JSONEachRow {"id": 3, "data": {"k5":"foo"}};') + + expected = TSV(""" +1\taa\tbb\tc\t +2\tee\t\t\tff +3\t\t\t\tfoo""") + + assert TSV(node1.query("SELECT id, data.k1, data.k2.k3, data.k2.k4, data.k5 FROM dist_table ORDER BY id")) == expected diff --git a/tests/integration/test_dotnet_client/configs/config.xml 
b/tests/integration/test_dotnet_client/configs/config.xml index 9bcadc43f10..fe64e47d384 100644 --- a/tests/integration/test_dotnet_client/configs/config.xml +++ b/tests/integration/test_dotnet_client/configs/config.xml @@ -1,5 +1,6 @@ + trace /var/log/clickhouse-server/clickhouse-server.log diff --git a/tests/integration/test_dotnet_client/dotnet.reference b/tests/integration/test_dotnet_client/dotnet.reference index a3d6e1d5ba8..8f1d786a237 100644 Binary files a/tests/integration/test_dotnet_client/dotnet.reference and b/tests/integration/test_dotnet_client/dotnet.reference differ diff --git a/tests/integration/test_graphite_merge_tree/configs/graphite_rollup.xml b/tests/integration/test_graphite_merge_tree/configs/graphite_rollup.xml index 9373aed9b14..4ccdfa477c3 100644 --- a/tests/integration/test_graphite_merge_tree/configs/graphite_rollup.xml +++ b/tests/integration/test_graphite_merge_tree/configs/graphite_rollup.xml @@ -1,4 +1,6 @@ + + Asia/Istanbul metric diff --git a/tests/integration/test_graphite_merge_tree_typed/configs/graphite_rollup.xml b/tests/integration/test_graphite_merge_tree_typed/configs/graphite_rollup.xml index c716540a61c..3369c9621e4 100644 --- a/tests/integration/test_graphite_merge_tree_typed/configs/graphite_rollup.xml +++ b/tests/integration/test_graphite_merge_tree_typed/configs/graphite_rollup.xml @@ -1,4 +1,6 @@ + + Asia/Istanbul metric diff --git a/tests/integration/test_https_replication/configs/config.xml b/tests/integration/test_https_replication/configs/config.xml index 4b8a61bc20b..4b3088d21e1 100644 --- a/tests/integration/test_https_replication/configs/config.xml +++ b/tests/integration/test_https_replication/configs/config.xml @@ -122,20 +122,6 @@ default - - - diff --git a/tests/integration/test_materialized_mysql_database/configs/timezone_config.xml b/tests/integration/test_materialized_mysql_database/configs/timezone_config.xml new file mode 100644 index 00000000000..42369fdf488 --- /dev/null +++ b/tests/integration/test_materialized_mysql_database/configs/timezone_config.xml @@ -0,0 +1,3 @@ + + Asia/Istanbul + diff --git a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py index fef2b8a6ffb..48d577a9250 100644 --- a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py +++ b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py @@ -27,7 +27,8 @@ def check_query(clickhouse_node, query, result_set, retry_count=10, interval_sec logging.debug(f"check_query retry {i+1} exception {e}") time.sleep(interval_seconds) else: - assert clickhouse_node.query(query) == result_set + result_got = clickhouse_node.query(query) + assert result_got == result_set, f"Got result {result_got}, while expected result {result_set}" def dml_with_materialized_mysql_database(clickhouse_node, mysql_node, service_name): @@ -1229,17 +1230,14 @@ def materialized_database_mysql_date_type_to_date32(clickhouse_node, mysql_node, # can't support date that less than 1925 year for now mysql_node.query("INSERT INTO test_database.a VALUES(1, '1900-04-16')") # test date that is older than 1925 - mysql_node.query("INSERT INTO test_database.a VALUES(2, '1925-03-16')") mysql_node.query("INSERT INTO test_database.a VALUES(3, '1971-02-16')") mysql_node.query("INSERT INTO test_database.a VALUES(4, '2101-05-16')") clickhouse_node.query("CREATE DATABASE test_database ENGINE = MaterializedMySQL('{}:3306', 'test_database', 'root', 
'clickhouse')".format(service_name)) - check_query(clickhouse_node, "SELECT b from test_database.a order by a FORMAT TSV", "1970-01-01\n1925-03-16\n1971-02-16\n2101-05-16\n") + check_query(clickhouse_node, "SELECT b from test_database.a order by a FORMAT TSV", "1970-01-01\n1971-02-16\n2101-05-16\n") - mysql_node.query("INSERT INTO test_database.a VALUES(5, '1925-04-16')") mysql_node.query("INSERT INTO test_database.a VALUES(6, '2022-02-16')") - mysql_node.query("INSERT INTO test_database.a VALUES(7, '2283-11-11')") - - check_query(clickhouse_node, "SELECT b from test_database.a order by a FORMAT TSV", "1970-01-01\n1925-03-16\n1971-02-16\n2101-05-16\n1925-04-16\n2022-02-16\n" + - "2283-11-11\n") + mysql_node.query("INSERT INTO test_database.a VALUES(7, '2104-06-06')") + check_query(clickhouse_node, "SELECT b from test_database.a order by a FORMAT TSV", "1970-01-01\n1971-02-16\n2101-05-16\n2022-02-16\n" + + "2104-06-06\n") diff --git a/tests/integration/test_materialized_mysql_database/test.py b/tests/integration/test_materialized_mysql_database/test.py index 027f874596d..f4fb957a547 100644 --- a/tests/integration/test_materialized_mysql_database/test.py +++ b/tests/integration/test_materialized_mysql_database/test.py @@ -17,9 +17,9 @@ cluster = ClickHouseCluster(__file__) mysql_node = None mysql8_node = None -node_db = cluster.add_instance('node1', user_configs=["configs/users.xml"], with_mysql=True, with_mysql8=True, stay_alive=True) -node_disable_bytes_settings = cluster.add_instance('node2', user_configs=["configs/users_disable_bytes_settings.xml"], with_mysql=False, stay_alive=True) -node_disable_rows_settings = cluster.add_instance('node3', user_configs=["configs/users_disable_rows_settings.xml"], with_mysql=False, stay_alive=True) +node_db = cluster.add_instance('node1', main_configs=["configs/timezone_config.xml"], user_configs=["configs/users.xml"], with_mysql=True, with_mysql8=True, stay_alive=True) +node_disable_bytes_settings = cluster.add_instance('node2', main_configs=["configs/timezone_config.xml"], user_configs=["configs/users_disable_bytes_settings.xml"], with_mysql=False, stay_alive=True) +node_disable_rows_settings = cluster.add_instance('node3', main_configs=["configs/timezone_config.xml"], user_configs=["configs/users_disable_rows_settings.xml"], with_mysql=False, stay_alive=True) @pytest.fixture(scope="module") @@ -264,4 +264,4 @@ def test_materialized_database_settings_materialized_mysql_tables_list(started_c def test_materialized_database_mysql_date_type_to_date32(started_cluster, started_mysql_8_0, started_mysql_5_7, clickhouse_node): materialize_with_ddl.materialized_database_mysql_date_type_to_date32(clickhouse_node, started_mysql_8_0, "mysql80") - materialize_with_ddl.materialized_database_mysql_date_type_to_date32(clickhouse_node, started_mysql_5_7, "mysql57") \ No newline at end of file + materialize_with_ddl.materialized_database_mysql_date_type_to_date32(clickhouse_node, started_mysql_5_7, "mysql57") diff --git a/tests/integration/test_odbc_interaction/test.py b/tests/integration/test_odbc_interaction/test.py index 8d3a8773bc4..613d6a98030 100644 --- a/tests/integration/test_odbc_interaction/test.py +++ b/tests/integration/test_odbc_interaction/test.py @@ -480,9 +480,9 @@ def test_odbc_postgres_conversions(started_cluster): node1.query( """INSERT INTO test_types - SELECT toDateTime64('2019-01-01 00:00:00', 3, 'Europe/Moscow'), toDecimal32(1.1, 1)""") + SELECT toDateTime64('2019-01-01 00:00:00', 3, 'Etc/UTC'), toDecimal32(1.1, 1)""") - expected = 
node1.query("SELECT toDateTime64('2019-01-01 00:00:00', 3, 'Europe/Moscow'), toDecimal32(1.1, 1)") + expected = node1.query("SELECT toDateTime64('2019-01-01 00:00:00', 3, 'Etc/UTC'), toDecimal32(1.1, 1)") result = node1.query("SELECT * FROM test_types") logging.debug(result) cursor.execute("DROP TABLE IF EXISTS clickhouse.test_types") diff --git a/tests/integration/test_postgresql_database_engine/test.py b/tests/integration/test_postgresql_database_engine/test.py index 7cd632cae6e..855f365a438 100644 --- a/tests/integration/test_postgresql_database_engine/test.py +++ b/tests/integration/test_postgresql_database_engine/test.py @@ -226,6 +226,10 @@ def test_predefined_connection_configuration(started_cluster): node1.query("DROP DATABASE IF EXISTS postgres_database") node1.query("CREATE DATABASE postgres_database ENGINE = PostgreSQL(postgres1)") + + result = node1.query("select create_table_query from system.tables where database ='postgres_database'") + assert(result.strip().endswith("ENGINE = PostgreSQL(postgres1, table = \\'test_table\\')")) + node1.query("INSERT INTO postgres_database.test_table SELECT number, number from numbers(100)") assert (node1.query(f"SELECT count() FROM postgres_database.test_table").rstrip() == '100') diff --git a/tests/integration/test_storage_kafka/test.py b/tests/integration/test_storage_kafka/test.py index 8326797f96d..7a6f27ffa0a 100644 --- a/tests/integration/test_storage_kafka/test.py +++ b/tests/integration/test_storage_kafka/test.py @@ -3322,7 +3322,7 @@ def test_issue26643(kafka_cluster): CREATE TABLE test.log ( - `tnow` DateTime CODEC(DoubleDelta, LZ4), + `tnow` DateTime('Asia/Istanbul') CODEC(DoubleDelta, LZ4), `server` LowCardinality(String), `client` LowCardinality(String), `sPort` LowCardinality(UInt16), diff --git a/tests/performance/classification.xml b/tests/performance/classification.xml index 370e2c49d29..9c55a6c7f29 100644 --- a/tests/performance/classification.xml +++ b/tests/performance/classification.xml @@ -7,14 +7,14 @@ hits_100m_single - SELECT detectLanguage(SearchPhrase) FROM hits_100m_single FORMAT Null - SELECT detectLanguageMixed(SearchPhrase) FROM hits_100m_single FORMAT Null + SELECT detectLanguage(SearchPhrase) FROM hits_100m_single LIMIT 10000000 FORMAT Null + SELECT detectLanguageMixed(SearchPhrase) FROM hits_100m_single LIMIT 10000000 FORMAT Null SELECT detectTonality(SearchPhrase) FROM hits_100m_single FORMAT Null - SELECT detectProgrammingLanguage(SearchPhrase) FROM hits_100m_single FORMAT Null - SELECT detectLanguageUnknown(SearchPhrase) FROM hits_100m_single FORMAT Null - SELECT detectCharset(SearchPhrase) FROM hits_100m_single FORMAT Null + SELECT detectProgrammingLanguage(SearchPhrase) FROM hits_100m_single LIMIT 10000000 FORMAT Null + SELECT detectLanguageUnknown(SearchPhrase) FROM hits_100m_single LIMIT 500000 FORMAT Null + SELECT detectCharset(SearchPhrase) FROM hits_100m_single LIMIT 500000 FORMAT Null diff --git a/tests/performance/merge_tree_many_partitions.xml b/tests/performance/merge_tree_many_partitions.xml index 2a8a52943a3..5de6061abf3 100644 --- a/tests/performance/merge_tree_many_partitions.xml +++ b/tests/performance/merge_tree_many_partitions.xml @@ -1,11 +1,13 @@ - CREATE TABLE bad_partitions (x UInt64) ENGINE = MergeTree PARTITION BY x ORDER BY x - INSERT INTO bad_partitions SELECT * FROM numbers(10000) - 0 + 1 + 20G + CREATE TABLE bad_partitions (x UInt64) ENGINE = MergeTree PARTITION BY x ORDER BY x + INSERT INTO bad_partitions SELECT * FROM numbers(10000) + SELECT count() FROM bad_partitions DROP 
TABLE IF EXISTS bad_partitions diff --git a/tests/performance/merge_tree_many_partitions_2.xml b/tests/performance/merge_tree_many_partitions_2.xml index 6799153ed65..a265713269f 100644 --- a/tests/performance/merge_tree_many_partitions_2.xml +++ b/tests/performance/merge_tree_many_partitions_2.xml @@ -1,14 +1,14 @@ - CREATE TABLE bad_partitions (a UInt64, b UInt64, c UInt64, d UInt64, e UInt64, f UInt64, g UInt64, h UInt64, i UInt64, j UInt64, k UInt64, l UInt64, m UInt64, n UInt64, o UInt64, p UInt64, q UInt64, r UInt64, s UInt64, t UInt64, u UInt64, v UInt64, w UInt64, x UInt64, y UInt64, z UInt64) ENGINE = MergeTree PARTITION BY x ORDER BY x - INSERT INTO bad_partitions (x) SELECT * FROM numbers_mt(3000) - - - 0 + 1 + 20G + CREATE TABLE bad_partitions (a UInt64, b UInt64, c UInt64, d UInt64, e UInt64, f UInt64, g UInt64, h UInt64, i UInt64, j UInt64, k UInt64, l UInt64, m UInt64, n UInt64, o UInt64, p UInt64, q UInt64, r UInt64, s UInt64, t UInt64, u UInt64, v UInt64, w UInt64, x UInt64, y UInt64, z UInt64) ENGINE = MergeTree PARTITION BY x ORDER BY x + INSERT INTO bad_partitions (x) SELECT * FROM numbers_mt(3000) + SELECT sum(ignore(*)) FROM bad_partitions DROP TABLE IF EXISTS bad_partitions diff --git a/tests/performance/parallel_final.xml b/tests/performance/parallel_final.xml index bd6a921fc68..775926d1ee8 100644 --- a/tests/performance/parallel_final.xml +++ b/tests/performance/parallel_final.xml @@ -2,7 +2,8 @@ 1024 - 16 + 1 + 20G diff --git a/tests/performance/read_in_order_many_parts.xml b/tests/performance/read_in_order_many_parts.xml index 065d12fadd2..d0b2c0e87c8 100644 --- a/tests/performance/read_in_order_many_parts.xml +++ b/tests/performance/read_in_order_many_parts.xml @@ -5,6 +5,7 @@ 2000 10000000 8 + 15G diff --git a/tests/queries/0_stateless/00076_ip_coding_functions.sql b/tests/queries/0_stateless/00076_ip_coding_functions.sql index 659267c61ed..f693b336e57 100644 --- a/tests/queries/0_stateless/00076_ip_coding_functions.sql +++ b/tests/queries/0_stateless/00076_ip_coding_functions.sql @@ -1,3 +1,5 @@ +SET cast_ipv4_ipv6_default_on_conversion_error = 1; + select IPv4StringToNum('') == 0; select IPv4StringToNum(materialize('')) == 0; select IPv4StringToNum('not an ip string') == 0; diff --git a/tests/queries/0_stateless/00938_ipv6_cidr_range.sql b/tests/queries/0_stateless/00938_ipv6_cidr_range.sql index 3fa4c7c5d3f..1ceefa8cfb3 100644 --- a/tests/queries/0_stateless/00938_ipv6_cidr_range.sql +++ b/tests/queries/0_stateless/00938_ipv6_cidr_range.sql @@ -9,7 +9,7 @@ SELECT 'tests'; DROP TABLE IF EXISTS ipv6_range; CREATE TABLE ipv6_range(ip IPv6, cidr UInt8) ENGINE = Memory; -INSERT INTO ipv6_range (ip, cidr) VALUES (IPv6StringToNum('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 0), (IPv6StringToNum('2001:0db8:0000:85a3:ffff:ffff:ffff:ffff'), 32), (IPv6StringToNum('ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff'), 16), (IPv6StringToNum('2001:df8:0:85a3::ac1f:8001'), 32), (IPv6StringToNum('2001:0db8:85a3:85a3:0000:0000:ac1f:8001'), 16), (IPv6StringToNum('0000:0000:0000:0000:0000:0000:0000:0000'), 8), (IPv6StringToNum('ffff:0000:0000:0000:0000:0000:0000:0000'), 4); +INSERT INTO ipv6_range (ip, cidr) VALUES ('2001:0db8:0000:85a3:0000:0000:ac1f:8001', 0), ('2001:0db8:0000:85a3:ffff:ffff:ffff:ffff', 32), ('ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff', 16), ('2001:df8:0:85a3::ac1f:8001', 32), ('2001:0db8:85a3:85a3:0000:0000:ac1f:8001', 16), ('0000:0000:0000:0000:0000:0000:0000:0000', 8), ('ffff:0000:0000:0000:0000:0000:0000:0000', 4); WITH 
IPv6CIDRToRange(IPv6StringToNum('2001:0db8:0000:85a3:0000:0000:ac1f:8001'), 32) as ip_range SELECT COUNT(*) FROM ipv6_range WHERE ip BETWEEN tupleElement(ip_range, 1) AND tupleElement(ip_range, 2); diff --git a/tests/queries/0_stateless/01018_ip_dictionary_long.sql b/tests/queries/0_stateless/01018_ip_dictionary_long.sql index 7d9dfeb1bae..647c36429cc 100644 --- a/tests/queries/0_stateless/01018_ip_dictionary_long.sql +++ b/tests/queries/0_stateless/01018_ip_dictionary_long.sql @@ -44,7 +44,7 @@ LAYOUT(IP_TRIE()) LIFETIME(MIN 10 MAX 100); -- fuzzer -SELECT '127.0.0.0/24' = dictGetString('database_for_dict.dict_ipv4_trie', 'prefixprefixprefixprefix', tuple(IPv4StringToNum('127.0.0.0127.0.0.0'))); -- { serverError 36 } +SELECT '127.0.0.0/24' = dictGetString('database_for_dict.dict_ipv4_trie', 'prefixprefixprefixprefix', tuple(IPv4StringToNumOrDefault('127.0.0.0127.0.0.0'))); -- { serverError 36 } SELECT 0 == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'asn', tuple(IPv4StringToNum('0.0.0.0'))); SELECT 1 == dictGetUInt32('database_for_dict.dict_ipv4_trie', 'asn', tuple(IPv4StringToNum('128.0.0.0'))); diff --git a/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.sql b/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.sql index b0900073151..a5090551c89 100644 --- a/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.sql +++ b/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.sql @@ -134,8 +134,8 @@ create table data_01756_str (key String) engine=Memory(); create table dist_01756_str as data_01756_str engine=Distributed(test_cluster_two_shards, currentDatabase(), data_01756_str, cityHash64(key)); select * from dist_01756_str where key in ('0', '2'); select * from dist_01756_str where key in ('0', Null); -- { serverError 507 } -select * from dist_01756_str where key in (0, 2); -- { serverError 53 } -select * from dist_01756_str where key in (0, Null); -- { serverError 53 } +-- select * from dist_01756_str where key in (0, 2); -- { serverError 53 } +-- select * from dist_01756_str where key in (0, Null); -- { serverError 53 } -- different type #2 select 'different types -- conversion'; diff --git a/tests/queries/0_stateless/01825_type_json_1.reference b/tests/queries/0_stateless/01825_type_json_1.reference new file mode 100644 index 00000000000..857c624fb9b --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_1.reference @@ -0,0 +1,27 @@ +1 aa bb c +2 ee ff +3 foo +all_1_1_0 data Tuple(k1 String, k2 Tuple(k3 String, k4 String), k5 String) +all_2_2_0 data Tuple(k5 String) +all_1_2_1 data Tuple(k1 String, k2 Tuple(k3 String, k4 String), k5 String) +============ +1 ['aaa','ddd'] [['bbb','ccc'],['eee','fff']] +all_3_3_0 data Tuple(k1 Nested(k2 String, k3 Nested(k4 String))) +============ +1 a 42 +2 b 4200 +4242 +all_4_4_0 data Tuple(name String, value Int16) +1 a 42 +2 b 4200 +3 a 42.123 +all_4_4_0 data Tuple(name String, value Int16) +all_5_5_0 data Tuple(name String, value Float64) +1 a 42 +2 b 4200 +3 a 42.123 +4 a some +all_4_4_0 data Tuple(name String, value Int16) +all_5_5_0 data Tuple(name String, value Float64) +all_6_6_0 data Tuple(name String, value String) +all_4_6_1 data Tuple(name String, value String) diff --git a/tests/queries/0_stateless/01825_type_json_1.sql b/tests/queries/0_stateless/01825_type_json_1.sql new file mode 100644 index 00000000000..e74faf2d4c7 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_1.sql @@ -0,0 +1,85 @@ +-- Tags: no-fasttest + +SET 
allow_experimental_object_type = 1; + +DROP TABLE IF EXISTS t_json; + +CREATE TABLE t_json(id UInt64, data Object('JSON')) +ENGINE = MergeTree ORDER BY tuple(); + +SYSTEM STOP MERGES t_json; + +INSERT INTO t_json FORMAT JSONEachRow {"id": 1, "data": {"k1": "aa", "k2": {"k3": "bb", "k4": "c"}}} {"id": 2, "data": {"k1": "ee", "k5": "ff"}}; +INSERT INTO t_json FORMAT JSONEachRow {"id": 3, "data": {"k5":"foo"}}; + +SELECT id, data.k1, data.k2.k3, data.k2.k4, data.k5 FROM t_json ORDER BY id; + +SELECT name, column, type +FROM system.parts_columns +WHERE table = 't_json' AND database = currentDatabase() AND active AND column = 'data' +ORDER BY name; + +SYSTEM START MERGES t_json; + +OPTIMIZE TABLE t_json FINAL; + +SELECT name, column, type +FROM system.parts_columns +WHERE table = 't_json' AND database = currentDatabase() AND active AND column = 'data' +ORDER BY name; + +SELECT '============'; +TRUNCATE TABLE t_json; + +INSERT INTO t_json FORMAT JSONEachRow {"id": 1, "data": {"k1":[{"k2":"aaa","k3":[{"k4":"bbb"},{"k4":"ccc"}]},{"k2":"ddd","k3":[{"k4":"eee"},{"k4":"fff"}]}]}}; +SELECT id, data.k1.k2, data.k1.k3.k4 FROM t_json ORDER BY id; + +SELECT name, column, type +FROM system.parts_columns +WHERE table = 't_json' AND database = currentDatabase() AND active AND column = 'data' +ORDER BY name; + +SELECT '============'; +TRUNCATE TABLE t_json; + +SYSTEM STOP MERGES t_json; + +INSERT INTO t_json FORMAT JSONEachRow {"id": 1, "data": {"name": "a", "value": 42 }}, {"id": 2, "data": {"name": "b", "value": 4200 }}; + +SELECT id, data.name, data.value FROM t_json ORDER BY id; +SELECT sum(data.value) FROM t_json; + +SELECT name, column, type +FROM system.parts_columns +WHERE table = 't_json' AND database = currentDatabase() AND active AND column = 'data' +ORDER BY name; + +INSERT INTO t_json FORMAT JSONEachRow {"id": 3, "data": {"name": "a", "value": 42.123 }}; + +SELECT id, data.name, data.value FROM t_json ORDER BY id; + +SELECT name, column, type +FROM system.parts_columns +WHERE table = 't_json' AND database = currentDatabase() AND active AND column = 'data' +ORDER BY name; + +INSERT INTO t_json FORMAT JSONEachRow {"id": 4, "data": {"name": "a", "value": "some" }}; + +SELECT id, data.name, data.value FROM t_json ORDER BY id; + +SELECT name, column, type +FROM system.parts_columns +WHERE table = 't_json' AND database = currentDatabase() AND active AND column = 'data' +ORDER BY name; + +SYSTEM START MERGES t_json; +OPTIMIZE TABLE t_json FINAL; + +SELECT name, column, type +FROM system.parts_columns +WHERE table = 't_json' AND database = currentDatabase() AND active AND column = 'data' +ORDER BY name; + +DROP TABLE IF EXISTS t_json; + +CREATE TABLE t_json(id UInt64, data Object('JSON')) ENGINE = Log; -- { serverError 44 } diff --git a/tests/queries/0_stateless/01825_type_json_2.reference b/tests/queries/0_stateless/01825_type_json_2.reference new file mode 100644 index 00000000000..8524035a3a4 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_2.reference @@ -0,0 +1,24 @@ +1 (1,2,0) Tuple(k1 Int8, k2 Int8, k3 Int8) +2 (0,3,4) Tuple(k1 Int8, k2 Int8, k3 Int8) +1 1 2 0 +2 0 3 4 +1 (1,2,'0') Tuple(k1 Int8, k2 Int8, k3 String) +2 (0,3,'4') Tuple(k1 Int8, k2 Int8, k3 String) +3 (0,0,'10') Tuple(k1 Int8, k2 Int8, k3 String) +4 (0,5,'str') Tuple(k1 Int8, k2 Int8, k3 String) +1 1 2 0 +2 0 3 4 +3 0 0 10 +4 0 5 str +============ +1 ([1,2,3.3]) Tuple(k1 Array(Float64)) +1 [1,2,3.3] +1 (['1','2','3.3']) Tuple(k1 Array(String)) +2 (['a','4','b']) Tuple(k1 Array(String)) +1 ['1','2','3.3'] +2 
['a','4','b'] +============ +1 ([(11,0,0),(0,22,0)]) Tuple(k1 Nested(k2 Int8, k3 Int8, k4 Int8)) +2 ([(0,33,0),(0,0,44),(0,55,66)]) Tuple(k1 Nested(k2 Int8, k3 Int8, k4 Int8)) +1 [11,0] [0,22] [0,0] +2 [0,0,0] [33,0,55] [0,44,66] diff --git a/tests/queries/0_stateless/01825_type_json_2.sql b/tests/queries/0_stateless/01825_type_json_2.sql new file mode 100644 index 00000000000..d2d26ce4106 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_2.sql @@ -0,0 +1,41 @@ +-- Tags: no-fasttest + +SET allow_experimental_object_type = 1; + +DROP TABLE IF EXISTS t_json_2; + +CREATE TABLE t_json_2(id UInt64, data Object('JSON')) +ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test_01825_2/t_json_2', 'r1') ORDER BY tuple(); + +INSERT INTO t_json_2 FORMAT JSONEachRow {"id": 1, "data": {"k1": 1, "k2" : 2}} {"id": 2, "data": {"k2": 3, "k3" : 4}}; + +SELECT id, data, toTypeName(data) FROM t_json_2 ORDER BY id; +SELECT id, data.k1, data.k2, data.k3 FROM t_json_2 ORDER BY id; + +INSERT INTO t_json_2 FORMAT JSONEachRow {"id": 3, "data": {"k3" : 10}} {"id": 4, "data": {"k2": 5, "k3" : "str"}}; + +SELECT id, data, toTypeName(data) FROM t_json_2 ORDER BY id; +SELECT id, data.k1, data.k2, data.k3 FROM t_json_2 ORDER BY id; + +SELECT '============'; +TRUNCATE TABLE t_json_2; + +INSERT INTO TABLE t_json_2 FORMAT JSONEachRow {"id": 1, "data": {"k1" : [1, 2, 3.3]}}; + +SELECT id, data, toTypeName(data) FROM t_json_2 ORDER BY id; +SELECT id, data.k1 FROM t_json_2 ORDEr BY id; + +INSERT INTO TABLE t_json_2 FORMAT JSONEachRow {"id": 2, "data": {"k1" : ["a", 4, "b"]}}; + +SELECT id, data, toTypeName(data) FROM t_json_2 ORDER BY id; +SELECT id, data.k1 FROM t_json_2 ORDER BY id; + +SELECT '============'; +TRUNCATE TABLE t_json_2; + +INSERT INTO TABLE t_json_2 FORMAT JSONEachRow {"id": 1, "data": {"k1" : [{"k2" : 11}, {"k3" : 22}]}} {"id": 2, "data": {"k1" : [{"k3" : 33}, {"k4" : 44}, {"k3" : 55, "k4" : 66}]}}; + +SELECT id, data, toTypeName(data) FROM t_json_2 ORDER BY id; +SELECT id, data.k1.k2, data.k1.k3, data.k1.k4 FROM t_json_2 ORDER BY id; + +DROP TABLE t_json_2; diff --git a/tests/queries/0_stateless/01825_type_json_3.reference.j2 b/tests/queries/0_stateless/01825_type_json_3.reference.j2 new file mode 100644 index 00000000000..23f38b74fd1 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_3.reference.j2 @@ -0,0 +1,35 @@ +{% for engine in ["ReplicatedMergeTree('/clickhouse/tables/{database}/test_01825_3/t_json_3', 'r1') ORDER BY tuple()", "Memory"] -%} +1 ('',0) Tuple(k1 String, k2 Int8) +2 ('v1',2) Tuple(k1 String, k2 Int8) +1 0 +2 v1 2 +======== +1 ([]) Tuple(k1 Nested(k2 String, k3 String)) +2 ([('v1','v3'),('v4','')]) Tuple(k1 Nested(k2 String, k3 String)) +1 [] [] +2 ['v1','v4'] ['v3',''] +1 ([]) Tuple(k1 Nested(k2 String, k3 String)) +2 ([('v1','v3'),('v4','')]) Tuple(k1 Nested(k2 String, k3 String)) +3 ([]) Tuple(k1 Nested(k2 String, k3 String)) +4 ([]) Tuple(k1 Nested(k2 String, k3 String)) +1 [] [] +2 ['v1','v4'] ['v3',''] +3 [] [] +4 [] [] +{%- if 'MergeTree' in engine %} +all_2_2_0 data Tuple(k1 Nested(k2 String, k3 String)) +all_3_3_0 data Tuple(_dummy UInt8) +data Tuple(k1 Nested(k2 String, k3 String)) +{%- endif %} +1 [] [] +2 ['v1','v4'] ['v3',''] +3 [] [] +4 [] [] +======== +1 ((1,'foo'),[]) Tuple(k1 Tuple(k2 Int8, k3 String), k4 Array(Int8)) +2 ((0,''),[1,2,3]) Tuple(k1 Tuple(k2 Int8, k3 String), k4 Array(Int8)) +3 ((10,''),[]) Tuple(k1 Tuple(k2 Int8, k3 String), k4 Array(Int8)) +1 1 foo [] +2 0 [1,2,3] +3 10 [] +{% endfor -%} diff --git 
a/tests/queries/0_stateless/01825_type_json_3.sql.j2 b/tests/queries/0_stateless/01825_type_json_3.sql.j2 new file mode 100644 index 00000000000..62d86c3efd4 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_3.sql.j2 @@ -0,0 +1,61 @@ +-- Tags: no-fasttest + +{% for engine in ["ReplicatedMergeTree('/clickhouse/tables/{database}/test_01825_3/t_json_3', 'r1') ORDER BY tuple()", "Memory"] -%} + +SET allow_experimental_object_type = 1; + +DROP TABLE IF EXISTS t_json_3; + +CREATE TABLE t_json_3(id UInt64, data JSON) +ENGINE = {{ engine }}; + +{% if 'MergeTree' in engine %} + SYSTEM STOP MERGES t_json_3; +{% endif %} + +INSERT INTO t_json_3 FORMAT JSONEachRow {"id": 1, "data": {"k1": null}}, {"id": 2, "data": {"k1": "v1", "k2" : 2}}; + +SELECT id, data, toTypeName(data) FROM t_json_3 ORDER BY id; +SELECT id, data.k1, data.k2 FROM t_json_3 ORDER BY id; + +SELECT '========'; +TRUNCATE TABLE t_json_3; + +INSERT INTO t_json_3 FORMAT JSONEachRow {"id": 1, "data": {"k1" : []}} {"id": 2, "data": {"k1" : [{"k2" : "v1", "k3" : "v3"}, {"k2" : "v4"}]}}; + +SELECT id, data, toTypeName(data) FROM t_json_3 ORDER BY id; +SELECT id, `data.k1.k2`, `data.k1.k3` FROM t_json_3 ORDER BY id; + +INSERT INTO t_json_3 FORMAT JSONEachRow {"id": 3, "data": {"k1" : []}} {"id": 4, "data": {"k1" : []}}; + +SELECT id, data, toTypeName(data) FROM t_json_3 ORDER BY id; +SELECT id, data.k1.k2, data.k1.k3 FROM t_json_3 ORDER BY id; + +{% if 'MergeTree' in engine %} + SELECT name, column, type + FROM system.parts_columns + WHERE table = 't_json_3' AND database = currentDatabase() AND active AND column = 'data' + ORDER BY name; + + SYSTEM START MERGES t_json_3; + OPTIMIZE TABLE t_json_3 FINAL; + + SELECT column, type + FROM system.parts_columns + WHERE table = 't_json_3' AND database = currentDatabase() AND active AND column = 'data' + ORDER BY name; +{% endif %} + +SELECT id, data.k1.k2, data.k1.k3 FROM t_json_3 ORDER BY id; + +SELECT '========'; +TRUNCATE TABLE t_json_3; + +INSERT INTO t_json_3 FORMAT JSONEachRow {"id": 1, "data": {"k1" : {"k2" : 1, "k3" : "foo"}}} {"id": 2, "data": {"k1" : null, "k4" : [1, 2, 3]}}, {"id" : 3, "data": {"k1" : {"k2" : 10}, "k4" : []}}; + +SELECT id, data, toTypeName(data) FROM t_json_3 ORDER BY id; +SELECT id, data.k1.k2, data.k1.k3, data.k4 FROM t_json_3 ORDER BY id; + +DROP TABLE t_json_3; + +{% endfor -%} diff --git a/tests/queries/0_stateless/01825_type_json_4.reference b/tests/queries/0_stateless/01825_type_json_4.reference new file mode 100644 index 00000000000..1b23bf2213e --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_4.reference @@ -0,0 +1,5 @@ +Code: 645 +Code: 15 +Code: 53 +1 ('v1') Tuple(k1 String) +1 v1 diff --git a/tests/queries/0_stateless/01825_type_json_4.sh b/tests/queries/0_stateless/01825_type_json_4.sh new file mode 100755 index 00000000000..4d81e9516c9 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_4.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS t_json_4" + +$CLICKHOUSE_CLIENT -q "CREATE TABLE t_json_4(id UInt64, data JSON) \ +ENGINE = MergeTree ORDER BY tuple()" --allow_experimental_object_type 1 + +echo '{"id": 1, "data": {"k1": "v1"}}, {"id": 2, "data": {"k1": [1, 2]}}' \ + | $CLICKHOUSE_CLIENT -q "INSERT INTO t_json_4 FORMAT JSONEachRow" 2>&1 | grep -o -m1 "Code: 645" + +echo '{"id": 1, "data": {"k1": "v1"}}, {"id": 2, "data": {"k1": [{"k2" : 1}, {"k2" : 2}]}}' \ + | $CLICKHOUSE_CLIENT -q "INSERT INTO t_json_4 FORMAT JSONEachRow" 2>&1 | grep -o -m1 "Code: 15" + +echo '{"id": 1, "data": {"k1": "v1"}}' \ + | $CLICKHOUSE_CLIENT -q "INSERT INTO t_json_4 FORMAT JSONEachRow" + +echo '{"id": 2, "data": {"k1": [1, 2]}}' \ + | $CLICKHOUSE_CLIENT -q "INSERT INTO t_json_4 FORMAT JSONEachRow" 2>&1 | grep -o -m1 "Code: 53" + +$CLICKHOUSE_CLIENT -q "SELECT id, data, toTypeName(data) FROM t_json_4" +$CLICKHOUSE_CLIENT -q "SELECT id, data.k1 FROM t_json_4 ORDER BY id" + +$CLICKHOUSE_CLIENT -q "DROP TABLE t_json_4" diff --git a/tests/queries/0_stateless/01825_type_json_5.reference b/tests/queries/0_stateless/01825_type_json_5.reference new file mode 100644 index 00000000000..4ac0aa26ffd --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_5.reference @@ -0,0 +1,5 @@ +{"a.b":1,"a.c":2} +{"s":{"a.b":1,"a.c":2}} +1 [22,33] +2 qqq [44] +Tuple(k1 Int8, k2 Tuple(k3 String, k4 Array(Int8))) diff --git a/tests/queries/0_stateless/01825_type_json_5.sql b/tests/queries/0_stateless/01825_type_json_5.sql new file mode 100644 index 00000000000..b939a960e32 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_5.sql @@ -0,0 +1,23 @@ +-- Tags: no-fasttest + +SET allow_experimental_object_type = 1; + +SELECT '{"a": {"b": 1, "c": 2}}'::JSON AS s; +SELECT '{"a": {"b": 1, "c": 2}}'::JSON AS s format JSONEachRow; + +DROP TABLE IF EXISTS t_json_5; +DROP TABLE IF EXISTS t_json_str_5; + +CREATE TABLE t_json_str_5 (data String) ENGINE = MergeTree ORDER BY tuple(); +CREATE TABLE t_json_5 (data JSON) ENGINE = MergeTree ORDER BY tuple(); + +INSERT INTO t_json_str_5 FORMAT JSONAsString {"k1": 1, "k2": {"k4": [22, 33]}}, {"k1": 2, "k2": {"k3": "qqq", "k4": [44]}} +; + +INSERT INTO t_json_5 SELECT data FROM t_json_str_5; + +SELECT data.k1, data.k2.k3, data.k2.k4 FROM t_json_5 ORDER BY data.k1; +SELECT DISTINCT toTypeName(data) FROM t_json_5; + +DROP TABLE t_json_5; +DROP TABLE t_json_str_5; diff --git a/tests/queries/0_stateless/01825_type_json_6.reference b/tests/queries/0_stateless/01825_type_json_6.reference new file mode 100644 index 00000000000..7fcd2a40826 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_6.reference @@ -0,0 +1,3 @@ +Tuple(key String, out Nested(outputs Nested(index Int32, n Int8), type Int8, value Int8)) +v1 [0,0] [1,2] [[],[1960131]] [[],[0]] +v2 [1,1] [4,3] [[1881212],[]] [[1],[]] diff --git a/tests/queries/0_stateless/01825_type_json_6.sh b/tests/queries/0_stateless/01825_type_json_6.sh new file mode 100755 index 00000000000..8bbb1abee4a --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_6.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS t_json_6;" + +$CLICKHOUSE_CLIENT -q "CREATE TABLE t_json_6 (data JSON) ENGINE = MergeTree ORDER BY tuple()" --allow_experimental_object_type 1 + +cat < notEmpty(x), outpoints)" + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS btc" diff --git a/tests/queries/0_stateless/01825_type_json_describe.reference b/tests/queries/0_stateless/01825_type_json_describe.reference new file mode 100644 index 00000000000..629b60cb629 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_describe.reference @@ -0,0 +1,3 @@ +data Object(\'json\') +data Tuple(k1 Int8) +data Tuple(k1 String, k2 Array(Int8)) diff --git a/tests/queries/0_stateless/01825_type_json_describe.sql b/tests/queries/0_stateless/01825_type_json_describe.sql new file mode 100644 index 00000000000..cd7c4ff8c8c --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_describe.sql @@ -0,0 +1,21 @@ +-- Tags: no-fasttest + +SET allow_experimental_object_type = 1; + + +DROP TABLE IF EXISTS t_json_desc; + +CREATE TABLE t_json_desc (data JSON) ENGINE = MergeTree ORDER BY tuple(); + +INSERT INTO t_json_desc FORMAT JSONAsObject {"k1": 10} +; + +DESC TABLE t_json_desc; +DESC TABLE t_json_desc SETTINGS describe_extend_object_types = 1; + +INSERT INTO t_json_desc FORMAT JSONAsObject {"k1": "q", "k2": [1, 2, 3]} +; + +DESC TABLE t_json_desc SETTINGS describe_extend_object_types = 1; + +DROP TABLE IF EXISTS t_json_desc; diff --git a/tests/queries/0_stateless/01825_type_json_distributed.reference b/tests/queries/0_stateless/01825_type_json_distributed.reference new file mode 100644 index 00000000000..9ae85ac888c --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_distributed.reference @@ -0,0 +1,4 @@ +(2,('qqq',[44,55])) Tuple(k1 Int8, k2 Tuple(k3 String, k4 Array(Int8))) +(2,('qqq',[44,55])) Tuple(k1 Int8, k2 Tuple(k3 String, k4 Array(Int8))) +2 qqq [44,55] +2 qqq [44,55] diff --git a/tests/queries/0_stateless/01825_type_json_distributed.sql b/tests/queries/0_stateless/01825_type_json_distributed.sql new file mode 100644 index 00000000000..70cc0743556 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_distributed.sql @@ -0,0 +1,18 @@ +-- Tags: no-fasttest + +SET allow_experimental_object_type = 1; + +DROP TABLE IF EXISTS t_json_local; +DROP TABLE IF EXISTS t_json_dist; + +CREATE TABLE t_json_local(data JSON) ENGINE = MergeTree ORDER BY tuple(); +CREATE TABLE t_json_dist AS t_json_local ENGINE = Distributed(test_cluster_two_shards, currentDatabase(), t_json_local); + +INSERT INTO t_json_local FORMAT JSONAsObject {"k1": 2, "k2": {"k3": "qqq", "k4": [44, 55]}} +; + +SELECT data, toTypeName(data) FROM t_json_dist; +SELECT data.k1, data.k2.k3, data.k2.k4 FROM t_json_dist; + +DROP TABLE IF EXISTS t_json_local; +DROP TABLE IF EXISTS t_json_dist; diff --git a/tests/queries/0_stateless/01825_type_json_field.reference b/tests/queries/0_stateless/01825_type_json_field.reference new file mode 100644 index 00000000000..b5637b1fbb7 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_field.reference @@ -0,0 +1,12 @@ +1 10 a +Tuple(a UInt8, s String) +1 10 a 0 +2 sss b 300 +3 20 c 0 +Tuple(a String, b UInt16, s String) +1 10 a 0 +2 sss b 300 +3 20 c 0 +4 30 400 +5 0 qqq 0 foo +Tuple(a String, b UInt16, s String, t String) diff --git a/tests/queries/0_stateless/01825_type_json_field.sql b/tests/queries/0_stateless/01825_type_json_field.sql new file mode 100644 index 00000000000..6c906023cef --- /dev/null +++ 
b/tests/queries/0_stateless/01825_type_json_field.sql @@ -0,0 +1,28 @@ +-- Tags: no-fasttest + +SET allow_experimental_object_type = 1; + +DROP TABLE IF EXISTS t_json_field; + +CREATE TABLE t_json_field (id UInt32, data JSON) +ENGINE = MergeTree ORDER BY tuple(); + +INSERT INTO t_json_field VALUES (1, (10, 'a')::Tuple(a UInt32, s String)); + +SELECT id, data.a, data.s FROM t_json_field ORDER BY id; +SELECT DISTINCT toTypeName(data) FROM t_json_field; + +INSERT INTO t_json_field VALUES (2, ('sss', 300, 'b')::Tuple(a String, b UInt64, s String)), (3, (20, 'c')::Tuple(a UInt32, s String)); + +SELECT id, data.a, data.s, data.b FROM t_json_field ORDER BY id; +SELECT DISTINCT toTypeName(data) FROM t_json_field; + +INSERT INTO t_json_field VALUES (4, map('a', 30, 'b', 400)), (5, map('s', 'qqq', 't', 'foo')); + +SELECT id, data.a, data.s, data.b, data.t FROM t_json_field ORDER BY id; +SELECT DISTINCT toTypeName(data) FROM t_json_field; + +INSERT INTO t_json_field VALUES (6, map(1, 2, 3, 4)); -- { clientError 53 } +INSERT INTO t_json_field VALUES (6, (1, 2, 3)); -- { clientError 53 } + +DROP TABLE t_json_field; diff --git a/tests/queries/0_stateless/01825_type_json_ghdata.reference b/tests/queries/0_stateless/01825_type_json_ghdata.reference new file mode 100644 index 00000000000..c11e9c2dfd9 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_ghdata.reference @@ -0,0 +1,22 @@ +5000 +900 +String 562 +Array 134 +UInt64 63 +Tuple 52 +Int32 47 +Int8 17 +Int16 15 +Nested 9 +Int64 1 +leonardomso/33-js-concepts 3 +ytdl-org/youtube-dl 3 +Bogdanp/neko 2 +bminossi/AllVideoPocsFromHackerOne 2 +disclose/diodata 2 +Commit 182 +chipeo345 119 +phanwi346 114 +Nicholas Piggin 95 +direwolf-github 49 +2 diff --git a/tests/queries/0_stateless/01825_type_json_ghdata.sh b/tests/queries/0_stateless/01825_type_json_ghdata.sh new file mode 100755 index 00000000000..7486571cc22 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_ghdata.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS ghdata" + +${CLICKHOUSE_CLIENT} -q "CREATE TABLE ghdata (data JSON) ENGINE = MergeTree ORDER BY tuple()" --allow_experimental_object_type 1 + +cat $CUR_DIR/data_json/ghdata_sample.json | ${CLICKHOUSE_CLIENT} -q "INSERT INTO ghdata FORMAT JSONAsObject" + +${CLICKHOUSE_CLIENT} -q "SELECT count() FROM ghdata WHERE NOT ignore(*)" + +${CLICKHOUSE_CLIENT} -q \ +"SELECT length(subcolumns.names) \ + FROM system.parts_columns \ + WHERE table = 'ghdata' AND database = '$CLICKHOUSE_DATABASE'" + +${CLICKHOUSE_CLIENT} -q "WITH position(full_type, '(') AS pos +SELECT if(pos = 0, full_type, substring(full_type, 1, pos - 1)) AS type, count() AS c \ + FROM system.parts_columns ARRAY JOIN subcolumns.types AS full_type \ + WHERE table = 'ghdata' AND database = '$CLICKHOUSE_DATABASE' \ + GROUP BY type ORDER BY c DESC" + +${CLICKHOUSE_CLIENT} -q \ +"SELECT data.repo.name, count() AS stars FROM ghdata \ + WHERE data.type = 'WatchEvent' GROUP BY data.repo.name ORDER BY stars DESC, data.repo.name LIMIT 5" + +${CLICKHOUSE_CLIENT} -q \ +"SELECT data.payload.commits.author.name AS name, count() AS c FROM ghdata \ + ARRAY JOIN data.payload.commits.author.name \ + GROUP BY name ORDER BY c DESC, name LIMIT 5" + +${CLICKHOUSE_CLIENT} -q "SELECT max(data.payload.pull_request.assignees.size0) FROM ghdata" + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS ghdata" diff --git a/tests/queries/0_stateless/01825_type_json_insert_select.reference b/tests/queries/0_stateless/01825_type_json_insert_select.reference new file mode 100644 index 00000000000..8283cc5af48 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_insert_select.reference @@ -0,0 +1,12 @@ +Tuple(k1 Int8, k2 String) +1 (1,'foo') +Tuple(k1 Int8, k2 String, k3 String) +1 (1,'foo','') +2 (2,'bar','') +3 (3,'','aaa') +Tuple(arr Nested(k11 Int8, k22 String, k33 Int8), k1 Int8, k2 String, k3 String) +1 ([],1,'foo','') +2 ([],2,'bar','') +3 ([],3,'','aaa') +4 ([(5,'6',0),(7,'0',8)],0,'','') +5 ([(0,'str1',0)],0,'','') diff --git a/tests/queries/0_stateless/01825_type_json_insert_select.sql b/tests/queries/0_stateless/01825_type_json_insert_select.sql new file mode 100644 index 00000000000..8bb03f84f5a --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_insert_select.sql @@ -0,0 +1,36 @@ +-- Tags: no-fasttest + +SET allow_experimental_object_type = 1; + +DROP TABLE IF EXISTS type_json_src; +DROP TABLE IF EXISTS type_json_dst; + +CREATE TABLE type_json_src (id UInt32, data JSON) ENGINE = MergeTree ORDER BY id; +CREATE TABLE type_json_dst AS type_json_src; + +INSERT INTO type_json_src VALUES (1, '{"k1": 1, "k2": "foo"}'); +INSERT INTO type_json_dst SELECT * FROM type_json_src; + +SELECT DISTINCT toTypeName(data) FROM type_json_dst; +SELECT id, data FROM type_json_dst ORDER BY id; + +INSERT INTO type_json_src VALUES (2, '{"k1": 2, "k2": "bar"}') (3, '{"k1": 3, "k3": "aaa"}'); +INSERT INTO type_json_dst SELECT * FROM type_json_src WHERE id > 1; + +SELECT DISTINCT toTypeName(data) FROM type_json_dst; +SELECT id, data FROM type_json_dst ORDER BY id; + +INSERT INTO type_json_dst VALUES (4, '{"arr": [{"k11": 5, "k22": 6}, {"k11": 7, "k33": 8}]}'); + +INSERT INTO type_json_src VALUES (5, '{"arr": "not array"}'); +INSERT INTO type_json_dst SELECT * FROM type_json_src WHERE id = 5; -- { serverError 15 } + +TRUNCATE TABLE type_json_src; +INSERT INTO type_json_src VALUES (5, '{"arr": [{"k22": "str1"}]}') +INSERT INTO type_json_dst SELECT * FROM type_json_src WHERE id = 5; + +SELECT 
DISTINCT toTypeName(data) FROM type_json_dst; +SELECT id, data FROM type_json_dst ORDER BY id; + +DROP TABLE type_json_src; +DROP TABLE type_json_dst; diff --git a/tests/queries/0_stateless/01825_type_json_nbagames.reference b/tests/queries/0_stateless/01825_type_json_nbagames.reference new file mode 100644 index 00000000000..8f86bfe613e --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_nbagames.reference @@ -0,0 +1,12 @@ +1000 +data Tuple(_id Tuple(`$oid` String), date Tuple(`$date` String), teams Nested(abbreviation String, city String, home UInt64, name String, players Nested(ast Int8, blk Int8, drb Int8, fg Int8, fg3 Int8, fg3_pct String, fg3a Int8, fg_pct String, fga Int8, ft Int8, ft_pct String, fta Int8, mp String, orb Int8, pf Int8, player String, pts Int8, stl Int8, tov Int8, trb Int8), results Tuple(ast Int8, blk Int8, drb Int8, fg Int8, fg3 Int8, fg3_pct String, fg3a Int8, fg_pct String, fga Int8, ft Int8, ft_pct String, fta Int8, mp Int16, orb Int8, pf Int8, pts Int16, stl Int8, tov Int8, trb Int8), score Int16, won Int8)) +Boston Celtics 70 +Los Angeles Lakers 64 +Milwaukee Bucks 61 +Philadelphia 76ers 57 +Atlanta Hawks 55 +Larry Bird 10 +Clyde Drexler 4 +Alvin Robertson 3 +Magic Johnson 3 +Charles Barkley 2 diff --git a/tests/queries/0_stateless/01825_type_json_nbagames.sh b/tests/queries/0_stateless/01825_type_json_nbagames.sh new file mode 100755 index 00000000000..18e7c050680 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_nbagames.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS nbagames" + +${CLICKHOUSE_CLIENT} -q "CREATE TABLE nbagames (data JSON) ENGINE = MergeTree ORDER BY tuple()" --allow_experimental_object_type 1 + +cat $CUR_DIR/data_json/nbagames_sample.json | ${CLICKHOUSE_CLIENT} -q "INSERT INTO nbagames FORMAT JSONAsObject" + +${CLICKHOUSE_CLIENT} -q "SELECT count() FROM nbagames WHERE NOT ignore(*)" +${CLICKHOUSE_CLIENT} -q "DESC nbagames SETTINGS describe_extend_object_types = 1" + +${CLICKHOUSE_CLIENT} -q \ + "SELECT teams.name AS name, sum(teams.won) AS wins FROM nbagames \ + ARRAY JOIN data.teams AS teams GROUP BY name \ + ORDER BY wins DESC LIMIT 5;" + +${CLICKHOUSE_CLIENT} -q \ +"SELECT player, sum(triple_double) AS triple_doubles FROM \ +( \ + SELECT \ + tupleElement(players, 'player') AS player, \ + ((tupleElement(players, 'pts') >= 10) + \ + (tupleElement(players, 'ast') >= 10) + \ + (tupleElement(players, 'blk') >= 10) + \ + (tupleElement(players, 'stl') >= 10) + \ + (tupleElement(players, 'trb') >= 10)) >= 3 AS triple_double \ + FROM \ + ( \ + SELECT arrayJoin(arrayJoin(data.teams.players)) as players from nbagames \ + ) \ +) \ +GROUP BY player ORDER BY triple_doubles DESC, player LIMIT 5" + + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS nbagames" diff --git a/tests/queries/0_stateless/01825_type_json_nullable.reference b/tests/queries/0_stateless/01825_type_json_nullable.reference new file mode 100644 index 00000000000..587fb1b1bc9 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_nullable.reference @@ -0,0 +1,17 @@ +1 (1,2,NULL) Tuple(k1 Nullable(Int8), k2 Nullable(Int8), k3 Nullable(Int8)) +2 (NULL,3,4) Tuple(k1 Nullable(Int8), k2 Nullable(Int8), k3 Nullable(Int8)) +1 1 2 \N +2 \N 3 4 +1 (1,2,NULL) Tuple(k1 Nullable(Int8), k2 Nullable(Int8), k3 Nullable(String)) +2 (NULL,3,'4') Tuple(k1 Nullable(Int8), k2 
Nullable(Int8), k3 Nullable(String)) +3 (NULL,NULL,'10') Tuple(k1 Nullable(Int8), k2 Nullable(Int8), k3 Nullable(String)) +4 (NULL,5,'str') Tuple(k1 Nullable(Int8), k2 Nullable(Int8), k3 Nullable(String)) +1 1 2 \N +2 \N 3 4 +3 \N \N 10 +4 \N 5 str +============ +1 ([(11,NULL,NULL),(NULL,22,NULL)]) Tuple(k1 Nested(k2 Nullable(Int8), k3 Nullable(Int8), k4 Nullable(Int8))) +2 ([(NULL,33,NULL),(NULL,NULL,44),(NULL,55,66)]) Tuple(k1 Nested(k2 Nullable(Int8), k3 Nullable(Int8), k4 Nullable(Int8))) +1 [11,NULL] [NULL,22] [NULL,NULL] +2 [NULL,NULL,NULL] [33,NULL,55] [NULL,44,66] diff --git a/tests/queries/0_stateless/01825_type_json_nullable.sql b/tests/queries/0_stateless/01825_type_json_nullable.sql new file mode 100644 index 00000000000..65589243f43 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_nullable.sql @@ -0,0 +1,28 @@ +-- Tags: no-fasttest + +SET allow_experimental_object_type = 1; + +DROP TABLE IF EXISTS t_json_null; + +CREATE TABLE t_json_null(id UInt64, data Object(Nullable('JSON'))) +ENGINE = MergeTree ORDER BY tuple(); + +INSERT INTO t_json_null FORMAT JSONEachRow {"id": 1, "data": {"k1": 1, "k2" : 2}} {"id": 2, "data": {"k2": 3, "k3" : 4}}; + +SELECT id, data, toTypeName(data) FROM t_json_null ORDER BY id; +SELECT id, data.k1, data.k2, data.k3 FROM t_json_null ORDER BY id; + +INSERT INTO t_json_null FORMAT JSONEachRow {"id": 3, "data": {"k3" : 10}} {"id": 4, "data": {"k2": 5, "k3" : "str"}}; + +SELECT id, data, toTypeName(data) FROM t_json_null ORDER BY id; +SELECT id, data.k1, data.k2, data.k3 FROM t_json_null ORDER BY id; + +SELECT '============'; +TRUNCATE TABLE t_json_null; + +INSERT INTO TABLE t_json_null FORMAT JSONEachRow {"id": 1, "data": {"k1" : [{"k2" : 11}, {"k3" : 22}]}} {"id": 2, "data": {"k1" : [{"k3" : 33}, {"k4" : 44}, {"k3" : 55, "k4" : 66}]}}; + +SELECT id, data, toTypeName(data) FROM t_json_null ORDER BY id; +SELECT id, data.k1.k2, data.k1.k3, data.k1.k4 FROM t_json_null ORDER BY id; + +DROP TABLE t_json_null; diff --git a/tests/queries/0_stateless/01825_type_json_schema_race_long.reference b/tests/queries/0_stateless/01825_type_json_schema_race_long.reference new file mode 100644 index 00000000000..d86bac9de59 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_schema_race_long.reference @@ -0,0 +1 @@ +OK diff --git a/tests/queries/0_stateless/01825_type_json_schema_race_long.sh b/tests/queries/0_stateless/01825_type_json_schema_race_long.sh new file mode 100755 index 00000000000..38d1432cef6 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_schema_race_long.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, long + +set -e + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS t_json_race" +$CLICKHOUSE_CLIENT -q "CREATE TABLE t_json_race (data JSON) ENGINE = MergeTree ORDER BY tuple()" --allow_experimental_object_type 1 + +function test_case() +{ + $CLICKHOUSE_CLIENT -q "TRUNCATE TABLE t_json_race" + + echo '{"data": {"k1": 1, "k2": 2}}' | $CLICKHOUSE_CLIENT -q "INSERT INTO t_json_race FORMAT JSONEachRow" + + pids=() + for _ in {1..5}; do + $CLICKHOUSE_CLIENT -q "SELECT * FROM t_json_race WHERE 0 IN (SELECT sleep(0.05)) FORMAT Null" & + pids+=($!) 
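+        # Descriptive note (comment only, not behavior): each background SELECT
+        # sleeps ~50 ms inside the query via sleep(0.05), so all five readers are
+        # still in flight when the INSERT below flips data.k1 from a number to a
+        # String -- the concurrent schema change this race test exercises.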
+ done + + echo '{"data": {"k1": "str", "k2": "str1"}}' | $CLICKHOUSE_CLIENT -q "INSERT INTO t_json_race FORMAT JSONEachRow" & + + for pid in "${pids[@]}"; do + wait "$pid" || exit 1 + done +} + +for _ in {1..30}; do test_case; done + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS t_json_race" +echo OK diff --git a/tests/queries/0_stateless/02007_ipv4_and_ipv6_to_and_from_string.reference b/tests/queries/0_stateless/02007_ipv4_and_ipv6_to_and_from_string.reference index 8a4df1605fb..8da82a0726f 100644 --- a/tests/queries/0_stateless/02007_ipv4_and_ipv6_to_and_from_string.reference +++ b/tests/queries/0_stateless/02007_ipv4_and_ipv6_to_and_from_string.reference @@ -4,3 +4,4 @@ 2001:db8:0:85a3::ac1f:8001 String 0.0.0.0 IPv4 :: IPv6 +::ffff:1.1.1.1 IPv6 diff --git a/tests/queries/0_stateless/02007_ipv4_and_ipv6_to_and_from_string.sql b/tests/queries/0_stateless/02007_ipv4_and_ipv6_to_and_from_string.sql index 2fcc20b9811..b303d580e72 100644 --- a/tests/queries/0_stateless/02007_ipv4_and_ipv6_to_and_from_string.sql +++ b/tests/queries/0_stateless/02007_ipv4_and_ipv6_to_and_from_string.sql @@ -4,10 +4,10 @@ SELECT CAST(toIPv4('127.0.0.1') as String) as v, toTypeName(v); SELECT CAST('2001:0db8:0000:85a3:0000:0000:ac1f:8001' as IPv6) as v, toTypeName(v); SELECT CAST(toIPv6('2001:0db8:0000:85a3:0000:0000:ac1f:8001') as String) as v, toTypeName(v); -SELECT toIPv4('hello') as v, toTypeName(v); -SELECT toIPv6('hello') as v, toTypeName(v); +SELECT toIPv4OrDefault('hello') as v, toTypeName(v); +SELECT toIPv6OrDefault('hello') as v, toTypeName(v); SELECT CAST('hello' as IPv4) as v, toTypeName(v); -- { serverError CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING } SELECT CAST('hello' as IPv6) as v, toTypeName(v); -- { serverError CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING } -SELECT CAST('1.1.1.1' as IPv6) as v, toTypeName(v); -- { serverError CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING } +SELECT CAST('1.1.1.1' as IPv6) as v, toTypeName(v); diff --git a/tests/queries/0_stateless/02158_proportions_ztest.reference b/tests/queries/0_stateless/02158_proportions_ztest.reference new file mode 100644 index 00000000000..3c5c5a13a34 --- /dev/null +++ b/tests/queries/0_stateless/02158_proportions_ztest.reference @@ -0,0 +1,2 @@ +(-0.20656724435948853,0.8363478437079654,-0.09345975390115283,0.07563797172293502) +(-0.20656724435948853,0.8363478437079654,-0.09345975390115283,0.07563797172293502) diff --git a/tests/queries/0_stateless/02158_proportions_ztest.sql b/tests/queries/0_stateless/02158_proportions_ztest.sql new file mode 100644 index 00000000000..bda50b43a97 --- /dev/null +++ b/tests/queries/0_stateless/02158_proportions_ztest.sql @@ -0,0 +1,13 @@ +SELECT proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled'); + +DROP TABLE IF EXISTS proportions_ztest; +CREATE TABLE proportions_ztest (sx UInt64, sy UInt64, tx UInt64, ty UInt64) Engine = Memory(); +INSERT INTO proportions_ztest VALUES (10, 11, 100, 101); +SELECT proportionsZTest(sx, sy, tx, ty, 0.95, 'unpooled') FROM proportions_ztest; +DROP TABLE IF EXISTS proportions_ztest; + + +SELECT + NULL, + proportionsZTest(257, 1048575, 1048575, 257, -inf, NULL), + proportionsZTest(1024, 1025, 2, 2, 'unpooled'); -- { serverError 43 } \ No newline at end of file diff --git a/tests/queries/0_stateless/02158_proportions_ztest_cmp.python b/tests/queries/0_stateless/02158_proportions_ztest_cmp.python new file mode 100644 index 00000000000..d622004db28 --- /dev/null +++ b/tests/queries/0_stateless/02158_proportions_ztest_cmp.python @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +import os +import sys 
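+# Cross-check sketch: recompute the unpooled two-sample z-test for proportions
+# with scipy/numpy below and compare against ClickHouse's proportionsZTest,
+# asserting that the z-statistic, p-value, and confidence-interval bounds agree
+# within the tolerance used by test_and_check (precision=1e-2 by default).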
+from math import sqrt, nan +from random import randrange +from scipy import stats +import pandas as pd +import numpy as np + +CURDIR = os.path.dirname(os.path.realpath(__file__)) +sys.path.insert(0, os.path.join(CURDIR, 'helpers')) + +from pure_http_client import ClickHouseClient + + +# unpooled variance z-test for proportions of two samples +def twosample_proportion_ztest(s1, s2, t1, t2, alpha): + if s1 == 0 or s2 == 0 or s1 > t1 or s2 > t2 or t1 + t2 == 0: + return nan, nan, nan, nan + + p1 = s1 / t1 + p2 = s2 / t2 + se = sqrt(p1 * (1 - p1) / t1 + p2 * (1 - p2) / t2) + if se == 0: + return nan, nan, nan, nan + z_stat = (p1 - p2) / se + + one_side = 1 - stats.norm.cdf(abs(z_stat)) + p_value = one_side * 2 + + z = stats.norm.ppf(1 - 0.5 * alpha) + ci_lower = (p1 - p2) - z * se + ci_upper = (p1 - p2) + z * se + + return z_stat, p_value, ci_lower, ci_upper + + +def test_and_check(name, z_stat, p_value, ci_lower, ci_upper, precision=1e-2): + client = ClickHouseClient() + real = client.query_return_df( + "SELECT roundBankers({}.1, 16) as z_stat, ".format(name) + + "roundBankers({}.2, 16) as p_value, ".format(name) + + "roundBankers({}.3, 16) as ci_lower, ".format(name) + + "roundBankers({}.4, 16) as ci_upper ".format(name) + + "FORMAT TabSeparatedWithNames;") + real_z_stat = real['z_stat'][0] + real_p_value = real['p_value'][0] + real_ci_lower = real['ci_lower'][0] + real_ci_upper = real['ci_upper'][0] + assert((np.isnan(real_z_stat) and np.isnan(z_stat)) or abs(real_z_stat - np.float64(z_stat)) < precision), "clickhouse_z_stat {}, py_z_stat {}".format(real_z_stat, z_stat) + assert((np.isnan(real_p_value) and np.isnan(p_value)) or abs(real_p_value - np.float64(p_value)) < precision), "clickhouse_p_value {}, py_p_value {}".format(real_p_value, p_value) + assert((np.isnan(real_ci_lower) and np.isnan(ci_lower)) or abs(real_ci_lower - np.float64(ci_lower)) < precision), "clickhouse_ci_lower {}, py_ci_lower {}".format(real_ci_lower, ci_lower) + assert((np.isnan(real_ci_upper) and np.isnan(ci_upper)) or abs(real_ci_upper - np.float64(ci_upper)) < precision), "clickhouse_ci_upper {}, py_ci_upper {}".format(real_ci_upper, ci_upper) + + +def test_mean_ztest(): + counts = [0, 0] + nobs = [0, 0] + z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) + test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) + z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(10, 10, 10, 10, 0.05) + + counts = [10, 10] + nobs = [10, 10] + z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) + test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) + z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(10, 10, 10, 10, 0.05) + + counts = [16, 16] + nobs = [16, 18] + z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) + test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) + + counts = [10, 20] + nobs = [30, 40] + z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) + test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, 
ci_lower, ci_upper) + + counts = [20, 10] + nobs = [40, 30] + z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) + test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) + + counts = [randrange(10,20), randrange(10,20)] + nobs = [randrange(counts[0] + 1, counts[0] * 2), randrange(counts[1], counts[1] * 2)] + z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) + test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) + + counts = [randrange(1,100), randrange(1,200)] + nobs = [randrange(counts[0], counts[0] * 2), randrange(counts[1], counts[1] * 3)] + z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) + test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) + + counts = [randrange(1,200), randrange(1,100)] + nobs = [randrange(counts[0], counts[0] * 3), randrange(counts[1], counts[1] * 2)] + z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) + test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) + + counts = [randrange(1,1000), randrange(1,1000)] + nobs = [randrange(counts[0], counts[0] * 2), randrange(counts[1], counts[1] * 2)] + z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) + test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) + + +if __name__ == "__main__": + test_mean_ztest() + print("Ok.") + diff --git a/tests/queries/0_stateless/02158_proportions_ztest_cmp.reference b/tests/queries/0_stateless/02158_proportions_ztest_cmp.reference new file mode 100644 index 00000000000..587579af915 --- /dev/null +++ b/tests/queries/0_stateless/02158_proportions_ztest_cmp.reference @@ -0,0 +1 @@ +Ok. diff --git a/tests/queries/0_stateless/02158_proportions_ztest_cmp.sh b/tests/queries/0_stateless/02158_proportions_ztest_cmp.sh new file mode 100755 index 00000000000..64eeb513958 --- /dev/null +++ b/tests/queries/0_stateless/02158_proportions_ztest_cmp.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +# We should have correct env vars from shell_config.sh to run this test + +python3 "$CURDIR"/02158_proportions_ztest_cmp.python diff --git a/tests/queries/0_stateless/02177_issue_31009_pt2.reference b/tests/queries/0_stateless/02177_issue_31009_pt2.reference new file mode 100644 index 00000000000..3c644f22b9b --- /dev/null +++ b/tests/queries/0_stateless/02177_issue_31009_pt2.reference @@ -0,0 +1,28 @@ +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/02177_issue_31009_pt2.sql.j2 b/tests/queries/0_stateless/02177_issue_31009_pt2.sql.j2 new file mode 100644 index 00000000000..79799596bdf --- /dev/null +++ b/tests/queries/0_stateless/02177_issue_31009_pt2.sql.j2 @@ -0,0 +1,37 @@ +-- Tags: long + +DROP TABLE IF EXISTS left; +DROP TABLE IF EXISTS right; + +SET join_algorithm = 'partial_merge'; + +{% for block_size in [10, 11, 128, 129, 65505, 65506, 70000] -%} +{% for join_block_size in range(block_size - 2, block_size + 2) -%} + +CREATE OR REPLACE TABLE left ( key UInt32, value String ) ENGINE = Memory; +CREATE OR REPLACE TABLE right ( key UInt32, value String ) ENGINE = Memory; + +INSERT INTO left SELECT number, toString(number) FROM numbers({{ block_size * 2 + 1 }}); +INSERT INTO right SELECT number, toString(number) FROM numbers({{ block_size * 2 + 5 }}); + +SET max_joined_block_size_rows = {{ join_block_size }}; +SET max_block_size = {{ block_size }}; + +SELECT key, count(1) AS cnt +FROM ( + SELECT * + FROM ( SELECT key FROM left AS s ) AS data + ALL LEFT JOIN ( SELECT key FROM right GROUP BY key ) AS promo ON promo.key = data.key +) GROUP BY key HAVING count(1) > 1 +; + +SELECT count() == (SELECT count() from left) AND min(key == promo.key) == 1 +FROM ( SELECT key FROM left AS s ) AS data +ALL LEFT JOIN ( SELECT key FROM right GROUP BY key ) AS promo ON promo.key = data.key +; + +{% endfor -%} +{% endfor -%} + +DROP TABLE IF EXISTS left; +DROP TABLE IF EXISTS right; diff --git a/tests/queries/0_stateless/02232_allow_only_replicated_engine.reference b/tests/queries/0_stateless/02232_allow_only_replicated_engine.reference new file mode 100644 index 00000000000..9b45eb31b7e --- /dev/null +++ b/tests/queries/0_stateless/02232_allow_only_replicated_engine.reference @@ -0,0 +1,3 @@ +Only table with Replicated engine +Only table with Replicated engine +Only table with Replicated engine diff --git a/tests/queries/0_stateless/02232_allow_only_replicated_engine.sh b/tests/queries/0_stateless/02232_allow_only_replicated_engine.sh new file mode 100755 index 00000000000..d04c8a98df4 --- /dev/null +++ b/tests/queries/0_stateless/02232_allow_only_replicated_engine.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# Tags: replica + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -q "create table mute_stylecheck (x UInt32) engine = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/root', '1') order by x" + +${CLICKHOUSE_CLIENT} -q "CREATE USER user_${CLICKHOUSE_DATABASE} settings database_replicated_allow_only_replicated_engine=1" +${CLICKHOUSE_CLIENT} -q "GRANT CREATE TABLE ON ${CLICKHOUSE_DATABASE}_db.* TO user_${CLICKHOUSE_DATABASE}" +${CLICKHOUSE_CLIENT} --allow_experimental_database_replicated=1 --query "CREATE DATABASE ${CLICKHOUSE_DATABASE}_db engine = Replicated('/clickhouse/databases/${CLICKHOUSE_TEST_ZOOKEEPER_PREFIX}/${CLICKHOUSE_DATABASE}_db', '{shard}', '{replica}')" +${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none --user "user_${CLICKHOUSE_DATABASE}" --query "CREATE TABLE ${CLICKHOUSE_DATABASE}_db.tab_memory (x UInt32) engine = Memory;" +${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none --user "user_${CLICKHOUSE_DATABASE}" -n --query "set distributed_ddl_entry_format_version=2; CREATE TABLE ${CLICKHOUSE_DATABASE}_db.tab_mt (x UInt32) engine = MergeTree order by x;" 2>&1 | grep -o "Only table with Replicated engine" +${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none -n --query "set distributed_ddl_entry_format_version=2; CREATE TABLE ${CLICKHOUSE_DATABASE}_db.tab_mt (x UInt32) engine = MergeTree order by x;" +${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none --user "user_${CLICKHOUSE_DATABASE}" -n --query "set distributed_ddl_entry_format_version=2; CREATE TABLE ${CLICKHOUSE_DATABASE}_db.tab_rmt (x UInt32) engine = ReplicatedMergeTree order by x;" +${CLICKHOUSE_CLIENT} --query "DROP DATABASE ${CLICKHOUSE_DATABASE}_db" +${CLICKHOUSE_CLIENT} -q "DROP USER user_${CLICKHOUSE_DATABASE}" diff --git a/tests/queries/0_stateless/02234_cast_to_ip_address.reference b/tests/queries/0_stateless/02234_cast_to_ip_address.reference new file mode 100644 index 00000000000..3a4c40a07cf --- /dev/null +++ b/tests/queries/0_stateless/02234_cast_to_ip_address.reference @@ -0,0 +1,45 @@ +IPv4 functions +0 +\N +2130706433 +2130706433 +2130706433 +-- +0.0.0.0 +\N +127.0.0.1 +127.0.0.1 +127.0.0.1 +-- +127.0.0.1 +-- +0 +0.0.0.0 +0 +0.0.0.0 +0.0.0.0 +0.0.0.0 +IPv6 functions +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0 +\N +\0\0\0\0\0\0\0\0\0\0ÿÿ\0\0 +\0\0\0\0\0\0\0\0\0\0ÿÿ\0\0 +\0\0\0\0\0\0\0\0\0\0ÿÿ\0\0 +-- +:: +\N +::ffff:127.0.0.1 +::ffff:127.0.0.1 +::ffff:127.0.0.1 +-- +::ffff:127.0.0.1 +-- +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0 +:: +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0 +:: +:: +:: +-- +::ffff:127.0.0.1 ::ffff:127.0.0.1 ::ffff:127.0.0.1 +::1\0\0 ::1 ::1 diff --git a/tests/queries/0_stateless/02234_cast_to_ip_address.sql b/tests/queries/0_stateless/02234_cast_to_ip_address.sql new file mode 100644 index 00000000000..d436c93b9db --- /dev/null +++ b/tests/queries/0_stateless/02234_cast_to_ip_address.sql @@ -0,0 +1,82 @@ +SELECT 'IPv4 functions'; + +SELECT IPv4StringToNum('test'); --{serverError 441} +SELECT IPv4StringToNumOrDefault('test'); +SELECT IPv4StringToNumOrNull('test'); + +SELECT IPv4StringToNum('127.0.0.1'); +SELECT IPv4StringToNumOrDefault('127.0.0.1'); +SELECT IPv4StringToNumOrNull('127.0.0.1'); + +SELECT '--'; + +SELECT toIPv4('test'); --{serverError 441} +SELECT toIPv4OrDefault('test'); +SELECT toIPv4OrNull('test'); + +SELECT toIPv4('127.0.0.1'); +SELECT toIPv4OrDefault('127.0.0.1'); +SELECT toIPv4OrNull('127.0.0.1'); + +SELECT '--'; + +SELECT cast('test' , 'IPv4'); --{serverError 441} +SELECT cast('127.0.0.1' , 'IPv4'); + +SELECT '--'; + +SET 
cast_ipv4_ipv6_default_on_conversion_error = 1; + +SELECT IPv4StringToNum('test'); +SELECT toIPv4('test'); +SELECT IPv4StringToNum(''); +SELECT toIPv4(''); +SELECT cast('test' , 'IPv4'); +SELECT cast('' , 'IPv4'); + +SET cast_ipv4_ipv6_default_on_conversion_error = 0; + +SELECT 'IPv6 functions'; + +SELECT IPv6StringToNum('test'); --{serverError 441} +SELECT IPv6StringToNumOrDefault('test'); +SELECT IPv6StringToNumOrNull('test'); + +SELECT IPv6StringToNum('::ffff:127.0.0.1'); +SELECT IPv6StringToNumOrDefault('::ffff:127.0.0.1'); +SELECT IPv6StringToNumOrNull('::ffff:127.0.0.1'); + +SELECT '--'; + +SELECT toIPv6('test'); --{serverError 441} +SELECT toIPv6OrDefault('test'); +SELECT toIPv6OrNull('test'); + +SELECT toIPv6('::ffff:127.0.0.1'); +SELECT toIPv6OrDefault('::ffff:127.0.0.1'); +SELECT toIPv6OrNull('::ffff:127.0.0.1'); + +SELECT '--'; + +SELECT cast('test' , 'IPv6'); --{serverError 441} +SELECT cast('::ffff:127.0.0.1', 'IPv6'); + +SELECT '--'; + +SET cast_ipv4_ipv6_default_on_conversion_error = 1; + +SELECT IPv6StringToNum('test'); +SELECT toIPv6('test'); +SELECT IPv6StringToNum(''); +SELECT toIPv6(''); +SELECT cast('test' , 'IPv6'); +SELECT cast('' , 'IPv6'); + +SELECT '--'; + +SET cast_ipv4_ipv6_default_on_conversion_error = 0; + +SELECT toFixedString('::ffff:127.0.0.1', 16) as value, cast(value, 'IPv6'), toIPv6(value); +SELECT toFixedString('::1', 5) as value, cast(value, 'IPv6'), toIPv6(value); +SELECT toFixedString('', 16) as value, cast(value, 'IPv6'); --{serverError 441} +SELECT toFixedString('', 16) as value, toIPv6(value); --{serverError 441} diff --git a/tests/queries/0_stateless/02236_json_each_row_empty_map_schema_inference.reference b/tests/queries/0_stateless/02236_json_each_row_empty_map_schema_inference.reference new file mode 100644 index 00000000000..864f98fd923 --- /dev/null +++ b/tests/queries/0_stateless/02236_json_each_row_empty_map_schema_inference.reference @@ -0,0 +1,2 @@ +{} +{'b':1} diff --git a/tests/queries/0_stateless/02236_json_each_row_empty_map_schema_inference.sql b/tests/queries/0_stateless/02236_json_each_row_empty_map_schema_inference.sql new file mode 100644 index 00000000000..8241c8ead37 --- /dev/null +++ b/tests/queries/0_stateless/02236_json_each_row_empty_map_schema_inference.sql @@ -0,0 +1,3 @@ +-- Tags: no-fasttest + +select * from format(JSONEachRow, '{"a" : {}}, {"a" : {"b" : 1}}') diff --git a/tests/queries/0_stateless/02239_client_host_help.reference b/tests/queries/0_stateless/02239_client_host_help.reference new file mode 100644 index 00000000000..2c94e483710 --- /dev/null +++ b/tests/queries/0_stateless/02239_client_host_help.reference @@ -0,0 +1,2 @@ +OK +OK diff --git a/tests/queries/0_stateless/02239_client_host_help.sh b/tests/queries/0_stateless/02239_client_host_help.sh new file mode 100755 index 00000000000..7b31b9d4dd7 --- /dev/null +++ b/tests/queries/0_stateless/02239_client_host_help.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} --help | grep -q "\-\-host" && echo "OK" || echo "FAIL" +${CLICKHOUSE_CLIENT} --help | grep -q "\-\-port arg" && echo "OK" || echo "FAIL" + diff --git a/tests/queries/0_stateless/data_json/btc_transactions.json b/tests/queries/0_stateless/data_json/btc_transactions.json new file mode 100644 index 00000000000..136f8ea29c1 Binary files /dev/null and b/tests/queries/0_stateless/data_json/btc_transactions.json differ diff --git a/tests/queries/0_stateless/data_json/ghdata_sample.json b/tests/queries/0_stateless/data_json/ghdata_sample.json new file mode 100644 index 00000000000..985b4f135b8 Binary files /dev/null and b/tests/queries/0_stateless/data_json/ghdata_sample.json differ diff --git a/tests/queries/0_stateless/data_json/nbagames_sample.json b/tests/queries/0_stateless/data_json/nbagames_sample.json new file mode 100644 index 00000000000..5082ca059b3 Binary files /dev/null and b/tests/queries/0_stateless/data_json/nbagames_sample.json differ