Merge branch 'master' into parallel-downloading-url-engine

This commit is contained in:
Antonio Andelic 2022-03-17 10:11:58 +00:00
commit 103a3fa140
383 changed files with 8495 additions and 1695 deletions

1
.gitattributes vendored
View File

@ -1,2 +1,3 @@
contrib/* linguist-vendored
*.h linguist-language=C++
tests/queries/0_stateless/data_json/* binary

View File

@ -60,5 +60,5 @@ clientPort=2181 \n\
maxClientCnxns=80' > /opt/zookeeper/conf/zoo.cfg
RUN mkdir /zookeeper && chmod -R 777 /zookeeper
ENV TZ=Europe/Moscow
ENV TZ=Etc/UTC
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

View File

@ -40,7 +40,7 @@ RUN apt-get update \
/tmp/* \
&& apt-get clean
ENV TZ=Europe/Moscow
ENV TZ=Etc/UTC
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
ENV DOCKER_CHANNEL stable

View File

@ -13,10 +13,18 @@ Alias: `INET_NTOA`.
## IPv4StringToNum(s) {#ipv4stringtonums}
The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it returns 0.
The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it throws an exception.
Alias: `INET_ATON`.
## IPv4StringToNumOrDefault(s) {#ipv4stringtonums}
Same as `IPv4StringToNum`, but if the IPv4 address has an invalid format, it returns 0.
## IPv4StringToNumOrNull(s) {#ipv4stringtonums}
Same as `IPv4StringToNum`, but if the IPv4 address has an invalid format, it returns null.
## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum}
Similar to IPv4NumToString, but using xxx instead of the last octet.
@ -123,7 +131,7 @@ LIMIT 10
## IPv6StringToNum {#ipv6stringtonums}
The reverse function of [IPv6NumToString](#ipv6numtostringx). If the IPv6 address has an invalid format, it returns a string of null bytes.
The reverse function of [IPv6NumToString](#ipv6numtostringx). If the IPv6 address has an invalid format, it throws an exception.
If the input string contains a valid IPv4 address, returns its IPv6 equivalent.
HEX can be uppercase or lowercase.
@ -168,6 +176,14 @@ Result:
- [cutIPv6](#cutipv6x-bytestocutforipv6-bytestocutforipv4).
## IPv6StringToNumOrDefault(s) {#ipv6stringtonums}
Same as `IPv6StringToNum`, but if the IPv6 address has an invalid format, it returns 0.
## IPv6StringToNumOrNull(s) {#ipv6stringtonums}
Same as `IPv6StringToNum`, but if the IPv6 address has an invalid format, it returns null.
## IPv4ToIPv6(x) {#ipv4toipv6x}
Takes a `UInt32` number. Interprets it as an IPv4 address in [big endian](https://en.wikipedia.org/wiki/Endianness). Returns a `FixedString(16)` value containing the IPv6 address in binary format. Examples:
@ -261,6 +277,14 @@ SELECT
└───────────────────────────────────┴──────────────────────────┘
```
## toIPv4OrDefault(string) {#toipv4ordefaultstring}
Same as `toIPv4`, but if the IPv4 address has an invalid format, it returns 0.
## toIPv4OrNull(string) {#toipv4ornullstring}
Same as `toIPv4`, but if the IPv4 address has an invalid format, it returns null.
## toIPv6 {#toipv6string}
Converts a string form of IPv6 address to [IPv6](../../sql-reference/data-types/domains/ipv6.md) type. If the IPv6 address has an invalid format, returns an empty value.
@ -317,6 +341,14 @@ Result:
└─────────────────────┘
```
## toIPv6OrDefault(s) {#toipv6ordefaultstring}
Same as `toIPv6`, but if the IPv6 address has an invalid format, it returns 0.
## toIPv6OrNull(s) {#toipv6ornullstring}
Same as `toIPv6`, but if the IPv6 address has an invalid format, it returns null.
## isIPv4String {#isipv4string}
Determines whether the input string is an IPv4 address or not. If `string` is IPv6 address returns `0`.

View File

@ -0,0 +1,48 @@
---
toc_priority: 69
toc_title: Statistics
---
# Functions for Working with Statistics {#functions-for-working-with-statistics}
# proportionsZTest {#proportionsztest}
Applies proportion z-test to samples from two populations (X and Y). The alternative is 'two-sided'.
**Syntax**
``` sql
proportionsZTest(successes_x, successes_y, trials_x, trials_y, significance_level, usevar)
```
**Arguments**
- `successes_x` — The number of successes for X in trials.
- `successes_y` — The number of successes for Y in trials.
- `trials_x` — The number of trials for X.
- `trials_y` — The number of trials for Y.
- `significance_level` — Significance level used when computing the confidence interval.
- `usevar` - It can be `'pooled'` or `'unpooled'`.
- `'pooled'` - The variance of the two populations are assumed to be equal.
- `'unpooled'` - The assumption of equal variances is dropped.
**Returned value**
- A tuple with the (z-statistic, p-value, confidence-interval-lower, confidence-interval-upper).
Type: [Tuple](../../sql-reference/data-types/tuple.md).
**Example**
Query:
``` sql
SELECT proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled');
```
Result:
``` text
(-0.20656724435948853,0.8363478437079654,-0.09345975390115283,0.07563797172293502)
```

View File

@ -1,4 +1,4 @@
Babel==2.8.0
Babel==2.9.1
backports-abc==0.5
backports.functools-lru-cache==1.6.1
beautifulsoup4==4.9.1
@ -10,22 +10,22 @@ cssmin==0.2.0
future==0.18.2
htmlmin==0.1.12
idna==2.10
Jinja2>=2.11.3
Jinja2>=3.0.3
jinja2-highlight==0.6.1
jsmin==3.0.0
livereload==2.6.2
livereload==2.6.3
Markdown==3.3.2
MarkupSafe==1.1.1
MarkupSafe==2.1.0
mkdocs==1.1.2
mkdocs-htmlproofer-plugin==0.0.3
mkdocs-macros-plugin==0.4.20
nltk==3.5
nltk==3.7
nose==1.3.7
protobuf==3.14.0
numpy==1.21.2
pymdown-extensions==8.0
python-slugify==4.0.1
PyYAML==5.4.1
PyYAML==6.0
repackage==0.7.3
requests==2.25.1
singledispatch==3.4.0.3
@ -34,5 +34,6 @@ soupsieve==2.0.1
termcolor==1.1.0
tornado==6.1
Unidecode==1.1.1
urllib3>=1.26.5
Pygments>=2.7.4
urllib3>=1.26.8
Pygments>=2.11.2

View File

@ -787,6 +787,7 @@ void Client::printHelpMessage(const OptionsDescription & options_description)
{
std::cout << options_description.main_description.value() << "\n";
std::cout << options_description.external_description.value() << "\n";
std::cout << options_description.hosts_and_ports_description.value() << "\n";
std::cout << "In addition, --param_name=value can be specified for substitution of parameters for parametrized queries.\n";
}

View File

@ -304,8 +304,8 @@ void LocalServer::setupUsers()
ConfigurationPtr users_config;
auto & access_control = global_context->getAccessControl();
access_control.setPlaintextPasswordSetting(config().getBool("allow_plaintext_password", true));
access_control.setNoPasswordSetting(config().getBool("allow_no_password", true));
access_control.setNoPasswordAllowed(config().getBool("allow_no_password", true));
access_control.setPlaintextPasswordAllowed(config().getBool("allow_plaintext_password", true));
if (config().has("users_config") || config().has("config-file") || fs::exists("config.xml"))
{
const auto users_config_path = config().getString("users_config", config().getString("config-file", "config.xml"));

View File

@ -1074,9 +1074,10 @@ if (ThreadFuzzer::instance().isEffective())
auto & access_control = global_context->getAccessControl();
if (config().has("custom_settings_prefixes"))
access_control.setCustomSettingsPrefixes(config().getString("custom_settings_prefixes"));
///set the allow_plaintext_and_no_password setting in context.
access_control.setPlaintextPasswordSetting(config().getBool("allow_plaintext_password", true));
access_control.setNoPasswordSetting(config().getBool("allow_no_password", true));
access_control.setNoPasswordAllowed(config().getBool("allow_no_password", true));
access_control.setPlaintextPasswordAllowed(config().getBool("allow_plaintext_password", true));
/// Initialize access storages.
try
{

View File

@ -243,7 +243,7 @@
openssl dhparam -out /etc/clickhouse-server/dhparam.pem 4096
Only file format with BEGIN DH PARAMETERS is supported.
-->
<!-- <dhParamsFile>/etc/clickhouse-server/dhparam.pem</dhParamsFile>-->
<!-- <dhParamsFile>/etc/clickhouse-server/dhparam.pem</dhParamsFile>-->
<verificationMode>none</verificationMode>
<loadDefaultCAFile>true</loadDefaultCAFile>
<cacheSessions>true</cacheSessions>
@ -368,7 +368,7 @@
<!-- Path to temporary data for processing hard queries. -->
<tmp_path>/var/lib/clickhouse/tmp/</tmp_path>
<!-- Disable AuthType Plaintext_password and No_password for ACL. -->
<!-- Disable AuthType plaintext_password and no_password for ACL. -->
<!-- <allow_plaintext_password>0</allow_plaintext_password> -->
<!-- <allow_no_password>0</allow_no_password> -->`

View File

@ -173,7 +173,8 @@ void AccessControl::addUsersConfigStorage(const String & storage_name_, const Po
auto check_setting_name_function = [this](const std::string_view & setting_name) { checkSettingNameIsAllowed(setting_name); };
auto is_no_password_allowed_function = [this]() -> bool { return isNoPasswordAllowed(); };
auto is_plaintext_password_allowed_function = [this]() -> bool { return isPlaintextPasswordAllowed(); };
auto new_storage = std::make_shared<UsersConfigAccessStorage>(storage_name_, check_setting_name_function,is_no_password_allowed_function,is_plaintext_password_allowed_function);
auto new_storage = std::make_shared<UsersConfigAccessStorage>(storage_name_, check_setting_name_function,
is_no_password_allowed_function, is_plaintext_password_allowed_function);
new_storage->setConfig(users_config_);
addStorage(new_storage);
LOG_DEBUG(getLogger(), "Added {} access storage '{}', path: {}",
@ -209,7 +210,8 @@ void AccessControl::addUsersConfigStorage(
auto check_setting_name_function = [this](const std::string_view & setting_name) { checkSettingNameIsAllowed(setting_name); };
auto is_no_password_allowed_function = [this]() -> bool { return isNoPasswordAllowed(); };
auto is_plaintext_password_allowed_function = [this]() -> bool { return isPlaintextPasswordAllowed(); };
auto new_storage = std::make_shared<UsersConfigAccessStorage>(storage_name_, check_setting_name_function,is_no_password_allowed_function,is_plaintext_password_allowed_function);
auto new_storage = std::make_shared<UsersConfigAccessStorage>(storage_name_, check_setting_name_function,
is_no_password_allowed_function, is_plaintext_password_allowed_function);
new_storage->load(users_config_path_, include_from_path_, preprocessed_dir_, get_zookeeper_function_);
addStorage(new_storage);
LOG_DEBUG(getLogger(), "Added {} access storage '{}', path: {}", String(new_storage->getStorageType()), new_storage->getStorageName(), new_storage->getPath());
@ -411,7 +413,8 @@ UUID AccessControl::authenticate(const Credentials & credentials, const Poco::Ne
{
try
{
return MultipleAccessStorage::authenticate(credentials, address, *external_authenticators,allow_no_password, allow_plaintext_password);
return MultipleAccessStorage::authenticate(credentials, address, *external_authenticators, allow_no_password,
allow_plaintext_password);
}
catch (...)
{
@ -447,26 +450,38 @@ void AccessControl::setCustomSettingsPrefixes(const String & comma_separated_pre
setCustomSettingsPrefixes(prefixes);
}
void AccessControl::setPlaintextPasswordSetting(bool allow_plaintext_password_)
{
allow_plaintext_password = allow_plaintext_password_;
}
void AccessControl::setNoPasswordSetting(bool allow_no_password_)
{
allow_no_password = allow_no_password_;
}
bool AccessControl::isSettingNameAllowed(const std::string_view & setting_name) const
bool AccessControl::isSettingNameAllowed(const std::string_view setting_name) const
{
return custom_settings_prefixes->isSettingNameAllowed(setting_name);
}
void AccessControl::checkSettingNameIsAllowed(const std::string_view & setting_name) const
void AccessControl::checkSettingNameIsAllowed(const std::string_view setting_name) const
{
custom_settings_prefixes->checkSettingNameIsAllowed(setting_name);
}
void AccessControl::setNoPasswordAllowed(bool allow_no_password_)
{
allow_no_password = allow_no_password_;
}
bool AccessControl::isNoPasswordAllowed() const
{
return allow_no_password;
}
void AccessControl::setPlaintextPasswordAllowed(bool allow_plaintext_password_)
{
allow_plaintext_password = allow_plaintext_password_;
}
bool AccessControl::isPlaintextPasswordAllowed() const
{
return allow_plaintext_password;
}
std::shared_ptr<const ContextAccess> AccessControl::getContextAccess(
const UUID & user_id,
const std::vector<UUID> & current_roles,
@ -550,15 +565,6 @@ std::vector<QuotaUsage> AccessControl::getAllQuotasUsage() const
return quota_cache->getAllQuotasUsage();
}
bool AccessControl::isPlaintextPasswordAllowed() const
{
return allow_plaintext_password;
}
bool AccessControl::isNoPasswordAllowed() const
{
return allow_no_password;
}
std::shared_ptr<const EnabledSettings> AccessControl::getEnabledSettings(
const UUID & user_id,

View File

@ -49,8 +49,6 @@ class AccessControl : public MultipleAccessStorage
public:
AccessControl();
~AccessControl() override;
std::atomic_bool allow_plaintext_password;
std::atomic_bool allow_no_password;
/// Parses access entities from a configuration loaded from users.xml.
/// This function add UsersConfigAccessStorage if it wasn't added before.
@ -113,12 +111,16 @@ public:
/// This function also enables custom prefixes to be used.
void setCustomSettingsPrefixes(const Strings & prefixes);
void setCustomSettingsPrefixes(const String & comma_separated_prefixes);
bool isSettingNameAllowed(const std::string_view & name) const;
void checkSettingNameIsAllowed(const std::string_view & name) const;
bool isSettingNameAllowed(const std::string_view name) const;
void checkSettingNameIsAllowed(const std::string_view name) const;
//sets allow_plaintext_password and allow_no_password setting
void setPlaintextPasswordSetting(const bool allow_plaintext_password_);
void setNoPasswordSetting(const bool allow_no_password_);
/// Allows users without password (by default it's allowed).
void setNoPasswordAllowed(const bool allow_no_password_);
bool isNoPasswordAllowed() const;
/// Allows users with plaintext password (by default it's allowed).
void setPlaintextPasswordAllowed(const bool allow_plaintext_password_);
bool isPlaintextPasswordAllowed() const;
UUID authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address) const;
void setExternalAuthenticatorsConfig(const Poco::Util::AbstractConfiguration & config);
@ -153,9 +155,6 @@ public:
std::vector<QuotaUsage> getAllQuotasUsage() const;
bool isPlaintextPasswordAllowed() const;
bool isNoPasswordAllowed() const;
std::shared_ptr<const EnabledSettings> getEnabledSettings(
const UUID & user_id,
const SettingsProfileElements & settings_from_user,
@ -177,6 +176,8 @@ private:
std::unique_ptr<SettingsProfilesCache> settings_profiles_cache;
std::unique_ptr<ExternalAuthenticators> external_authenticators;
std::unique_ptr<CustomSettingsPrefixes> custom_settings_prefixes;
std::atomic_bool allow_plaintext_password = true;
std::atomic_bool allow_no_password = true;
};
}

View File

@ -120,7 +120,7 @@ AccessEntityPtr deserializeAccessEntityImpl(const String & definition)
if (res)
throw Exception("Two access entities attached in the same file", ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
res = user = std::make_unique<User>();
InterpreterCreateUserQuery::updateUserFromQuery(*user, *create_user_query);
InterpreterCreateUserQuery::updateUserFromQuery(*user, *create_user_query, /* allow_no_password = */ true, /* allow_plaintext_password = */ true);
}
else if (auto * create_role_query = query->as<ASTCreateRoleQuery>())
{

View File

@ -441,7 +441,9 @@ void IAccessStorage::notify(const Notifications & notifications)
UUID IAccessStorage::authenticate(
const Credentials & credentials,
const Poco::Net::IPAddress & address,
const ExternalAuthenticators & external_authenticators, bool allow_no_password, bool allow_plaintext_password) const
const ExternalAuthenticators & external_authenticators,
bool allow_no_password,
bool allow_plaintext_password) const
{
return *authenticateImpl(credentials, address, external_authenticators, /* throw_if_user_not_exists = */ true, allow_no_password, allow_plaintext_password);
}
@ -451,7 +453,9 @@ std::optional<UUID> IAccessStorage::authenticate(
const Credentials & credentials,
const Poco::Net::IPAddress & address,
const ExternalAuthenticators & external_authenticators,
bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const
bool throw_if_user_not_exists,
bool allow_no_password,
bool allow_plaintext_password) const
{
return authenticateImpl(credentials, address, external_authenticators, throw_if_user_not_exists, allow_no_password, allow_plaintext_password);
}
@ -461,7 +465,9 @@ std::optional<UUID> IAccessStorage::authenticateImpl(
const Credentials & credentials,
const Poco::Net::IPAddress & address,
const ExternalAuthenticators & external_authenticators,
bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const
bool throw_if_user_not_exists,
bool allow_no_password,
bool allow_plaintext_password) const
{
if (auto id = find<User>(credentials.getUserName()))
{
@ -469,8 +475,11 @@ std::optional<UUID> IAccessStorage::authenticateImpl(
{
if (!isAddressAllowed(*user, address))
throwAddressNotAllowed(address);
if (isNoPasswordAllowed(*user, allow_no_password) || isPlaintextPasswordAllowed(*user, allow_plaintext_password))
throwPasswordTypeNotAllowed();
auto auth_type = user->auth_data.getType();
if (((auth_type == AuthenticationType::NO_PASSWORD) && !allow_no_password) ||
((auth_type == AuthenticationType::PLAINTEXT_PASSWORD) && !allow_plaintext_password))
throwAuthenticationTypeNotAllowed(auth_type);
if (!areCredentialsValid(*user, credentials, external_authenticators))
throwInvalidCredentials();
@ -506,15 +515,6 @@ bool IAccessStorage::isAddressAllowed(const User & user, const Poco::Net::IPAddr
return user.allowed_client_hosts.contains(address);
}
bool IAccessStorage::isPlaintextPasswordAllowed(const User & user, bool allow_plaintext_password)
{
return !allow_plaintext_password && user.auth_data.getType() == AuthenticationType::PLAINTEXT_PASSWORD;
}
bool IAccessStorage::isNoPasswordAllowed(const User & user, bool allow_no_password)
{
return !allow_no_password && user.auth_data.getType() == AuthenticationType::NO_PASSWORD;
}
UUID IAccessStorage::generateRandomID()
{
@ -610,11 +610,12 @@ void IAccessStorage::throwAddressNotAllowed(const Poco::Net::IPAddress & address
throw Exception("Connections from " + address.toString() + " are not allowed", ErrorCodes::IP_ADDRESS_NOT_ALLOWED);
}
void IAccessStorage::throwPasswordTypeNotAllowed()
void IAccessStorage::throwAuthenticationTypeNotAllowed(AuthenticationType auth_type)
{
throw Exception(
"Authentication denied for users configured with AuthType PLAINTEXT_PASSWORD and NO_PASSWORD. Please check with Clickhouse admin to allow allow PLAINTEXT_PASSWORD and NO_PASSWORD through server configuration ",
ErrorCodes::AUTHENTICATION_FAILED);
ErrorCodes::AUTHENTICATION_FAILED,
"Authentication type {} is not allowed, check the setting allow_{} in the server configuration",
toString(auth_type), AuthenticationTypeInfo::get(auth_type).name);
}
void IAccessStorage::throwInvalidCredentials()
{

View File

@ -18,6 +18,7 @@ namespace DB
struct User;
class Credentials;
class ExternalAuthenticators;
enum class AuthenticationType;
/// Contains entities, i.e. instances of classes derived from IAccessEntity.
/// The implementations of this class MUST be thread-safe.
@ -148,7 +149,7 @@ public:
/// Finds a user, check the provided credentials and returns the ID of the user if they are valid.
/// Throws an exception if no such user or credentials are invalid.
UUID authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool allow_no_password=true, bool allow_plaintext_password=true) const;
UUID authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool allow_no_password, bool allow_plaintext_password) const;
std::optional<UUID> authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const;
protected:
@ -164,8 +165,6 @@ protected:
virtual std::optional<UUID> authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const;
virtual bool areCredentialsValid(const User & user, const Credentials & credentials, const ExternalAuthenticators & external_authenticators) const;
virtual bool isAddressAllowed(const User & user, const Poco::Net::IPAddress & address) const;
static bool isPlaintextPasswordAllowed(const User & user, bool allow_plaintext_password) ;
static bool isNoPasswordAllowed(const User & user, bool allow_no_password);
static UUID generateRandomID();
Poco::Logger * getLogger() const;
static String formatEntityTypeWithName(AccessEntityType type, const String & name) { return AccessEntityTypeInfo::get(type).formatEntityNameWithType(name); }
@ -181,7 +180,7 @@ protected:
[[noreturn]] void throwReadonlyCannotRemove(AccessEntityType type, const String & name) const;
[[noreturn]] static void throwAddressNotAllowed(const Poco::Net::IPAddress & address);
[[noreturn]] static void throwInvalidCredentials();
[[noreturn]] static void throwPasswordTypeNotAllowed();
[[noreturn]] static void throwAuthenticationTypeNotAllowed(AuthenticationType auth_type);
using Notification = std::tuple<OnChangedHandler, UUID, AccessEntityPtr>;
using Notifications = std::vector<Notification>;
static void notify(const Notifications & notifications);

View File

@ -481,7 +481,9 @@ std::optional<UUID> LDAPAccessStorage::authenticateImpl(
const Credentials & credentials,
const Poco::Net::IPAddress & address,
const ExternalAuthenticators & external_authenticators,
bool throw_if_user_not_exists,bool allow_no_password __attribute__((unused)), bool allow_plaintext_password __attribute__((unused))) const
bool throw_if_user_not_exists,
bool /* allow_no_password */,
bool /* allow_plaintext_password */) const
{
std::scoped_lock lock(mutex);
auto id = memory_storage.find<User>(credentials.getUserName());

View File

@ -449,14 +449,20 @@ void MultipleAccessStorage::updateSubscriptionsToNestedStorages(std::unique_lock
}
std::optional<UUID> MultipleAccessStorage::authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists,bool allow_no_password, bool allow_plaintext_password) const
std::optional<UUID>
MultipleAccessStorage::authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address,
const ExternalAuthenticators & external_authenticators,
bool throw_if_user_not_exists,
bool allow_no_password, bool allow_plaintext_password) const
{
auto storages = getStoragesInternal();
for (size_t i = 0; i != storages->size(); ++i)
{
const auto & storage = (*storages)[i];
bool is_last_storage = (i == storages->size() - 1);
auto id = storage->authenticate(credentials, address, external_authenticators, (throw_if_user_not_exists && is_last_storage), allow_no_password, allow_plaintext_password);
auto id = storage->authenticate(credentials, address, external_authenticators,
(throw_if_user_not_exists && is_last_storage),
allow_no_password, allow_plaintext_password);
if (id)
{
std::lock_guard lock{mutex};

View File

@ -28,8 +28,6 @@ namespace ErrorCodes
extern const int BAD_ARGUMENTS;
extern const int UNKNOWN_ADDRESS_PATTERN_TYPE;
extern const int NOT_IMPLEMENTED;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
namespace
@ -50,7 +48,7 @@ namespace
UUID generateID(const IAccessEntity & entity) { return generateID(entity.getType(), entity.getName()); }
UserPtr parseUser(const Poco::Util::AbstractConfiguration & config, const String & user_name)
UserPtr parseUser(const Poco::Util::AbstractConfiguration & config, const String & user_name, bool allow_no_password, bool allow_plaintext_password)
{
auto user = std::make_shared<User>();
user->setName(user_name);
@ -130,6 +128,15 @@ namespace
user->auth_data.setSSLCertificateCommonNames(std::move(common_names));
}
auto auth_type = user->auth_data.getType();
if (((auth_type == AuthenticationType::NO_PASSWORD) && !allow_no_password) ||
((auth_type == AuthenticationType::PLAINTEXT_PASSWORD) && !allow_plaintext_password))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Authentication type {} is not allowed, check the setting allow_{} in the server configuration",
toString(auth_type), AuthenticationTypeInfo::get(auth_type).name);
}
const auto profile_name_config = user_config + ".profile";
if (config.has(profile_name_config))
{
@ -225,24 +232,18 @@ namespace
}
std::vector<AccessEntityPtr> parseUsers(const Poco::Util::AbstractConfiguration & config, Fn<bool()> auto && is_no_password_allowed_function, Fn<bool()> auto && is_plaintext_password_allowed_function)
std::vector<AccessEntityPtr> parseUsers(const Poco::Util::AbstractConfiguration & config, bool allow_no_password, bool allow_plaintext_password)
{
Poco::Util::AbstractConfiguration::Keys user_names;
config.keys("users", user_names);
std::vector<AccessEntityPtr> users;
users.reserve(user_names.size());
bool allow_plaintext_password = is_plaintext_password_allowed_function();
bool allow_no_password = is_no_password_allowed_function();
for (const auto & user_name : user_names)
{
try
{
String user_config = "users." + user_name;
if ((config.has(user_config + ".password") && !allow_plaintext_password) || (config.has(user_config + ".no_password") && !allow_no_password))
throw Exception("Incorrect User configuration. User is not allowed to configure PLAINTEXT_PASSWORD or NO_PASSWORD. Please configure User with authtype SHA256_PASSWORD_HASH, SHA256_PASSWORD, DOUBLE_SHA1_PASSWORD OR enable setting allow_plaintext_and_no_password in server configuration to configure user with plaintext and no password Auth_Type"
" Though it is not recommended to use plaintext_password and No_password for user authentication.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
users.push_back(parseUser(config, user_name));
users.push_back(parseUser(config, user_name, allow_no_password, allow_plaintext_password));
}
catch (Exception & e)
{
@ -562,8 +563,10 @@ void UsersConfigAccessStorage::parseFromConfig(const Poco::Util::AbstractConfigu
{
try
{
bool no_password_allowed = is_no_password_allowed_function();
bool plaintext_password_allowed = is_plaintext_password_allowed_function();
std::vector<std::pair<UUID, AccessEntityPtr>> all_entities;
for (const auto & entity : parseUsers(config,is_no_password_allowed_function, is_plaintext_password_allowed_function))
for (const auto & entity : parseUsers(config, no_password_allowed, plaintext_password_allowed))
all_entities.emplace_back(generateID(*entity), entity);
for (const auto & entity : parseQuotas(config))
all_entities.emplace_back(generateID(*entity), entity);

View File

@ -38,7 +38,8 @@ struct AggregateFunctionWithProperties
AggregateFunctionWithProperties(const AggregateFunctionWithProperties &) = default;
AggregateFunctionWithProperties & operator = (const AggregateFunctionWithProperties &) = default;
template <typename Creator, std::enable_if_t<!std::is_same_v<Creator, AggregateFunctionWithProperties>> * = nullptr>
template <typename Creator>
requires (!std::is_same_v<Creator, AggregateFunctionWithProperties>)
AggregateFunctionWithProperties(Creator creator_, AggregateFunctionProperties properties_ = {}) /// NOLINT
: creator(std::forward<Creator>(creator_)), properties(std::move(properties_))
{

View File

@ -569,6 +569,14 @@ if (ENABLE_TESTS)
clickhouse_common_zookeeper
string_utils)
if (TARGET ch_contrib::simdjson)
target_link_libraries(unit_tests_dbms PRIVATE ch_contrib::simdjson)
endif()
if(TARGET ch_contrib::rapidjson)
target_include_directories(unit_tests_dbms PRIVATE ch_contrib::rapidjson)
endif()
if (TARGET ch_contrib::yaml_cpp)
target_link_libraries(unit_tests_dbms PRIVATE ch_contrib::yaml_cpp)
endif()

View File

@ -1092,10 +1092,11 @@ void ClientBase::sendData(Block & sample, const ColumnsDescription & columns_des
try
{
auto metadata = storage->getInMemoryMetadataPtr();
sendDataFromPipe(
storage->read(
sample.getNames(),
storage->getInMemoryMetadataPtr(),
storage->getStorageSnapshot(metadata),
query_info,
global_context,
{},

View File

@ -297,7 +297,7 @@ ColumnPtr ColumnAggregateFunction::filter(const Filter & filter, ssize_t result_
{
size_t size = data.size();
if (size != filter.size())
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filter.size(), size);
if (size == 0)
return cloneEmpty();

View File

@ -608,7 +608,7 @@ ColumnPtr ColumnArray::filterString(const Filter & filt, ssize_t result_size_hin
{
size_t col_size = getOffsets().size();
if (col_size != filt.size())
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), col_size);
if (0 == col_size)
return ColumnArray::create(data);
@ -676,7 +676,7 @@ ColumnPtr ColumnArray::filterGeneric(const Filter & filt, ssize_t result_size_hi
{
size_t size = getOffsets().size();
if (size != filt.size())
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size);
if (size == 0)
return ColumnArray::create(data);
@ -1189,4 +1189,12 @@ void ColumnArray::gather(ColumnGathererStream & gatherer)
gatherer.gather(*this);
}
size_t ColumnArray::getNumberOfDimensions() const
{
const auto * nested_array = checkAndGetColumn<ColumnArray>(*data);
if (!nested_array)
return 1;
return 1 + nested_array->getNumberOfDimensions(); /// Every modern C++ compiler optimizes tail recursion.
}
}

View File

@ -169,6 +169,8 @@ public:
bool isCollationSupported() const override { return getData().isCollationSupported(); }
size_t getNumberOfDimensions() const;
private:
WrappedPtr data;
WrappedPtr offsets;

View File

@ -266,7 +266,7 @@ ColumnPtr ColumnDecimal<T>::filter(const IColumn::Filter & filt, ssize_t result_
{
size_t size = data.size();
if (size != filt.size())
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size);
auto res = this->create(0, scale);
Container & res_data = res->getData();

View File

@ -207,7 +207,7 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result
{
size_t col_size = size();
if (col_size != filt.size())
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), col_size);
auto res = ColumnFixedString::create(n);

View File

@ -144,15 +144,15 @@ public:
double getRatioOfDefaultRows(double sample_ratio) const override
{
return null_map->getRatioOfDefaultRows(sample_ratio);
return getRatioOfDefaultRowsImpl<ColumnNullable>(sample_ratio);
}
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
{
null_map->getIndicesOfNonDefaultRows(indices, from, limit);
getIndicesOfNonDefaultRowsImpl<ColumnNullable>(indices, from, limit);
}
ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override;
ColumnPtr createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override;
bool isNullable() const override { return true; }
bool isFixedAndContiguous() const override { return false; }

View File

@ -0,0 +1,780 @@
#include <Core/Field.h>
#include <Columns/ColumnObject.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnArray.h>
#include <DataTypes/ObjectUtils.h>
#include <DataTypes/getLeastSupertype.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/NestedUtils.h>
#include <Interpreters/castColumn.h>
#include <Interpreters/convertFieldToType.h>
#include <Common/HashTable/HashSet.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int ILLEGAL_COLUMN;
extern const int DUPLICATE_COLUMN;
extern const int NUMBER_OF_DIMENSIONS_MISMATHED;
extern const int NOT_IMPLEMENTED;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
}
namespace
{
/// Recreates column with default scalar values and keeps sizes of arrays.
ColumnPtr recreateColumnWithDefaultValues(
    const ColumnPtr & column, const DataTypePtr & scalar_type, size_t num_dimensions)
{
    /// While there are array dimensions left to peel off, keep the offsets
    /// (i.e. the array sizes) and recurse into the nested data.
    if (num_dimensions != 0)
    {
        if (const auto * column_array = checkAndGetColumn<ColumnArray>(column.get()))
            return ColumnArray::create(
                recreateColumnWithDefaultValues(column_array->getDataPtr(), scalar_type, num_dimensions - 1),
                IColumn::mutate(column_array->getOffsetsPtr()));
    }

    /// Scalar level (or a non-array column): produce a column of the requested
    /// type filled with default values, preserving the number of rows.
    return createArrayOfType(scalar_type, num_dimensions)->createColumn()->cloneResized(column->size());
}
/// Replaces NULL fields with a given replacement value or an empty array.
class FieldVisitorReplaceNull : public StaticVisitor<Field>
{
public:
    explicit FieldVisitorReplaceNull(
        const Field & replacement_, size_t num_dimensions_)
        : replacement(replacement_)
        , num_dimensions(num_dimensions_)
    {
    }

    /// A NULL scalar becomes the replacement; a NULL at array depth
    /// becomes an empty array of the remaining dimensionality.
    Field operator()(const Null &) const
    {
        if (num_dimensions)
            return createEmptyArrayField(num_dimensions);
        return replacement;
    }

    /// Descend into arrays, replacing NULLs element-wise one level deeper.
    Field operator()(const Array & x) const
    {
        assert(num_dimensions > 0);

        Array res;
        res.reserve(x.size());

        FieldVisitorReplaceNull nested_visitor(replacement, num_dimensions - 1);
        for (const auto & elem : x)
            res.push_back(applyVisitor(nested_visitor, elem));

        return res;
    }

    /// Any other scalar is kept as is.
    template <typename T>
    Field operator()(const T & x) const { return x; }

private:
    const Field & replacement;
    size_t num_dimensions;
};
/// Calculates number of dimensions in array field.
/// Returns 0 for scalar fields.
class FieldVisitorToNumberOfDimensions : public StaticVisitor<size_t>
{
public:
    /// An array contributes one dimension plus the (common) dimensionality
    /// of its elements. Throws if elements disagree on their dimensionality.
    size_t operator()(const Array & x) const
    {
        const size_t size = x.size();
        std::optional<size_t> dimensions;

        for (size_t i = 0; i < size; ++i)
        {
            /// Do not count Nulls, because they will be replaced by default
            /// values with proper number of dimensions.
            if (x[i].isNull())
                continue;

            size_t current_dimensions = applyVisitor(*this, x[i]);
            if (!dimensions)
                dimensions = current_dimensions;
            else if (current_dimensions != *dimensions)
                throw Exception(ErrorCodes::NUMBER_OF_DIMENSIONS_MISMATHED,
                    "Number of dimensions mismatched among array elements");
        }

        /// An empty array or an array of only NULLs still counts as one dimension.
        return 1 + dimensions.value_or(0);
    }

    /// Any non-array field is a scalar: zero dimensions.
    template <typename T>
    size_t operator()(const T &) const { return 0; }
};
/// Visitor that allows to get type of scalar field
/// or least common type of scalars in array.
/// More optimized version of FieldToDataType.
class FieldVisitorToScalarType : public StaticVisitor<>
{
public:
    using FieldType = Field::Types::Which;

    /// Arrays are traversed recursively; only leaf scalars contribute types.
    void operator()(const Array & x)
    {
        size_t size = x.size();
        for (size_t i = 0; i < size; ++i)
            applyVisitor(*this, x[i]);
    }

    /// Unsigned integers are narrowed to the smallest unsigned type
    /// that can hold the value; the Field type itself stays UInt64.
    void operator()(const UInt64 & x)
    {
        field_types.insert(FieldType::UInt64);
        if (x <= std::numeric_limits<UInt8>::max())
            type_indexes.insert(TypeIndex::UInt8);
        else if (x <= std::numeric_limits<UInt16>::max())
            type_indexes.insert(TypeIndex::UInt16);
        else if (x <= std::numeric_limits<UInt32>::max())
            type_indexes.insert(TypeIndex::UInt32);
        else
            type_indexes.insert(TypeIndex::UInt64);
    }

    /// Signed integers: same narrowing, checking both bounds.
    void operator()(const Int64 & x)
    {
        field_types.insert(FieldType::Int64);
        if (x <= std::numeric_limits<Int8>::max() && x >= std::numeric_limits<Int8>::min())
            type_indexes.insert(TypeIndex::Int8);
        else if (x <= std::numeric_limits<Int16>::max() && x >= std::numeric_limits<Int16>::min())
            type_indexes.insert(TypeIndex::Int16);
        else if (x <= std::numeric_limits<Int32>::max() && x >= std::numeric_limits<Int32>::min())
            type_indexes.insert(TypeIndex::Int32);
        else
            type_indexes.insert(TypeIndex::Int64);
    }

    /// NULLs do not contribute a type, they are only recorded as a flag.
    void operator()(const Null &)
    {
        have_nulls = true;
    }

    /// Every other scalar maps directly through its nearest Field type.
    template <typename T>
    void operator()(const T &)
    {
        field_types.insert(Field::TypeToEnum<NearestFieldType<T>>::value);
        type_indexes.insert(TypeToTypeIndex<NearestFieldType<T>>);
    }

    /// Least common supertype of all scalar types seen so far.
    DataTypePtr getScalarType() const { return getLeastSupertype(type_indexes, true); }
    bool haveNulls() const { return have_nulls; }
    /// True if scalars of more than one Field type were seen, so values
    /// must be converted to the common type before insertion.
    bool needConvertField() const { return field_types.size() > 1; }

private:
    TypeIndexSet type_indexes;
    std::unordered_set<FieldType> field_types;
    bool have_nulls = false;
};
}
/// Decomposes a field: least common scalar type, NULL presence,
/// whether conversion is needed, and number of array dimensions.
FieldInfo getFieldInfo(const Field & field)
{
    FieldVisitorToScalarType scalar_type_visitor;
    applyVisitor(scalar_type_visitor, field);

    FieldInfo info;
    info.scalar_type = scalar_type_visitor.getScalarType();
    info.have_nulls = scalar_type_visitor.haveNulls();
    info.need_convert = scalar_type_visitor.needConvertField();
    info.num_dimensions = applyVisitor(FieldVisitorToNumberOfDimensions(), field);
    return info;
}
/// Wraps an existing column as the single (typed) part of the subcolumn.
ColumnObject::Subcolumn::Subcolumn(MutableColumnPtr && data_, bool is_nullable_)
    : least_common_type(getDataTypeByColumn(*data_))
    , is_nullable(is_nullable_)
{
    data.push_back(std::move(data_));
}

/// Creates a subcolumn of `size_` rows with no type yet: all rows are
/// recorded as implicit defaults and materialized later in finalize().
ColumnObject::Subcolumn::Subcolumn(
    size_t size_, bool is_nullable_)
    : least_common_type(std::make_shared<DataTypeNothing>())
    , is_nullable(is_nullable_)
    , num_of_defaults_in_prefix(size_)
{
}
/// Number of rows: implicit defaults in the prefix plus rows in all parts.
size_t ColumnObject::Subcolumn::size() const
{
    size_t total = num_of_defaults_in_prefix;
    for (const auto & part : data)
        total += part->size();
    return total;
}

/// Sum of byte sizes of all column parts (the implicit prefix is free).
size_t ColumnObject::Subcolumn::byteSize() const
{
    size_t total = 0;
    for (const auto & part : data)
        total += part->byteSize();
    return total;
}

/// Sum of allocated bytes of all column parts.
size_t ColumnObject::Subcolumn::allocatedBytes() const
{
    size_t total = 0;
    for (const auto & part : data)
        total += part->allocatedBytes();
    return total;
}
/// Checks the invariant of @data: parts must be ordered from subtype to
/// supertype, i.e. the type of the i-th part must be a supertype of every
/// earlier part (its prefix's least common supertype equals itself).
void ColumnObject::Subcolumn::checkTypes() const
{
    DataTypes prefix_types;
    prefix_types.reserve(data.size());
    for (size_t i = 0; i < data.size(); ++i)
    {
        auto current_type = getDataTypeByColumn(*data[i]);
        prefix_types.push_back(current_type);
        auto prefix_common_type = getLeastSupertype(prefix_types);
        if (!prefix_common_type->equals(*current_type))
            throw Exception(ErrorCodes::LOGICAL_ERROR,
                "Data type {} of column at position {} cannot represent all columns from i-th prefix",
                current_type->getName(), i);
    }
}
/// Convenience overload: analyzes the field and delegates to the
/// two-argument insert.
void ColumnObject::Subcolumn::insert(Field field)
{
    FieldInfo field_info = getFieldInfo(field);
    insert(std::move(field), std::move(field_info));
}
/// Inserts a field whose FieldInfo was computed by the caller.
/// May widen the subcolumn's least common type and start a new column part.
void ColumnObject::Subcolumn::insert(Field field, FieldInfo info)
{
    auto base_type = info.scalar_type;

    /// A scalar of type Nothing (e.g. NULL with no dimensions) is just a default row.
    if (isNothing(base_type) && info.num_dimensions == 0)
    {
        insertDefault();
        return;
    }

    auto column_dim = getNumberOfDimensions(*least_common_type);
    auto value_dim = info.num_dimensions;

    /// If the column has no concrete type yet, adopt the value's dimensionality.
    if (isNothing(least_common_type))
        column_dim = value_dim;

    /// NULL matches any dimensionality (it is expanded to defaults below).
    if (field.isNull())
        value_dim = column_dim;

    if (value_dim != column_dim)
        throw Exception(ErrorCodes::NUMBER_OF_DIMENSIONS_MISMATHED,
            "Dimension of types mismatched between inserted value and column. "
            "Dimension of value: {}. Dimension of column: {}",
            value_dim, column_dim);

    if (is_nullable)
        base_type = makeNullable(base_type);

    /// A non-nullable subcolumn cannot store NULLs: replace them with
    /// defaults of the scalar type (or empty arrays at array depth).
    if (!is_nullable && info.have_nulls)
        field = applyVisitor(FieldVisitorReplaceNull(base_type->getDefault(), value_dim), std::move(field));

    auto value_type = createArrayOfType(base_type, value_dim);
    bool type_changed = false;

    if (data.empty())
    {
        data.push_back(value_type->createColumn());
        least_common_type = value_type;
    }
    else if (!least_common_type->equals(*value_type))
    {
        /// The value does not fit the current type: compute the least common
        /// supertype and, if the column's type really widened, start a new part.
        value_type = getLeastSupertype(DataTypes{value_type, least_common_type}, true);
        type_changed = true;
        if (!least_common_type->equals(*value_type))
        {
            data.push_back(value_type->createColumn());
            least_common_type = value_type;
        }
    }

    /// Convert the field if the target type changed or the field mixes scalar types.
    if (type_changed || info.need_convert)
        field = convertFieldToTypeOrThrow(field, *value_type);

    data.back()->insert(field);
}
/// Inserts [start, start + length) rows from a *finalized* source subcolumn,
/// casting to a common supertype when the types differ.
void ColumnObject::Subcolumn::insertRangeFrom(const Subcolumn & src, size_t start, size_t length)
{
    /// The source must consist of a single part of its common type.
    assert(src.isFinalized());
    const auto & src_column = src.data.back();
    const auto & src_type = src.least_common_type;

    if (data.empty())
    {
        /// Empty destination simply adopts the source type.
        least_common_type = src_type;
        data.push_back(src_type->createColumn());
        data.back()->insertRangeFrom(*src_column, start, length);
    }
    else if (least_common_type->equals(*src_type))
    {
        data.back()->insertRangeFrom(*src_column, start, length);
    }
    else
    {
        /// Types differ: cast the source to the least common supertype and,
        /// if our own type widened, start a new column part for it.
        auto new_least_common_type = getLeastSupertype(DataTypes{least_common_type, src_type}, true);
        auto casted_column = castColumn({src_column, src_type, ""}, new_least_common_type);

        if (!least_common_type->equals(*new_least_common_type))
        {
            least_common_type = new_least_common_type;
            data.push_back(least_common_type->createColumn());
        }

        data.back()->insertRangeFrom(*casted_column, start, length);
    }
}
/// Converts all parts to the least common type and merges them into a single
/// column, materializing the implicit default prefix. After this the
/// subcolumn satisfies isFinalized().
void ColumnObject::Subcolumn::finalize()
{
    if (isFinalized() || data.empty())
        return;

    const auto & to_type = least_common_type;
    auto result_column = to_type->createColumn();

    /// Implicit defaults become real default values of the final type.
    if (num_of_defaults_in_prefix)
        result_column->insertManyDefaults(num_of_defaults_in_prefix);

    for (auto & part : data)
    {
        auto from_type = getDataTypeByColumn(*part);
        size_t part_size = part->size();

        if (!from_type->equals(*to_type))
        {
            auto offsets = ColumnUInt64::create();
            auto & offsets_data = offsets->getData();

            /// We need to convert only non-default values and then recreate column
            /// with default value of new type, because default values (which represents misses in data)
            /// may be inconsistent between types (e.g "0" in UInt64 and empty string in String).
            part->getIndicesOfNonDefaultRows(offsets_data, 0, part_size);

            if (offsets->size() == part_size)
            {
                /// No defaults in this part: a plain cast is enough.
                part = castColumn({part, from_type, ""}, to_type);
            }
            else
            {
                /// Cast only the non-default values and scatter them back
                /// over a column of new-type defaults.
                auto values = part->index(*offsets, offsets->size());
                values = castColumn({values, from_type, ""}, to_type);
                part = values->createWithOffsets(offsets_data, to_type->getDefault(), part_size, /*shift=*/ 0);
            }
        }

        result_column->insertRangeFrom(*part, 0, part_size);
    }

    data = { std::move(result_column) };
    num_of_defaults_in_prefix = 0;
}
/// Appends one default row. While no typed part exists yet, defaults are
/// only counted and will be materialized with the final type in finalize().
void ColumnObject::Subcolumn::insertDefault()
{
    if (!data.empty())
        data.back()->insertDefault();
    else
        ++num_of_defaults_in_prefix;
}

/// Appends `length` default rows (see insertDefault()).
void ColumnObject::Subcolumn::insertManyDefaults(size_t length)
{
    if (!data.empty())
        data.back()->insertManyDefaults(length);
    else
        num_of_defaults_in_prefix += length;
}
/// Removes the last n rows, n must not exceed size().
void ColumnObject::Subcolumn::popBack(size_t n)
{
    assert(n <= size());

    /// Walk the parts from newest to oldest, dropping whole parts while they
    /// fit into n and trimming the first part that only partially fits.
    size_t num_removed = 0;
    for (auto it = data.rbegin(); it != data.rend(); ++it)
    {
        if (n == 0)
            break;

        auto & column = *it;
        if (n < column->size())
        {
            column->popBack(n);
            n = 0;
        }
        else
        {
            ++num_removed;
            n -= column->size();
        }
    }

    data.resize(data.size() - num_removed);
    /// Whatever remains to be removed comes out of the implicit default prefix.
    num_of_defaults_in_prefix -= n;
}
/// Returns the most recently inserted field, or an empty Field if
/// nothing was inserted into any part yet.
Field ColumnObject::Subcolumn::getLastField() const
{
    if (data.empty())
        return {};

    const auto & last_part = data.back();
    assert(!last_part->empty());
    return (*last_part)[last_part->size() - 1];
}
/// Recreates this subcolumn with default scalar values of the given FieldInfo,
/// keeping the sizes (offsets) of all arrays. Used to create columns of type
/// Nested with consistent array sizes across sibling subcolumns.
ColumnObject::Subcolumn ColumnObject::Subcolumn::recreateWithDefaultValues(const FieldInfo & field_info) const
{
    auto scalar_type = field_info.scalar_type;
    if (is_nullable)
        scalar_type = makeNullable(scalar_type);

    Subcolumn new_subcolumn;
    new_subcolumn.least_common_type = createArrayOfType(scalar_type, field_info.num_dimensions);
    new_subcolumn.is_nullable = is_nullable;
    new_subcolumn.num_of_defaults_in_prefix = num_of_defaults_in_prefix;
    new_subcolumn.data.reserve(data.size());

    /// Each part keeps its array structure but all scalars become defaults.
    for (const auto & part : data)
        new_subcolumn.data.push_back(recreateColumnWithDefaultValues(
            part, scalar_type, field_info.num_dimensions));

    return new_subcolumn;
}
/// Accessors for the single merged column part.
/// Valid only after finalize(); checked by assert in debug builds.
IColumn & ColumnObject::Subcolumn::getFinalizedColumn()
{
    assert(isFinalized());
    return *data[0];
}

const IColumn & ColumnObject::Subcolumn::getFinalizedColumn() const
{
    assert(isFinalized());
    return *data[0];
}

const ColumnPtr & ColumnObject::Subcolumn::getFinalizedColumnPtr() const
{
    assert(isFinalized());
    return data[0];
}
/// Creates an empty object column.
ColumnObject::ColumnObject(bool is_nullable_)
    : is_nullable(is_nullable_)
    , num_rows(0)
{
}

/// Takes ownership of an existing subcolumns tree; the row count is taken
/// from the first subcolumn and all others must match (checked below).
ColumnObject::ColumnObject(SubcolumnsTree && subcolumns_, bool is_nullable_)
    : is_nullable(is_nullable_)
    , subcolumns(std::move(subcolumns_))
    , num_rows(subcolumns.empty() ? 0 : (*subcolumns.begin())->data.size())
{
    checkConsistency();
}
/// Verifies that every subcolumn has exactly num_rows rows.
void ColumnObject::checkConsistency() const
{
    if (subcolumns.empty())
        return;

    for (const auto & leaf : subcolumns)
    {
        if (leaf->data.size() != num_rows)
        {
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Sizes of subcolumns are inconsistent in ColumnObject."
                " Subcolumn '{}' has {} rows, but expected size is {}",
                leaf->path.getPath(), leaf->data.size(), num_rows);
        }
    }
}
/// Number of rows in the column.
size_t ColumnObject::size() const
{
#ifndef NDEBUG
    /// The consistency check is linear in the number of subcolumns,
    /// so it is performed only in debug builds.
    checkConsistency();
#endif
    return num_rows;
}
/// Supports only resize to zero rows, which is how cloneEmpty() works.
MutableColumnPtr ColumnObject::cloneResized(size_t new_size) const
{
    if (new_size == 0)
        return ColumnObject::create(is_nullable);

    throw Exception(ErrorCodes::NOT_IMPLEMENTED,
        "ColumnObject doesn't support resize to non-zero length");
}
/// Total byte size over all subcolumns.
size_t ColumnObject::byteSize() const
{
    size_t total = 0;
    for (const auto & node : subcolumns)
        total += node->data.byteSize();
    return total;
}

/// Total allocated bytes over all subcolumns.
size_t ColumnObject::allocatedBytes() const
{
    size_t total = 0;
    for (const auto & node : subcolumns)
        total += node->data.allocatedBytes();
    return total;
}
/// Applies the callback to the single finalized column part of each subcolumn.
void ColumnObject::forEachSubcolumn(ColumnCallback callback)
{
    /// Non-finalized subcolumns have several parts, so there is no single
    /// WrappedPtr to hand to the callback.
    if (!isFinalized())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot iterate over non-finalized ColumnObject");

    for (auto & entry : subcolumns)
        callback(entry->data.data.back());
}
/// Inserts one row, given as a Field holding an Object (key -> value map).
void ColumnObject::insert(const Field & field)
{
    const auto & object = field.get<const Object &>();

    HashSet<StringRef, StringRefHash> inserted;
    size_t old_size = size();
    for (const auto & [key_str, value] : object)
    {
        PathInData key(key_str);
        inserted.insert(key_str);

        /// A key seen for the first time gets a new subcolumn padded
        /// with old_size default rows, so all subcolumns stay aligned.
        if (!hasSubcolumn(key))
            addSubcolumn(key, old_size);

        auto & subcolumn = getSubcolumn(key);
        subcolumn.insert(value);
    }

    /// Keys absent from the inserted object get a default row.
    for (auto & entry : subcolumns)
        if (!inserted.has(entry->path.getPath()))
            entry->data.insertDefault();

    ++num_rows;
}
/// Appends one default row to every subcolumn.
void ColumnObject::insertDefault()
{
    for (auto & node : subcolumns)
        node->data.insertDefault();

    ++num_rows;
}
/// Returns row n as a Field holding an Object (path -> value).
Field ColumnObject::operator[](size_t n) const
{
    /// Each subcolumn must be a single column part to index into it.
    if (!isFinalized())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get Field from non-finalized ColumnObject");

    Object object;
    for (const auto & entry : subcolumns)
        object[entry->path.getPath()] = (*entry->data.data.back())[n];

    return object;
}
void ColumnObject::get(size_t n, Field & res) const
{
if (!isFinalized())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get Field from non-finalized ColumnObject");
auto & object = res.get<Object &>();
for (const auto & entry : subcolumns)
{
auto it = object.try_emplace(entry->path.getPath()).first;
entry->data.data.back()->get(n, it->second);
}
}
/// Inserts [start, start + length) rows from another ColumnObject.
void ColumnObject::insertRangeFrom(const IColumn & src, size_t start, size_t length)
{
    const auto & src_object = assert_cast<const ColumnObject &>(src);

    for (auto & entry : subcolumns)
    {
        /// Keys missing in the source are padded with defaults.
        if (src_object.hasSubcolumn(entry->path))
            entry->data.insertRangeFrom(src_object.getSubcolumn(entry->path), start, length);
        else
            entry->data.insertManyDefaults(length);
    }

    /// NOTE(review): subcolumns present only in `src` are silently ignored
    /// here — confirm this is intended for callers of insertRangeFrom.
    num_rows += length;
    finalize();
}
/// Replicates each row according to offsets (standard IColumn semantics),
/// producing a new ColumnObject with the same set of subcolumns.
ColumnPtr ColumnObject::replicate(const Offsets & offsets) const
{
    if (!isFinalized())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot replicate non-finalized ColumnObject");

    auto res_column = ColumnObject::create(is_nullable);
    for (const auto & entry : subcolumns)
    {
        /// Replication is delegated to the finalized part of each subcolumn.
        auto replicated_data = entry->data.data.back()->replicate(offsets)->assumeMutable();
        res_column->addSubcolumn(entry->path, std::move(replicated_data));
    }

    return res_column;
}
/// Removes the trailing `length` rows from every subcolumn.
void ColumnObject::popBack(size_t length)
{
    for (auto & entry : subcolumns)
        entry->data.popBack(length);

    num_rows -= length;
}
/// Returns the subcolumn at the given path; throws if it does not exist.
const ColumnObject::Subcolumn & ColumnObject::getSubcolumn(const PathInData & key) const
{
    if (const auto * node = subcolumns.findLeaf(key))
        return node->data;

    throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in ColumnObject", key.getPath());
}

/// Mutable overload of the accessor above.
ColumnObject::Subcolumn & ColumnObject::getSubcolumn(const PathInData & key)
{
    /// findLeaf is const-only, so the found node is const_cast'ed back;
    /// this is safe because `this` is non-const here.
    if (const auto * node = subcolumns.findLeaf(key))
        return const_cast<SubcolumnsTree::Node *>(node)->data;

    throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in ColumnObject", key.getPath());
}
/// True if a leaf subcolumn exists at the given path.
bool ColumnObject::hasSubcolumn(const PathInData & key) const
{
    const auto * leaf = subcolumns.findLeaf(key);
    return leaf != nullptr;
}
/// Adds a subcolumn wrapping an existing column. Its size must match the
/// column's row count (or define it if the column is still empty).
void ColumnObject::addSubcolumn(const PathInData & key, MutableColumnPtr && subcolumn)
{
    size_t new_size = subcolumn->size();
    bool inserted = subcolumns.add(key, Subcolumn(std::move(subcolumn), is_nullable));
    if (!inserted)
        throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Subcolumn '{}' already exists", key.getPath());

    if (num_rows == 0)
        num_rows = new_size;
    else if (new_size != num_rows)
        throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH,
            "Size of subcolumn {} ({}) is inconsistent with column size ({})",
            key.getPath(), new_size, num_rows);
}
/// Adds a subcolumn of `new_size` default rows. The size must match the
/// column's row count (or define it if the column is still empty).
void ColumnObject::addSubcolumn(const PathInData & key, size_t new_size)
{
    bool inserted = subcolumns.add(key, Subcolumn(new_size, is_nullable));
    if (!inserted)
        throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Subcolumn '{}' already exists", key.getPath());

    if (num_rows == 0)
        num_rows = new_size;
    else if (new_size != num_rows)
        throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH,
            "Required size of subcolumn {} ({}) is inconsistent with column size ({})",
            key.getPath(), new_size, num_rows);
}
/// Adds a subcolumn of type Nested of `new_size` rows with default values.
/// Keeps array sizes consistent with sibling subcolumns of the same Nested.
void ColumnObject::addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size)
{
    if (!key.hasNested())
        throw Exception(ErrorCodes::LOGICAL_ERROR,
            "Cannot add Nested subcolumn, because path doesn't contain Nested");

    bool inserted = false;
    /// We find node that represents the same Nested type as @key.
    const auto * nested_node = subcolumns.findBestMatch(key);

    if (nested_node)
    {
        /// Find any leaf of Nested subcolumn.
        const auto * leaf = subcolumns.findLeaf(nested_node, [&](const auto &) { return true; });
        assert(leaf);

        /// Recreate subcolumn with default values and the same sizes of arrays.
        auto new_subcolumn = leaf->data.recreateWithDefaultValues(field_info);

        /// It's possible that we have already inserted value from current row
        /// to this subcolumn. So, adjust size to expected.
        if (new_subcolumn.size() > new_size)
            new_subcolumn.popBack(new_subcolumn.size() - new_size);

        assert(new_subcolumn.size() == new_size);
        inserted = subcolumns.add(key, new_subcolumn);
    }
    else
    {
        /// If node was not found just add subcolumn with empty arrays.
        inserted = subcolumns.add(key, Subcolumn(new_size, is_nullable));
    }

    if (!inserted)
        throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Subcolumn '{}' already exists", key.getPath());

    if (num_rows == 0)
        num_rows = new_size;
    /// Validate the size against the existing row count, same as the other
    /// addSubcolumn overloads do; otherwise an inconsistent column would be
    /// detected only later by checkConsistency().
    else if (new_size != num_rows)
        throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH,
            "Required size of subcolumn {} ({}) is inconsistent with column size ({})",
            key.getPath(), new_size, num_rows);
}
/// Collects the paths of all subcolumns.
PathsInData ColumnObject::getKeys() const
{
    PathsInData res;
    res.reserve(subcolumns.size());
    for (const auto & node : subcolumns)
        res.push_back(node->path);
    return res;
}
/// The column is finalized iff every subcolumn is finalized.
bool ColumnObject::isFinalized() const
{
    for (const auto & entry : subcolumns)
        if (!entry->data.isFinalized())
            return false;
    return true;
}
/// Finalizes all subcolumns, dropping all-NULL ones, and validates that
/// the remaining paths are unambiguous.
void ColumnObject::finalize()
{
    size_t old_size = size();
    SubcolumnsTree new_subcolumns;
    for (auto && entry : subcolumns)
    {
        const auto & least_common_type = entry->data.getLeastCommonType();

        /// Do not add subcolumns which consist only of NULLs
        /// (their scalar type is Nothing).
        if (isNothing(getBaseTypeOfArray(least_common_type)))
            continue;

        entry->data.finalize();
        new_subcolumns.add(entry->path, entry->data);
    }

    /// If all subcolumns were skipped add a dummy subcolumn,
    /// because Tuple type must have at least one element.
    if (new_subcolumns.empty())
        new_subcolumns.add(PathInData{COLUMN_NAME_DUMMY}, Subcolumn{ColumnUInt8::create(old_size, 0), is_nullable});

    std::swap(subcolumns, new_subcolumns);
    checkObjectHasNoAmbiguosPaths(getKeys());
}
}

219
src/Columns/ColumnObject.h Normal file
View File

@ -0,0 +1,219 @@
#pragma once
#include <Core/Field.h>
#include <Core/Names.h>
#include <Columns/IColumn.h>
#include <Common/PODArray.h>
#include <Common/HashTable/HashMap.h>
#include <DataTypes/Serializations/JSONDataParser.h>
#include <DataTypes/Serializations/SubcolumnsTree.h>
#include <DataTypes/IDataType.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
/// Info that represents a scalar or array field in a decomposed view.
/// It allows to recreate field with different number
/// of dimensions or nullability.
struct FieldInfo
{
    /// The common type of all scalars in field.
    DataTypePtr scalar_type;

    /// Do we have NULL scalar in field.
    bool have_nulls;

    /// If true then we have scalars with different types in array and
    /// we need to convert scalars to the common type.
    bool need_convert;

    /// Number of dimensions in array. 0 if field is scalar.
    size_t num_dimensions;
};

/// Computes FieldInfo for a field (see FieldVisitorToScalarType and
/// FieldVisitorToNumberOfDimensions in ColumnObject.cpp).
FieldInfo getFieldInfo(const Field & field);
/** A column that represents object with dynamic set of subcolumns.
 * Subcolumns are identified by paths in document and are stored in
 * a trie-like structure. ColumnObject is not suitable for writing into tables
 * and it should be converted to Tuple with fixed set of subcolumns before that.
 */
class ColumnObject final : public COWHelper<IColumn, ColumnObject>
{
public:
    /** Class that represents one subcolumn.
     * It stores values in several parts of column
     * and keeps current common type of all parts.
     * We add a new column part with a new type, when we insert a field,
     * which can't be converted to the current common type.
     * After insertion of all values subcolumn should be finalized
     * for writing and other operations.
     */
    class Subcolumn
    {
    public:
        Subcolumn() = default;

        /// Creates a subcolumn of `size_` implicit default rows (type unknown yet).
        Subcolumn(size_t size_, bool is_nullable_);

        /// Wraps an existing column as the single typed part.
        Subcolumn(MutableColumnPtr && data_, bool is_nullable_);

        size_t size() const;
        size_t byteSize() const;
        size_t allocatedBytes() const;

        /// Finalized means: exactly one column part and no implicit defaults left.
        bool isFinalized() const { return data.size() == 1 && num_of_defaults_in_prefix == 0; }

        const DataTypePtr & getLeastCommonType() const { return least_common_type; }

        /// Checks the consistency of column's parts stored in @data.
        void checkTypes() const;

        /// Inserts a field, which scalars can be arbitrary, but number of
        /// dimensions should be consistent with current common type.
        void insert(Field field);
        void insert(Field field, FieldInfo info);

        void insertDefault();
        void insertManyDefaults(size_t length);
        void insertRangeFrom(const Subcolumn & src, size_t start, size_t length);
        void popBack(size_t n);

        /// Converts all column's parts to the common type and
        /// creates a single column that stores all values.
        void finalize();

        /// Returns last inserted field.
        Field getLastField() const;

        /// Recreates subcolumn with default scalar values and keeps sizes of arrays.
        /// Used to create columns of type Nested with consistent array sizes.
        Subcolumn recreateWithDefaultValues(const FieldInfo & field_info) const;

        /// Returns single column if subcolumn is finalized.
        /// Otherwise -- undefined behaviour.
        IColumn & getFinalizedColumn();
        const IColumn & getFinalizedColumn() const;
        const ColumnPtr & getFinalizedColumnPtr() const;

        friend class ColumnObject;

    private:
        /// Current least common type of all values inserted to this subcolumn.
        DataTypePtr least_common_type;

        /// If true then common type of subcolumn is Nullable
        /// and default values are NULLs.
        bool is_nullable = false;

        /// Parts of column. Parts should be in increasing order in terms of subtypes/supertypes.
        /// That means that the least common type for i-th prefix is the type of i-th part
        /// and it's the supertype for all type of column from 0 to i-1.
        std::vector<WrappedPtr> data;

        /// Until we insert any non-default field we don't know further
        /// least common type and we count number of defaults in prefix,
        /// which will be converted to the default type of final common type.
        size_t num_of_defaults_in_prefix = 0;
    };

    using SubcolumnsTree = SubcolumnsTree<Subcolumn>;

private:
    /// If true then all subcolumns are nullable.
    const bool is_nullable;

    /// Subcolumns keyed by their paths in the document.
    SubcolumnsTree subcolumns;

    /// Number of rows; every subcolumn must have exactly this many rows.
    size_t num_rows;

public:
    static constexpr auto COLUMN_NAME_DUMMY = "_dummy";

    explicit ColumnObject(bool is_nullable_);
    ColumnObject(SubcolumnsTree && subcolumns_, bool is_nullable_);

    /// Checks that all subcolumns have consistent sizes.
    void checkConsistency() const;

    bool hasSubcolumn(const PathInData & key) const;

    const Subcolumn & getSubcolumn(const PathInData & key) const;
    Subcolumn & getSubcolumn(const PathInData & key);

    void incrementNumRows() { ++num_rows; }

    /// Adds a subcolumn from existing IColumn.
    void addSubcolumn(const PathInData & key, MutableColumnPtr && subcolumn);

    /// Adds a subcolumn of specific size with default values.
    void addSubcolumn(const PathInData & key, size_t new_size);

    /// Adds a subcolumn of type Nested of specific size with default values.
    /// It cares about consistency of sizes of Nested arrays.
    void addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size);

    const SubcolumnsTree & getSubcolumns() const { return subcolumns; }
    SubcolumnsTree & getSubcolumns() { return subcolumns; }
    PathsInData getKeys() const;

    /// Finalizes all subcolumns.
    void finalize();
    bool isFinalized() const;

    /// Part of interface
    const char * getFamilyName() const override { return "Object"; }
    TypeIndex getDataType() const override { return TypeIndex::Object; }

    size_t size() const override;
    MutableColumnPtr cloneResized(size_t new_size) const override;
    size_t byteSize() const override;
    size_t allocatedBytes() const override;
    void forEachSubcolumn(ColumnCallback callback) override;
    void insert(const Field & field) override;
    void insertDefault() override;
    void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
    ColumnPtr replicate(const Offsets & offsets) const override;
    void popBack(size_t length) override;
    Field operator[](size_t n) const override;
    void get(size_t n, Field & res) const override;

    /// All other methods throw exception.
    ColumnPtr decompress() const override { throwMustBeConcrete(); }
    StringRef getDataAt(size_t) const override { throwMustBeConcrete(); }
    bool isDefaultAt(size_t) const override { throwMustBeConcrete(); }
    void insertData(const char *, size_t) override { throwMustBeConcrete(); }
    StringRef serializeValueIntoArena(size_t, Arena &, char const *&) const override { throwMustBeConcrete(); }
    const char * deserializeAndInsertFromArena(const char *) override { throwMustBeConcrete(); }
    const char * skipSerializedInArena(const char *) const override { throwMustBeConcrete(); }
    void updateHashWithValue(size_t, SipHash &) const override { throwMustBeConcrete(); }
    void updateWeakHash32(WeakHash32 &) const override { throwMustBeConcrete(); }
    void updateHashFast(SipHash &) const override { throwMustBeConcrete(); }
    ColumnPtr filter(const Filter &, ssize_t) const override { throwMustBeConcrete(); }
    void expand(const Filter &, bool) override { throwMustBeConcrete(); }
    ColumnPtr permute(const Permutation &, size_t) const override { throwMustBeConcrete(); }
    ColumnPtr index(const IColumn &, size_t) const override { throwMustBeConcrete(); }
    int compareAt(size_t, size_t, const IColumn &, int) const override { throwMustBeConcrete(); }
    void compareColumn(const IColumn &, size_t, PaddedPODArray<UInt64> *, PaddedPODArray<Int8> &, int, int) const override { throwMustBeConcrete(); }
    bool hasEqualValues() const override { throwMustBeConcrete(); }
    void getPermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &) const override { throwMustBeConcrete(); }
    void updatePermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &, EqualRanges &) const override { throwMustBeConcrete(); }
    MutableColumns scatter(ColumnIndex, const Selector &) const override { throwMustBeConcrete(); }
    void gather(ColumnGathererStream &) override { throwMustBeConcrete(); }
    void getExtremes(Field &, Field &) const override { throwMustBeConcrete(); }
    size_t byteSizeAt(size_t) const override { throwMustBeConcrete(); }
    double getRatioOfDefaultRows(double) const override { throwMustBeConcrete(); }
    void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeConcrete(); }

private:
    [[noreturn]] static void throwMustBeConcrete()
    {
        throw Exception("ColumnObject must be converted to ColumnTuple before use", ErrorCodes::LOGICAL_ERROR);
    }
};
}

View File

@ -288,7 +288,7 @@ void ColumnSparse::popBack(size_t n)
ColumnPtr ColumnSparse::filter(const Filter & filt, ssize_t) const
{
if (_size != filt.size())
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), _size);
if (offsets->empty())
{

View File

@ -381,7 +381,7 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
{
size_t size = data.size();
if (size != filt.size())
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size);
auto res = this->create();
Container & res_data = res->getData();
@ -450,7 +450,7 @@ void ColumnVector<T>::applyZeroMap(const IColumn::Filter & filt, bool inverted)
{
size_t size = data.size();
if (size != filt.size())
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size);
const UInt8 * filt_pos = filt.data();
const UInt8 * filt_end = filt_pos + size;

View File

@ -192,7 +192,7 @@ namespace
{
const size_t size = src_offsets.size();
if (size != filt.size())
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size);
ResultOffsetsBuilder result_offsets_builder(res_offsets);

View File

@ -883,8 +883,8 @@ public:
return toDayNum(years_lut[year - DATE_LUT_MIN_YEAR]);
}
template <typename Date,
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
template <typename Date>
requires std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>
inline auto toStartOfQuarterInterval(Date d, UInt64 quarters) const
{
if (quarters == 1)
@ -892,8 +892,8 @@ public:
return toStartOfMonthInterval(d, quarters * 3);
}
template <typename Date,
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
template <typename Date>
requires std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>
inline auto toStartOfMonthInterval(Date d, UInt64 months) const
{
if (months == 1)
@ -906,8 +906,8 @@ public:
return toDayNum(years_months_lut[month_total_index / months * months]);
}
template <typename Date,
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
template <typename Date>
requires std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>
inline auto toStartOfWeekInterval(Date d, UInt64 weeks) const
{
if (weeks == 1)
@ -920,8 +920,8 @@ public:
return ExtendedDayNum(4 + (d - 4) / days * days);
}
template <typename Date,
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
template <typename Date>
requires std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>
inline Time toStartOfDayInterval(Date d, UInt64 days) const
{
if (days == 1)
@ -1219,10 +1219,8 @@ public:
/// If resulting month has less deys than source month, then saturation can happen.
/// Example: 31 Aug + 1 month = 30 Sep.
template <
typename DateTime,
typename
= std::enable_if_t<std::is_same_v<DateTime, UInt32> || std::is_same_v<DateTime, Int64> || std::is_same_v<DateTime, time_t>>>
template <typename DateTime>
requires std::is_same_v<DateTime, UInt32> || std::is_same_v<DateTime, Int64> || std::is_same_v<DateTime, time_t>
inline Time NO_SANITIZE_UNDEFINED addMonths(DateTime t, Int64 delta) const
{
const auto result_day = addMonthsIndex(t, delta);
@ -1247,8 +1245,8 @@ public:
return res;
}
template <typename Date,
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
template <typename Date>
requires std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>
inline auto NO_SANITIZE_UNDEFINED addMonths(Date d, Int64 delta) const
{
if constexpr (std::is_same_v<Date, DayNum>)
@ -1280,10 +1278,8 @@ public:
}
/// Saturation can occur if 29 Feb is mapped to non-leap year.
template <
typename DateTime,
typename
= std::enable_if_t<std::is_same_v<DateTime, UInt32> || std::is_same_v<DateTime, Int64> || std::is_same_v<DateTime, time_t>>>
template <typename DateTime>
requires std::is_same_v<DateTime, UInt32> || std::is_same_v<DateTime, Int64> || std::is_same_v<DateTime, time_t>
inline Time addYears(DateTime t, Int64 delta) const
{
auto result_day = addYearsIndex(t, delta);
@ -1308,8 +1304,8 @@ public:
return res;
}
template <typename Date,
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
template <typename Date>
requires std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>
inline auto addYears(Date d, Int64 delta) const
{
if constexpr (std::is_same_v<Date, DayNum>)

View File

@ -613,6 +613,7 @@
M(642, CANNOT_PACK_ARCHIVE) \
M(643, CANNOT_UNPACK_ARCHIVE) \
M(644, REMOTE_FS_OBJECT_CACHE_ERROR) \
M(645, NUMBER_OF_DIMENSIONS_MISMATHED) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \

View File

@ -205,7 +205,8 @@ void rethrowFirstException(const Exceptions & exceptions);
template <typename T>
std::enable_if_t<std::is_pointer_v<T>, T> exception_cast(std::exception_ptr e)
requires std::is_pointer_v<T>
T exception_cast(std::exception_ptr e)
{
try
{

View File

@ -46,6 +46,11 @@ public:
throw Exception("Cannot convert Map to " + demangle(typeid(T).name()), ErrorCodes::CANNOT_CONVERT_TYPE);
}
T operator() (const Object &) const
{
throw Exception("Cannot convert Object to " + demangle(typeid(T).name()), ErrorCodes::CANNOT_CONVERT_TYPE);
}
T operator() (const UInt64 & x) const { return T(x); }
T operator() (const Int64 & x) const { return T(x); }
T operator() (const Int128 & x) const { return T(x); }
@ -113,7 +118,8 @@ public:
throw Exception("Cannot convert AggregateFunctionStateData to " + demangle(typeid(T).name()), ErrorCodes::CANNOT_CONVERT_TYPE);
}
template <typename U, typename = std::enable_if_t<is_big_int_v<U>> >
template <typename U>
requires is_big_int_v<U>
T operator() (const U & x) const
{
if constexpr (is_decimal<T>)

View File

@ -95,6 +95,23 @@ String FieldVisitorDump::operator() (const Map & x) const
return wb.str();
}
String FieldVisitorDump::operator() (const Object & x) const
{
WriteBufferFromOwnString wb;
wb << "Object_(";
for (auto it = x.begin(); it != x.end(); ++it)
{
if (it != x.begin())
wb << ", ";
wb << "(" << it->first << ", " << applyVisitor(*this, it->second) << ")";
}
wb << ')';
return wb.str();
}
String FieldVisitorDump::operator() (const AggregateFunctionStateData & x) const
{
WriteBufferFromOwnString wb;

View File

@ -22,6 +22,7 @@ public:
String operator() (const Array & x) const;
String operator() (const Tuple & x) const;
String operator() (const Map & x) const;
String operator() (const Object & x) const;
String operator() (const DecimalField<Decimal32> & x) const;
String operator() (const DecimalField<Decimal64> & x) const;
String operator() (const DecimalField<Decimal128> & x) const;

View File

@ -94,6 +94,19 @@ void FieldVisitorHash::operator() (const Array & x) const
applyVisitor(*this, elem);
}
void FieldVisitorHash::operator() (const Object & x) const
{
UInt8 type = Field::Types::Object;
hash.update(type);
hash.update(x.size());
for (const auto & [key, value]: x)
{
hash.update(key);
applyVisitor(*this, value);
}
}
void FieldVisitorHash::operator() (const DecimalField<Decimal32> & x) const
{
UInt8 type = Field::Types::Decimal32;

View File

@ -28,6 +28,7 @@ public:
void operator() (const Array & x) const;
void operator() (const Tuple & x) const;
void operator() (const Map & x) const;
void operator() (const Object & x) const;
void operator() (const DecimalField<Decimal32> & x) const;
void operator() (const DecimalField<Decimal64> & x) const;
void operator() (const DecimalField<Decimal128> & x) const;

View File

@ -26,6 +26,7 @@ bool FieldVisitorSum::operator() (String &) const { throw Exception("Cannot sum
bool FieldVisitorSum::operator() (Array &) const { throw Exception("Cannot sum Arrays", ErrorCodes::LOGICAL_ERROR); }
bool FieldVisitorSum::operator() (Tuple &) const { throw Exception("Cannot sum Tuples", ErrorCodes::LOGICAL_ERROR); }
bool FieldVisitorSum::operator() (Map &) const { throw Exception("Cannot sum Maps", ErrorCodes::LOGICAL_ERROR); }
bool FieldVisitorSum::operator() (Object &) const { throw Exception("Cannot sum Objects", ErrorCodes::LOGICAL_ERROR); }
bool FieldVisitorSum::operator() (UUID &) const { throw Exception("Cannot sum UUIDs", ErrorCodes::LOGICAL_ERROR); }
bool FieldVisitorSum::operator() (AggregateFunctionStateData &) const

View File

@ -25,6 +25,7 @@ public:
bool operator() (Array &) const;
bool operator() (Tuple &) const;
bool operator() (Map &) const;
bool operator() (Object &) const;
bool operator() (UUID &) const;
bool operator() (AggregateFunctionStateData &) const;
bool operator() (bool &) const;
@ -36,7 +37,8 @@ public:
return x.getValue() != T(0);
}
template <typename T, typename = std::enable_if_t<is_big_int_v<T>> >
template <typename T>
requires is_big_int_v<T>
bool operator() (T & x) const
{
x += rhs.reinterpret<T>();

View File

@ -126,5 +126,24 @@ String FieldVisitorToString::operator() (const Map & x) const
return wb.str();
}
String FieldVisitorToString::operator() (const Object & x) const
{
WriteBufferFromOwnString wb;
wb << '{';
for (auto it = x.begin(); it != x.end(); ++it)
{
if (it != x.begin())
wb << ", ";
writeDoubleQuoted(it->first, wb);
wb << ": " << applyVisitor(*this, it->second);
}
wb << '}';
return wb.str();
}
}

View File

@ -22,6 +22,7 @@ public:
String operator() (const Array & x) const;
String operator() (const Tuple & x) const;
String operator() (const Map & x) const;
String operator() (const Object & x) const;
String operator() (const DecimalField<Decimal32> & x) const;
String operator() (const DecimalField<Decimal64> & x) const;
String operator() (const DecimalField<Decimal128> & x) const;

View File

@ -66,6 +66,20 @@ void FieldVisitorWriteBinary::operator() (const Map & x, WriteBuffer & buf) cons
}
}
void FieldVisitorWriteBinary::operator() (const Object & x, WriteBuffer & buf) const
{
const size_t size = x.size();
writeBinary(size, buf);
for (const auto & [key, value] : x)
{
const UInt8 type = value.getType();
writeBinary(type, buf);
writeBinary(key, buf);
Field::dispatch([&buf] (const auto & val) { FieldVisitorWriteBinary()(val, buf); }, value);
}
}
void FieldVisitorWriteBinary::operator()(const bool & x, WriteBuffer & buf) const
{
writeBinary(UInt8(x), buf);

View File

@ -21,6 +21,7 @@ public:
void operator() (const Array & x, WriteBuffer & buf) const;
void operator() (const Tuple & x, WriteBuffer & buf) const;
void operator() (const Map & x, WriteBuffer & buf) const;
void operator() (const Object & x, WriteBuffer & buf) const;
void operator() (const DecimalField<Decimal32> & x, WriteBuffer & buf) const;
void operator() (const DecimalField<Decimal64> & x, WriteBuffer & buf) const;
void operator() (const DecimalField<Decimal128> & x, WriteBuffer & buf) const;

View File

@ -46,7 +46,16 @@ FileSegment::State FileSegment::state() const
size_t FileSegment::getDownloadOffset() const
{
std::lock_guard segment_lock(mutex);
return range().left + downloaded_size;
return range().left + getDownloadedSize(segment_lock);
}
size_t FileSegment::getDownloadedSize(std::lock_guard<std::mutex> & /* segment_lock */) const
{
if (download_state == State::DOWNLOADED)
return downloaded_size;
std::lock_guard download_lock(download_mutex);
return downloaded_size;
}
String FileSegment::getCallerId()
@ -174,7 +183,12 @@ void FileSegment::write(const char * from, size_t size)
try
{
cache_writer->write(from, size);
std::lock_guard download_lock(download_mutex);
cache_writer->next();
downloaded_size += size;
}
catch (...)
{
@ -189,9 +203,6 @@ void FileSegment::write(const char * from, size_t size)
throw;
}
std::lock_guard segment_lock(mutex);
downloaded_size += size;
}
FileSegment::State FileSegment::wait()
@ -225,15 +236,15 @@ bool FileSegment::reserve(size_t size)
{
std::lock_guard segment_lock(mutex);
auto caller_id = getCallerId();
if (downloader_id != caller_id)
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Space can be reserved only by downloader (current: {}, expected: {})", caller_id, downloader_id);
if (downloaded_size + size > range().size())
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR,
"Attempt to reserve space too much space ({}) for file segment with range: {} (downloaded size: {})",
size, range().toString(), downloaded_size);
auto caller_id = getCallerId();
if (downloader_id != caller_id)
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Space can be reserved only by downloader (current: {}, expected: {})", caller_id, downloader_id);
assert(reserved_size >= downloaded_size);
}
@ -323,7 +334,7 @@ void FileSegment::complete()
if (download_state == State::SKIP_CACHE || detached)
return;
if (downloaded_size == range().size() && download_state != State::DOWNLOADED)
if (download_state != State::DOWNLOADED && getDownloadedSize(segment_lock) == range().size())
setDownloaded(segment_lock);
if (download_state == State::DOWNLOADING || download_state == State::EMPTY)
@ -350,10 +361,11 @@ void FileSegment::completeImpl(bool allow_non_strict_checking)
if (!download_can_continue)
{
if (!downloaded_size)
size_t current_downloaded_size = getDownloadedSize(segment_lock);
if (current_downloaded_size == 0)
{
download_state = State::SKIP_CACHE;
LOG_TEST(log, "Remove cell {} (downloaded: {})", range().toString(), downloaded_size);
LOG_TEST(log, "Remove cell {} (nothing downloaded)", range().toString());
cache->remove(key(), offset(), cache_lock, segment_lock);
detached = true;
@ -366,7 +378,7 @@ void FileSegment::completeImpl(bool allow_non_strict_checking)
* in FileSegmentsHolder represent a contiguous range, so we can resize
* it only when nobody needs it.
*/
LOG_TEST(log, "Resize cell {} to downloaded: {}", range().toString(), downloaded_size);
LOG_TEST(log, "Resize cell {} to downloaded: {}", range().toString(), current_downloaded_size);
cache->reduceSizeToDownloaded(key(), offset(), cache_lock, segment_lock);
detached = true;
@ -397,7 +409,7 @@ String FileSegment::getInfoForLog() const
WriteBufferFromOwnString info;
info << "File segment: " << range().toString() << ", ";
info << "state: " << download_state << ", ";
info << "downloaded size: " << downloaded_size << ", ";
info << "downloaded size: " << getDownloadedSize(segment_lock) << ", ";
info << "downloader id: " << downloader_id << ", ";
info << "caller id: " << getCallerId();

View File

@ -129,6 +129,7 @@ private:
void setDownloaded(std::lock_guard<std::mutex> & segment_lock);
static String getCallerIdImpl(bool allow_non_strict_checking = false);
void resetDownloaderImpl(std::lock_guard<std::mutex> & segment_lock);
size_t getDownloadedSize(std::lock_guard<std::mutex> & segment_lock) const;
const Range segment_range;
@ -144,6 +145,14 @@ private:
mutable std::mutex mutex;
std::condition_variable cv;
/// Protects downloaded_size access with actual write into fs.
/// downloaded_size is not protected by download_mutex in methods which
/// can never be run in parallel to FileSegment::write() method
/// as downloaded_size is updated only in FileSegment::write() method.
/// Such methods are identified by isDownloader() check at their start,
/// e.g. they are executed strictly by the same thread, sequentially.
mutable std::mutex download_mutex;
Key file_key;
IFileCache * cache;

View File

@ -1,5 +1,11 @@
#pragma once
#include <base/StringRef.h>
#include <base/logger_useful.h>
#include <string_view>
#include <unordered_map>
#include <Common/Arena.h>
#include <Common/getResource.h>
#include <Common/HashTable/HashMap.h>
@ -10,11 +16,6 @@
#include <IO/readFloatText.h>
#include <IO/ZstdInflatingReadBuffer.h>
#include <base/StringRef.h>
#include <base/logger_useful.h>
#include <string_view>
#include <unordered_map>
namespace DB
{
@ -34,7 +35,6 @@ namespace ErrorCodes
class FrequencyHolder
{
public:
struct Language
{
@ -52,6 +52,7 @@ public:
public:
using Map = HashMap<StringRef, Float64>;
using Container = std::vector<Language>;
using EncodingMap = HashMap<UInt16, Float64>;
using EncodingContainer = std::vector<Encoding>;
@ -61,6 +62,30 @@ public:
return instance;
}
const Map & getEmotionalDict() const
{
return emotional_dict;
}
const EncodingContainer & getEncodingsFrequency() const
{
return encodings_freq;
}
const Container & getProgrammingFrequency() const
{
return programming_freq;
}
private:
FrequencyHolder()
{
loadEmotionalDict();
loadEncodingsFrequency();
loadProgrammingFrequency();
}
void loadEncodingsFrequency()
{
Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");
@ -119,7 +144,6 @@ public:
LOG_TRACE(log, "Charset frequencies was added, charsets count: {}", encodings_freq.size());
}
void loadEmotionalDict()
{
Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
@ -158,7 +182,6 @@ public:
LOG_TRACE(log, "Emotional dictionary was added. Word count: {}", std::to_string(count));
}
void loadProgrammingFrequency()
{
Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency");
@ -211,42 +234,10 @@ public:
LOG_TRACE(log, "Programming languages frequencies was added");
}
const Map & getEmotionalDict()
{
std::lock_guard lock(mutex);
if (emotional_dict.empty())
loadEmotionalDict();
return emotional_dict;
}
const EncodingContainer & getEncodingsFrequency()
{
std::lock_guard lock(mutex);
if (encodings_freq.empty())
loadEncodingsFrequency();
return encodings_freq;
}
const Container & getProgrammingFrequency()
{
std::lock_guard lock(mutex);
if (programming_freq.empty())
loadProgrammingFrequency();
return programming_freq;
}
private:
Arena string_pool;
Map emotional_dict;
Container programming_freq;
EncodingContainer encodings_freq;
std::mutex mutex;
};
}

View File

@ -130,6 +130,7 @@ public:
IntervalTree() { nodes.resize(1); }
template <typename TValue = Value, std::enable_if_t<std::is_same_v<TValue, IntervalTreeVoidValue>, bool> = true>
requires std::is_same_v<Value, IntervalTreeVoidValue>
ALWAYS_INLINE bool emplace(Interval interval)
{
assert(!tree_is_built);

View File

@ -76,7 +76,8 @@ public:
void add(const char * value) { add(std::make_unique<JSONString>(value)); }
void add(bool value) { add(std::make_unique<JSONBool>(std::move(value))); }
template <typename T, std::enable_if_t<std::is_arithmetic_v<T>, bool> = true>
template <typename T>
requires std::is_arithmetic_v<T>
void add(T value) { add(std::make_unique<JSONNumber<T>>(value)); }
void format(const FormatSettings & settings, FormatContext & context) override;
@ -100,7 +101,8 @@ public:
void add(std::string key, std::string_view value) { add(std::move(key), std::make_unique<JSONString>(value)); }
void add(std::string key, bool value) { add(std::move(key), std::make_unique<JSONBool>(std::move(value))); }
template <typename T, std::enable_if_t<std::is_arithmetic_v<T>, bool> = true>
template <typename T>
requires std::is_arithmetic_v<T>
void add(std::string key, T value) { add(std::move(key), std::make_unique<JSONNumber<T>>(value)); }
void format(const FormatSettings & settings, FormatContext & context) override;

View File

@ -82,7 +82,8 @@ private:
#endif
public:
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
StringSearcher(const CharT * needle_, const size_t needle_size_)
: needle{reinterpret_cast<const uint8_t *>(needle_)}, needle_size{needle_size_}
{
@ -191,7 +192,8 @@ public:
#endif
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
ALWAYS_INLINE bool compareTrivial(const CharT * haystack_pos, const CharT * const haystack_end, const uint8_t * needle_pos) const
{
while (haystack_pos < haystack_end && needle_pos < needle_end)
@ -217,7 +219,8 @@ public:
return needle_pos == needle_end;
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * haystack_end, const CharT * pos) const
{
@ -262,7 +265,8 @@ public:
/** Returns haystack_end if not found.
*/
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
{
if (0 == needle_size)
@ -338,7 +342,8 @@ public:
return haystack_end;
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const size_t haystack_size) const
{
return search(haystack, haystack + haystack_size);
@ -367,7 +372,8 @@ private:
#endif
public:
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
StringSearcher(const CharT * needle_, const size_t needle_size)
: needle{reinterpret_cast<const uint8_t *>(needle_)}, needle_end{needle + needle_size}
{
@ -399,7 +405,8 @@ public:
#endif
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * /*haystack_end*/, const CharT * pos) const
{
#ifdef __SSE4_1__
@ -453,7 +460,8 @@ public:
return false;
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
{
if (needle == needle_end)
@ -540,7 +548,8 @@ public:
return haystack_end;
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const size_t haystack_size) const
{
return search(haystack, haystack + haystack_size);
@ -568,7 +577,8 @@ private:
#endif
public:
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
StringSearcher(const CharT * needle_, const size_t needle_size)
: needle{reinterpret_cast<const uint8_t *>(needle_)}, needle_end{needle + needle_size}
{
@ -596,7 +606,8 @@ public:
#endif
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * /*haystack_end*/, const CharT * pos) const
{
#ifdef __SSE4_1__
@ -642,7 +653,8 @@ public:
return false;
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
{
if (needle == needle_end)
@ -722,7 +734,8 @@ public:
return haystack_end;
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const size_t haystack_size) const
{
return search(haystack, haystack + haystack_size);
@ -740,7 +753,8 @@ class TokenSearcher : public StringSearcherBase
size_t needle_size;
public:
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
TokenSearcher(const CharT * needle_, const size_t needle_size_)
: searcher{needle_, needle_size_},
needle_size(needle_size_)
@ -752,7 +766,8 @@ public:
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
ALWAYS_INLINE bool compare(const CharT * haystack, const CharT * haystack_end, const CharT * pos) const
{
// use searcher only if pos is in the beginning of token and pos + searcher.needle_size is end of token.
@ -762,7 +777,8 @@ public:
return false;
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
{
// use searcher.search(), then verify that returned value is a token
@ -781,13 +797,15 @@ public:
return haystack_end;
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const size_t haystack_size) const
{
return search(haystack, haystack + haystack_size);
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
ALWAYS_INLINE bool isToken(const CharT * haystack, const CharT * const haystack_end, const CharT* p) const
{
return (p == haystack || isTokenSeparator(*(p - 1)))
@ -819,11 +837,13 @@ struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase
{
const char * const needle;
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
LibCASCIICaseSensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */)
: needle(reinterpret_cast<const char *>(needle_)) {}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
{
const auto * res = strstr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle));
@ -832,7 +852,8 @@ struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase
return reinterpret_cast<const CharT *>(res);
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const size_t haystack_size) const
{
return search(haystack, haystack + haystack_size);
@ -843,11 +864,13 @@ struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase
{
const char * const needle;
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
LibCASCIICaseInsensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */)
: needle(reinterpret_cast<const char *>(needle_)) {}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
{
const auto * res = strcasestr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle));
@ -856,7 +879,8 @@ struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase
return reinterpret_cast<const CharT *>(res);
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
const CharT * search(const CharT * haystack, const size_t haystack_size) const
{
return search(haystack, haystack + haystack_size);

View File

@ -9,7 +9,6 @@
#include <filesystem>
#include <fstream>
#include <optional>
#include <sstream>
#include <unordered_set>
#include <fcntl.h>
@ -21,6 +20,8 @@
#include <sys/types.h>
#include <dirent.h>
#include <boost/algorithm/string/split.hpp>
#include <base/errnoToString.h>
@ -247,9 +248,9 @@ static_assert(sizeof(raw_events_info) / sizeof(raw_events_info[0]) == NUMBER_OF_
#undef CACHE_EVENT
// A map of event name -> event index, to parse event list in settings.
static std::unordered_map<std::string, size_t> populateEventMap()
static std::unordered_map<std::string_view, size_t> populateEventMap()
{
std::unordered_map<std::string, size_t> name_to_index;
std::unordered_map<std::string_view, size_t> name_to_index;
name_to_index.reserve(NUMBER_OF_RAW_EVENTS);
for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i)
@ -455,10 +456,10 @@ std::vector<size_t> PerfEventsCounters::eventIndicesFromString(const std::string
return result;
}
std::vector<std::string> event_names;
boost::split(event_names, events_list, [](char c) { return c == ','; });
std::istringstream iss(events_list); // STYLE_CHECK_ALLOW_STD_STRING_STREAM
std::string event_name;
while (std::getline(iss, event_name, ','))
for (auto & event_name : event_names)
{
// Allow spaces at the beginning of the token, so that you can write 'a, b'.
event_name.erase(0, event_name.find_first_not_of(' '));

View File

@ -75,7 +75,8 @@ inline size_t countCodePoints(const UInt8 * data, size_t size)
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
size_t convertCodePointToUTF8(int code_point, CharT * out_bytes, size_t out_length)
{
static const Poco::UTF8Encoding utf8;
@ -84,7 +85,8 @@ size_t convertCodePointToUTF8(int code_point, CharT * out_bytes, size_t out_leng
return res;
}
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
template <typename CharT>
requires (sizeof(CharT) == 1)
std::optional<uint32_t> convertUTF8ToCodePoint(const CharT * in_bytes, size_t in_length)
{
static const Poco::UTF8Encoding utf8;

View File

@ -13,6 +13,9 @@
#cmakedefine01 USE_CASSANDRA
#cmakedefine01 USE_SENTRY
#cmakedefine01 USE_GRPC
#cmakedefine01 USE_SIMDJSON
#cmakedefine01 USE_RAPIDJSON
#cmakedefine01 USE_DATASKETCHES
#cmakedefine01 USE_YAML_CPP
#cmakedefine01 CLICKHOUSE_SPLIT_BINARY

View File

@ -25,7 +25,8 @@ namespace DB
* In the rest, behaves like a dynamic_cast.
*/
template <typename To, typename From>
std::enable_if_t<std::is_reference_v<To>, To> typeid_cast(From & from)
requires std::is_reference_v<To>
To typeid_cast(From & from)
{
try
{
@ -43,7 +44,8 @@ std::enable_if_t<std::is_reference_v<To>, To> typeid_cast(From & from)
template <typename To, typename From>
std::enable_if_t<std::is_pointer_v<To>, To> typeid_cast(From * from)
requires std::is_pointer_v<To>
To typeid_cast(From * from)
{
try
{
@ -60,7 +62,8 @@ std::enable_if_t<std::is_pointer_v<To>, To> typeid_cast(From * from)
template <typename To, typename From>
std::enable_if_t<is_shared_ptr_v<To>, To> typeid_cast(const std::shared_ptr<From> & from)
requires is_shared_ptr_v<To>
To typeid_cast(const std::shared_ptr<From> & from)
{
try
{

View File

@ -726,18 +726,6 @@ void convertToFullIfSparse(Block & block)
column.column = recursiveRemoveSparse(column.column);
}
ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & column)
{
auto current_column = block.getByName(column.getNameInStorage()).column;
current_column = current_column->decompress();
if (column.isSubcolumn())
return column.getTypeInStorage()->getSubcolumn(column.getSubcolumnName(), current_column);
return current_column;
}
Block materializeBlock(const Block & block)
{
if (!block)

View File

@ -196,10 +196,6 @@ void getBlocksDifference(const Block & lhs, const Block & rhs, std::string & out
void convertToFullIfSparse(Block & block);
/// Helps in-memory storages to extract columns from block.
/// Properly handles cases, when column is a subcolumn and when it is compressed.
ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & column);
/// Converts columns-constants to full columns ("materializes" them).
Block materializeBlock(const Block & block);
void materializeBlockInplace(Block & block);

View File

@ -115,8 +115,8 @@ private:
}
template <typename T, typename U>
static std::enable_if_t<is_decimal<T> && is_decimal<U>, Shift>
getScales(const DataTypePtr & left_type, const DataTypePtr & right_type)
requires is_decimal<T> && is_decimal<U>
static Shift getScales(const DataTypePtr & left_type, const DataTypePtr & right_type)
{
const DataTypeDecimalBase<T> * decimal0 = checkDecimalBase<T>(*left_type);
const DataTypeDecimalBase<U> * decimal1 = checkDecimalBase<U>(*right_type);
@ -137,8 +137,8 @@ private:
}
template <typename T, typename U>
static std::enable_if_t<is_decimal<T> && !is_decimal<U>, Shift>
getScales(const DataTypePtr & left_type, const DataTypePtr &)
requires is_decimal<T> && (!is_decimal<U>)
static Shift getScales(const DataTypePtr & left_type, const DataTypePtr &)
{
Shift shift;
const DataTypeDecimalBase<T> * decimal0 = checkDecimalBase<T>(*left_type);
@ -148,8 +148,8 @@ private:
}
template <typename T, typename U>
static std::enable_if_t<!is_decimal<T> && is_decimal<U>, Shift>
getScales(const DataTypePtr &, const DataTypePtr & right_type)
requires (!is_decimal<T>) && is_decimal<U>
static Shift getScales(const DataTypePtr &, const DataTypePtr & right_type)
{
Shift shift;
const DataTypeDecimalBase<U> * decimal1 = checkDecimalBase<U>(*right_type);

View File

@ -99,6 +99,12 @@ inline Field getBinaryValue(UInt8 type, ReadBuffer & buf)
readBinary(value, buf);
return value;
}
case Field::Types::Object:
{
Object value;
readBinary(value, buf);
return value;
}
case Field::Types::AggregateFunctionState:
{
AggregateFunctionStateData value;
@ -208,6 +214,40 @@ void writeText(const Map & x, WriteBuffer & buf)
writeFieldText(Field(x), buf);
}
void readBinary(Object & x, ReadBuffer & buf)
{
size_t size;
readBinary(size, buf);
for (size_t index = 0; index < size; ++index)
{
UInt8 type;
String key;
readBinary(type, buf);
readBinary(key, buf);
x[key] = getBinaryValue(type, buf);
}
}
void writeBinary(const Object & x, WriteBuffer & buf)
{
const size_t size = x.size();
writeBinary(size, buf);
for (const auto & [key, value] : x)
{
const UInt8 type = value.getType();
writeBinary(type, buf);
writeBinary(key, buf);
Field::dispatch([&buf] (const auto & val) { FieldVisitorWriteBinary()(val, buf); }, value);
}
}
void writeText(const Object & x, WriteBuffer & buf)
{
writeFieldText(Field(x), buf);
}
template <typename T>
void readQuoted(DecimalField<T> & x, ReadBuffer & buf)
{

View File

@ -3,6 +3,7 @@
#include <cassert>
#include <vector>
#include <algorithm>
#include <map>
#include <type_traits>
#include <functional>
@ -49,10 +50,22 @@ DEFINE_FIELD_VECTOR(Array);
DEFINE_FIELD_VECTOR(Tuple);
/// An array with the following structure: [(key1, value1), (key2, value2), ...]
DEFINE_FIELD_VECTOR(Map);
DEFINE_FIELD_VECTOR(Map); /// TODO: use map instead of vector.
#undef DEFINE_FIELD_VECTOR
using FieldMap = std::map<String, Field, std::less<String>, AllocatorWithMemoryTracking<std::pair<const String, Field>>>;
#define DEFINE_FIELD_MAP(X) \
struct X : public FieldMap \
{ \
using FieldMap::FieldMap; \
}
DEFINE_FIELD_MAP(Object);
#undef DEFINE_FIELD_MAP
struct AggregateFunctionStateData
{
String name; /// Name with arguments.
@ -219,6 +232,7 @@ template <> struct NearestFieldTypeImpl<String> { using Type = String; };
template <> struct NearestFieldTypeImpl<Array> { using Type = Array; };
template <> struct NearestFieldTypeImpl<Tuple> { using Type = Tuple; };
template <> struct NearestFieldTypeImpl<Map> { using Type = Map; };
template <> struct NearestFieldTypeImpl<Object> { using Type = Object; };
template <> struct NearestFieldTypeImpl<bool> { using Type = UInt64; };
template <> struct NearestFieldTypeImpl<Null> { using Type = Null; };
@ -283,6 +297,7 @@ public:
Map = 26,
UUID = 27,
Bool = 28,
Object = 29,
};
};
@ -472,6 +487,7 @@ public:
case Types::Array: return get<Array>() < rhs.get<Array>();
case Types::Tuple: return get<Tuple>() < rhs.get<Tuple>();
case Types::Map: return get<Map>() < rhs.get<Map>();
case Types::Object: return get<Object>() < rhs.get<Object>();
case Types::Decimal32: return get<DecimalField<Decimal32>>() < rhs.get<DecimalField<Decimal32>>();
case Types::Decimal64: return get<DecimalField<Decimal64>>() < rhs.get<DecimalField<Decimal64>>();
case Types::Decimal128: return get<DecimalField<Decimal128>>() < rhs.get<DecimalField<Decimal128>>();
@ -510,6 +526,7 @@ public:
case Types::Array: return get<Array>() <= rhs.get<Array>();
case Types::Tuple: return get<Tuple>() <= rhs.get<Tuple>();
case Types::Map: return get<Map>() <= rhs.get<Map>();
case Types::Object: return get<Object>() <= rhs.get<Object>();
case Types::Decimal32: return get<DecimalField<Decimal32>>() <= rhs.get<DecimalField<Decimal32>>();
case Types::Decimal64: return get<DecimalField<Decimal64>>() <= rhs.get<DecimalField<Decimal64>>();
case Types::Decimal128: return get<DecimalField<Decimal128>>() <= rhs.get<DecimalField<Decimal128>>();
@ -548,6 +565,7 @@ public:
case Types::Array: return get<Array>() == rhs.get<Array>();
case Types::Tuple: return get<Tuple>() == rhs.get<Tuple>();
case Types::Map: return get<Map>() == rhs.get<Map>();
case Types::Object: return get<Object>() == rhs.get<Object>();
case Types::UInt128: return get<UInt128>() == rhs.get<UInt128>();
case Types::UInt256: return get<UInt256>() == rhs.get<UInt256>();
case Types::Int128: return get<Int128>() == rhs.get<Int128>();
@ -597,6 +615,7 @@ public:
bool value = bool(field.template get<UInt64>());
return f(value);
}
case Types::Object: return f(field.template get<Object>());
case Types::Decimal32: return f(field.template get<DecimalField<Decimal32>>());
case Types::Decimal64: return f(field.template get<DecimalField<Decimal64>>());
case Types::Decimal128: return f(field.template get<DecimalField<Decimal128>>());
@ -713,6 +732,9 @@ private:
case Types::Map:
destroy<Map>();
break;
case Types::Object:
destroy<Object>();
break;
case Types::AggregateFunctionState:
destroy<AggregateFunctionStateData>();
break;
@ -737,26 +759,27 @@ private:
using Row = std::vector<Field>;
template <> struct Field::TypeToEnum<Null> { static const Types::Which value = Types::Null; };
template <> struct Field::TypeToEnum<UInt64> { static const Types::Which value = Types::UInt64; };
template <> struct Field::TypeToEnum<UInt128> { static const Types::Which value = Types::UInt128; };
template <> struct Field::TypeToEnum<UInt256> { static const Types::Which value = Types::UInt256; };
template <> struct Field::TypeToEnum<Int64> { static const Types::Which value = Types::Int64; };
template <> struct Field::TypeToEnum<Int128> { static const Types::Which value = Types::Int128; };
template <> struct Field::TypeToEnum<Int256> { static const Types::Which value = Types::Int256; };
template <> struct Field::TypeToEnum<UUID> { static const Types::Which value = Types::UUID; };
template <> struct Field::TypeToEnum<Float64> { static const Types::Which value = Types::Float64; };
template <> struct Field::TypeToEnum<String> { static const Types::Which value = Types::String; };
template <> struct Field::TypeToEnum<Array> { static const Types::Which value = Types::Array; };
template <> struct Field::TypeToEnum<Tuple> { static const Types::Which value = Types::Tuple; };
template <> struct Field::TypeToEnum<Map> { static const Types::Which value = Types::Map; };
template <> struct Field::TypeToEnum<DecimalField<Decimal32>>{ static const Types::Which value = Types::Decimal32; };
template <> struct Field::TypeToEnum<DecimalField<Decimal64>>{ static const Types::Which value = Types::Decimal64; };
template <> struct Field::TypeToEnum<DecimalField<Decimal128>>{ static const Types::Which value = Types::Decimal128; };
template <> struct Field::TypeToEnum<DecimalField<Decimal256>>{ static const Types::Which value = Types::Decimal256; };
template <> struct Field::TypeToEnum<DecimalField<DateTime64>>{ static const Types::Which value = Types::Decimal64; };
template <> struct Field::TypeToEnum<AggregateFunctionStateData>{ static const Types::Which value = Types::AggregateFunctionState; };
template <> struct Field::TypeToEnum<bool>{ static const Types::Which value = Types::Bool; };
template <> struct Field::TypeToEnum<Null> { static constexpr Types::Which value = Types::Null; };
template <> struct Field::TypeToEnum<UInt64> { static constexpr Types::Which value = Types::UInt64; };
template <> struct Field::TypeToEnum<UInt128> { static constexpr Types::Which value = Types::UInt128; };
template <> struct Field::TypeToEnum<UInt256> { static constexpr Types::Which value = Types::UInt256; };
template <> struct Field::TypeToEnum<Int64> { static constexpr Types::Which value = Types::Int64; };
template <> struct Field::TypeToEnum<Int128> { static constexpr Types::Which value = Types::Int128; };
template <> struct Field::TypeToEnum<Int256> { static constexpr Types::Which value = Types::Int256; };
template <> struct Field::TypeToEnum<UUID> { static constexpr Types::Which value = Types::UUID; };
template <> struct Field::TypeToEnum<Float64> { static constexpr Types::Which value = Types::Float64; };
template <> struct Field::TypeToEnum<String> { static constexpr Types::Which value = Types::String; };
template <> struct Field::TypeToEnum<Array> { static constexpr Types::Which value = Types::Array; };
template <> struct Field::TypeToEnum<Tuple> { static constexpr Types::Which value = Types::Tuple; };
template <> struct Field::TypeToEnum<Map> { static constexpr Types::Which value = Types::Map; };
template <> struct Field::TypeToEnum<Object> { static constexpr Types::Which value = Types::Object; };
template <> struct Field::TypeToEnum<DecimalField<Decimal32>>{ static constexpr Types::Which value = Types::Decimal32; };
template <> struct Field::TypeToEnum<DecimalField<Decimal64>>{ static constexpr Types::Which value = Types::Decimal64; };
template <> struct Field::TypeToEnum<DecimalField<Decimal128>>{ static constexpr Types::Which value = Types::Decimal128; };
template <> struct Field::TypeToEnum<DecimalField<Decimal256>>{ static constexpr Types::Which value = Types::Decimal256; };
template <> struct Field::TypeToEnum<DecimalField<DateTime64>>{ static constexpr Types::Which value = Types::Decimal64; };
template <> struct Field::TypeToEnum<AggregateFunctionStateData>{ static constexpr Types::Which value = Types::AggregateFunctionState; };
template <> struct Field::TypeToEnum<bool>{ static constexpr Types::Which value = Types::Bool; };
template <> struct Field::EnumToType<Field::Types::Null> { using Type = Null; };
template <> struct Field::EnumToType<Field::Types::UInt64> { using Type = UInt64; };
@ -771,6 +794,7 @@ template <> struct Field::EnumToType<Field::Types::String> { using Type = Strin
template <> struct Field::EnumToType<Field::Types::Array> { using Type = Array; };
template <> struct Field::EnumToType<Field::Types::Tuple> { using Type = Tuple; };
template <> struct Field::EnumToType<Field::Types::Map> { using Type = Map; };
template <> struct Field::EnumToType<Field::Types::Object> { using Type = Object; };
template <> struct Field::EnumToType<Field::Types::Decimal32> { using Type = DecimalField<Decimal32>; };
template <> struct Field::EnumToType<Field::Types::Decimal64> { using Type = DecimalField<Decimal64>; };
template <> struct Field::EnumToType<Field::Types::Decimal128> { using Type = DecimalField<Decimal128>; };
@ -931,34 +955,39 @@ class WriteBuffer;
/// It is assumed that all elements of the array have the same type.
void readBinary(Array & x, ReadBuffer & buf);
[[noreturn]] inline void readText(Array &, ReadBuffer &) { throw Exception("Cannot read Array.", ErrorCodes::NOT_IMPLEMENTED); }
[[noreturn]] inline void readQuoted(Array &, ReadBuffer &) { throw Exception("Cannot read Array.", ErrorCodes::NOT_IMPLEMENTED); }
/// It is assumed that all elements of the array have the same type.
/// Also write size and type into buf. UInt64 and Int64 is written in variadic size form
void writeBinary(const Array & x, WriteBuffer & buf);
void writeText(const Array & x, WriteBuffer & buf);
[[noreturn]] inline void writeQuoted(const Array &, WriteBuffer &) { throw Exception("Cannot write Array quoted.", ErrorCodes::NOT_IMPLEMENTED); }
void readBinary(Tuple & x, ReadBuffer & buf);
[[noreturn]] inline void readText(Tuple &, ReadBuffer &) { throw Exception("Cannot read Tuple.", ErrorCodes::NOT_IMPLEMENTED); }
[[noreturn]] inline void readQuoted(Tuple &, ReadBuffer &) { throw Exception("Cannot read Tuple.", ErrorCodes::NOT_IMPLEMENTED); }
void writeBinary(const Tuple & x, WriteBuffer & buf);
void writeText(const Tuple & x, WriteBuffer & buf);
[[noreturn]] inline void writeQuoted(const Tuple &, WriteBuffer &) { throw Exception("Cannot write Tuple quoted.", ErrorCodes::NOT_IMPLEMENTED); }
void readBinary(Map & x, ReadBuffer & buf);
[[noreturn]] inline void readText(Map &, ReadBuffer &) { throw Exception("Cannot read Map.", ErrorCodes::NOT_IMPLEMENTED); }
[[noreturn]] inline void readQuoted(Map &, ReadBuffer &) { throw Exception("Cannot read Map.", ErrorCodes::NOT_IMPLEMENTED); }
void writeBinary(const Map & x, WriteBuffer & buf);
void writeText(const Map & x, WriteBuffer & buf);
[[noreturn]] inline void writeQuoted(const Map &, WriteBuffer &) { throw Exception("Cannot write Map quoted.", ErrorCodes::NOT_IMPLEMENTED); }
void readBinary(Object & x, ReadBuffer & buf);
[[noreturn]] inline void readText(Object &, ReadBuffer &) { throw Exception("Cannot read Object.", ErrorCodes::NOT_IMPLEMENTED); }
[[noreturn]] inline void readQuoted(Object &, ReadBuffer &) { throw Exception("Cannot read Object.", ErrorCodes::NOT_IMPLEMENTED); }
void writeBinary(const Object & x, WriteBuffer & buf);
void writeText(const Object & x, WriteBuffer & buf);
[[noreturn]] inline void writeQuoted(const Object &, WriteBuffer &) { throw Exception("Cannot write Object quoted.", ErrorCodes::NOT_IMPLEMENTED); }
__attribute__ ((noreturn)) inline void writeText(const AggregateFunctionStateData &, WriteBuffer &)
{
// This probably doesn't make any sense, but we have to have it for
@ -977,8 +1006,6 @@ void readQuoted(DecimalField<T> & x, ReadBuffer & buf);
void writeFieldText(const Field & x, WriteBuffer & buf);
[[noreturn]] inline void writeQuoted(const Tuple &, WriteBuffer &) { throw Exception("Cannot write Tuple quoted.", ErrorCodes::NOT_IMPLEMENTED); }
String toString(const Field & x);
}

View File

@ -53,7 +53,8 @@ struct MultiEnum
return bitset;
}
template <typename ValueType, typename = std::enable_if_t<std::is_convertible_v<ValueType, StorageType>>>
template <typename ValueType>
requires std::is_convertible_v<ValueType, StorageType>
void setValue(ValueType new_value)
{
// Can't set value from any enum avoid confusion
@ -66,7 +67,8 @@ struct MultiEnum
return bitset == other.bitset;
}
template <typename ValueType, typename = std::enable_if_t<std::is_convertible_v<ValueType, StorageType>>>
template <typename ValueType>
requires std::is_convertible_v<ValueType, StorageType>
bool operator==(ValueType other) const
{
// Shouldn't be comparable with any enum to avoid confusion
@ -80,13 +82,15 @@ struct MultiEnum
return !(*this == other);
}
template <typename ValueType, typename = std::enable_if_t<std::is_convertible_v<ValueType, StorageType>>>
template <typename ValueType>
requires std::is_convertible_v<ValueType, StorageType>
friend bool operator==(ValueType left, MultiEnum right)
{
return right.operator==(left);
}
template <typename L, typename = typename std::enable_if<!std::is_same_v<L, MultiEnum>>::type>
template <typename L>
requires (!std::is_same_v<L, MultiEnum>)
friend bool operator!=(L left, MultiEnum right)
{
return !(right.operator==(left));

View File

@ -473,6 +473,7 @@ class IColumn;
M(Bool, allow_experimental_geo_types, false, "Allow geo data types such as Point, Ring, Polygon, MultiPolygon", 0) \
M(Bool, data_type_default_nullable, false, "Data types without NULL or NOT NULL will make Nullable", 0) \
M(Bool, cast_keep_nullable, false, "CAST operator keep Nullable for result data type", 0) \
M(Bool, cast_ipv4_ipv6_default_on_conversion_error, false, "CAST operator into IPv4, CAST operator into IPV6 type, toIPv4, toIPv6 functions will return default value instead of throwing exception on conversion error.", 0) \
M(Bool, alter_partition_verbose_result, false, "Output information about affected parts. Currently works only for FREEZE and ATTACH commands.", 0) \
M(Bool, allow_experimental_database_materialized_mysql, false, "Allow to create database with Engine=MaterializedMySQL(...).", 0) \
M(Bool, allow_experimental_database_materialized_postgresql, false, "Allow to create database with Engine=MaterializedPostgreSQL(...).", 0) \
@ -492,6 +493,7 @@ class IColumn;
M(Bool, force_optimize_projection, false, "If projection optimization is enabled, SELECT queries need to use projection", 0) \
M(Bool, async_socket_for_remote, true, "Asynchronously read from socket executing remote query", 0) \
M(Bool, insert_null_as_default, true, "Insert DEFAULT values instead of NULL in INSERT SELECT (UNION ALL)", 0) \
M(Bool, describe_extend_object_types, false, "Deduce concrete type of columns of type Object in DESCRIBE query", 0) \
M(Bool, describe_include_subcolumns, false, "If true, subcolumns of all table columns will be included into result of DESCRIBE query", 0) \
\
M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \
@ -508,6 +510,7 @@ class IColumn;
M(UInt64, database_replicated_initial_query_timeout_sec, 300, "How long initial DDL query should wait for Replicated database to precess previous DDL queue entries", 0) \
M(UInt64, max_distributed_depth, 5, "Maximum distributed query depth", 0) \
M(Bool, database_replicated_always_detach_permanently, false, "Execute DETACH TABLE as DETACH TABLE PERMANENTLY if database engine is Replicated", 0) \
M(Bool, database_replicated_allow_only_replicated_engine, false, "Allow to create only Replicated tables in database with engine Replicated", 0) \
M(DistributedDDLOutputMode, distributed_ddl_output_mode, DistributedDDLOutputMode::THROW, "Format of distributed DDL query result", 0) \
M(UInt64, distributed_ddl_entry_format_version, 1, "Version of DDL entry to write into ZooKeeper", 0) \
\
@ -567,6 +570,7 @@ class IColumn;
/** Experimental functions */ \
M(Bool, allow_experimental_funnel_functions, false, "Enable experimental functions for funnel analysis.", 0) \
M(Bool, allow_experimental_nlp_functions, false, "Enable experimental functions for natural language processing.", 0) \
M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \
M(String, insert_deduplication_token, "", "If not empty, used for duplicate detection instead of data digest", 0) \
// End of COMMON_SETTINGS
// Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS.

View File

@ -87,6 +87,7 @@ enum class TypeIndex
AggregateFunction,
LowCardinality,
Map,
Object,
};
#if !defined(__clang__)
#pragma GCC diagnostic pop

View File

@ -15,6 +15,8 @@
#cmakedefine01 USE_NURAFT
#cmakedefine01 USE_NLP
#cmakedefine01 USE_KRB5
#cmakedefine01 USE_SIMDJSON
#cmakedefine01 USE_RAPIDJSON
#cmakedefine01 USE_FILELOG
#cmakedefine01 USE_ODBC
#cmakedefine01 USE_REPLXX

View File

@ -7,7 +7,8 @@ namespace DB
// Use template to disable implicit casting for certain overloaded types such as Field, which leads
// to overload resolution ambiguity.
class Field;
template <typename T, typename U = std::enable_if_t<std::is_same_v<T, Field>>>
template <typename T>
requires std::is_same_v<T, Field>
std::ostream & operator<<(std::ostream & stream, const T & what);
struct NameAndTypePair;

View File

@ -1,3 +1,5 @@
add_subdirectory (Serializations)
if (ENABLE_EXAMPLES)
add_subdirectory(examples)
add_subdirectory (examples)
endif ()

View File

@ -213,6 +213,7 @@ DataTypeFactory::DataTypeFactory()
registerDataTypeDomainSimpleAggregateFunction(*this);
registerDataTypeDomainGeo(*this);
registerDataTypeMap(*this);
registerDataTypeObject(*this);
}
DataTypeFactory & DataTypeFactory::instance()

View File

@ -87,5 +87,6 @@ void registerDataTypeDomainIPv4AndIPv6(DataTypeFactory & factory);
void registerDataTypeDomainBool(DataTypeFactory & factory);
void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory);
void registerDataTypeDomainGeo(DataTypeFactory & factory);
void registerDataTypeObject(DataTypeFactory & factory);
}

View File

@ -0,0 +1,83 @@
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/Serializations/SerializationObject.h>
#include <Parsers/IAST.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTFunction.h>
#include <IO/Operators.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int UNEXPECTED_AST_STRUCTURE;
}
/// The schema format name is normalized to lower case, so comparisons in
/// equals() and the serialization lookup are effectively case-insensitive.
/// Note: default_serialization is initialized from the already-lowercased
/// schema_format member (member init order: schema_format comes first).
DataTypeObject::DataTypeObject(const String & schema_format_, bool is_nullable_)
    : schema_format(Poco::toLower(schema_format_))
    , is_nullable(is_nullable_)
    , default_serialization(getObjectSerialization(schema_format))
{
}
/// Two Object types are equal iff the other side is also DataTypeObject
/// with the same (lower-cased) schema format and the same nullability.
bool DataTypeObject::equals(const IDataType & rhs) const
{
    const auto * other = typeid_cast<const DataTypeObject *>(&rhs);
    if (!other)
        return false;

    return schema_format == other->schema_format
        && is_nullable == other->is_nullable;
}
/// The serialization was chosen once in the constructor based on schema_format.
SerializationPtr DataTypeObject::doGetDefaultSerialization() const
{
    return default_serialization;
}
/// Renders the type name, e.g. Object('json') or Object(Nullable('json')).
/// The `quote` manipulator single-quotes the schema format string.
String DataTypeObject::doGetName() const
{
    WriteBufferFromOwnString name_buf;
    if (is_nullable)
    {
        name_buf << "Object(Nullable(" << quote << schema_format << "))";
    }
    else
    {
        name_buf << "Object(" << quote << schema_format << ")";
    }
    return name_buf.str();
}
/// Parses the AST arguments of Object(...):
///   Object('<schema_format>')           -> non-nullable subcolumns
///   Object(Nullable('<schema_format>')) -> nullable subcolumns
/// @throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH if there is not exactly one argument.
/// @throws UNEXPECTED_AST_STRUCTURE if the argument is neither a string literal
///         nor Nullable(<string literal>).
static DataTypePtr create(const ASTPtr & arguments)
{
    if (!arguments || arguments->children.size() != 1)
        throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
            "Object data type family must have one argument - name of schema format");

    ASTPtr schema_argument = arguments->children[0];
    bool is_nullable = false;

    if (const auto * func = schema_argument->as<ASTFunction>())
    {
        /// `func->arguments` may be null for a function written without
        /// parentheses, so it must be checked before dereferencing.
        /// The message previously lacked a '{}' placeholder, so the passed
        /// function name was silently dropped from the error text.
        if (func->name != "Nullable" || !func->arguments || func->arguments->children.size() != 1)
            throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE,
                "Expected 'Nullable(<schema_name>)' as parameter for type Object, got '{}'", func->name);

        schema_argument = func->arguments->children[0];
        is_nullable = true;
    }

    const auto * literal = schema_argument->as<ASTLiteral>();
    if (!literal || literal->value.getType() != Field::Types::String)
        throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE,
            "Object data type family must have a const string as its schema name parameter");

    return std::make_shared<DataTypeObject>(literal->value.get<const String &>(), is_nullable);
}
/// Registers the parametric Object(<schema>) type and the JSON alias,
/// which maps (case-insensitively) to Object("JSON") without nullable subcolumns.
void registerDataTypeObject(DataTypeFactory & factory)
{
    factory.registerDataType("Object", create);
    factory.registerSimpleDataType("JSON",
        [] { return std::make_shared<DataTypeObject>("JSON", false); },
        DataTypeFactory::CaseInsensitive);
}
}

View File

@ -0,0 +1,46 @@
#pragma once
#include <DataTypes/IDataType.h>
#include <Core/Field.h>
#include <Columns/ColumnObject.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// Parametric data type Object(<schema_format>), optionally with
/// Nullable subcolumns: Object(Nullable(<schema_format>)).
/// Backed by ColumnObject; the JSON type name is registered as an alias.
class DataTypeObject : public IDataType
{
private:
    String schema_format;                    /// Schema format name (lower-cased in the constructor).
    bool is_nullable;                        /// True for Object(Nullable(...)).
    SerializationPtr default_serialization;  /// Chosen once per schema_format.

public:
    DataTypeObject(const String & schema_format_, bool is_nullable_);

    const char * getFamilyName() const override { return "Object"; }
    String doGetName() const override;
    TypeIndex getTypeId() const override { return TypeIndex::Object; }

    MutableColumnPtr createColumn() const override { return ColumnObject::create(is_nullable); }

    /// There is no meaningful static default value for Object.
    Field getDefault() const override
    {
        throw Exception("Method getDefault() is not implemented for data type " + getName(), ErrorCodes::NOT_IMPLEMENTED);
    }

    bool haveSubtypes() const override { return false; }
    bool equals(const IDataType & rhs) const override;
    bool isParametric() const override { return true; }
    SerializationPtr doGetDefaultSerialization() const override;

    /// NOTE(review): presumably tells whether dynamically inferred subcolumns
    /// are wrapped in Nullable — confirm against ColumnObject usage.
    bool hasNullableSubcolumns() const { return is_nullable; }
};
}

View File

@ -1,6 +1,7 @@
#include <DataTypes/FieldToDataType.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeString.h>
@ -108,12 +109,11 @@ DataTypePtr FieldToDataType::operator() (const Array & x) const
element_types.reserve(x.size());
for (const Field & elem : x)
element_types.emplace_back(applyVisitor(FieldToDataType(), elem));
element_types.emplace_back(applyVisitor(FieldToDataType(allow_convertion_to_string), elem));
return std::make_shared<DataTypeArray>(getLeastSupertype(element_types));
return std::make_shared<DataTypeArray>(getLeastSupertype(element_types, allow_convertion_to_string));
}
DataTypePtr FieldToDataType::operator() (const Tuple & tuple) const
{
if (tuple.empty())
@ -123,7 +123,7 @@ DataTypePtr FieldToDataType::operator() (const Tuple & tuple) const
element_types.reserve(tuple.size());
for (const auto & element : tuple)
element_types.push_back(applyVisitor(FieldToDataType(), element));
element_types.push_back(applyVisitor(FieldToDataType(allow_convertion_to_string), element));
return std::make_shared<DataTypeTuple>(element_types);
}
@ -139,11 +139,19 @@ DataTypePtr FieldToDataType::operator() (const Map & map) const
{
const auto & tuple = elem.safeGet<const Tuple &>();
assert(tuple.size() == 2);
key_types.push_back(applyVisitor(FieldToDataType(), tuple[0]));
value_types.push_back(applyVisitor(FieldToDataType(), tuple[1]));
key_types.push_back(applyVisitor(FieldToDataType(allow_convertion_to_string), tuple[0]));
value_types.push_back(applyVisitor(FieldToDataType(allow_convertion_to_string), tuple[1]));
}
return std::make_shared<DataTypeMap>(getLeastSupertype(key_types), getLeastSupertype(value_types));
return std::make_shared<DataTypeMap>(
getLeastSupertype(key_types, allow_convertion_to_string),
getLeastSupertype(value_types, allow_convertion_to_string));
}
/// An Object field always maps to non-nullable Object('json'),
/// regardless of the field's contents.
DataTypePtr FieldToDataType::operator() (const Object &) const
{
    /// TODO: Do we need different parameters for type Object?
    return std::make_shared<DataTypeObject>("json", false);
}
DataTypePtr FieldToDataType::operator() (const AggregateFunctionStateData & x) const

View File

@ -20,26 +20,34 @@ using DataTypePtr = std::shared_ptr<const IDataType>;
class FieldToDataType : public StaticVisitor<DataTypePtr>
{
public:
FieldToDataType(bool allow_convertion_to_string_ = false)
: allow_convertion_to_string(allow_convertion_to_string_)
{
}
DataTypePtr operator() (const Null & x) const;
DataTypePtr operator() (const UInt64 & x) const;
DataTypePtr operator() (const UInt128 & x) const;
DataTypePtr operator() (const UInt256 & x) const;
DataTypePtr operator() (const Int64 & x) const;
DataTypePtr operator() (const Int128 & x) const;
DataTypePtr operator() (const Int256 & x) const;
DataTypePtr operator() (const UUID & x) const;
DataTypePtr operator() (const Float64 & x) const;
DataTypePtr operator() (const String & x) const;
DataTypePtr operator() (const Array & x) const;
DataTypePtr operator() (const Tuple & tuple) const;
DataTypePtr operator() (const Map & map) const;
DataTypePtr operator() (const Object & map) const;
DataTypePtr operator() (const DecimalField<Decimal32> & x) const;
DataTypePtr operator() (const DecimalField<Decimal64> & x) const;
DataTypePtr operator() (const DecimalField<Decimal128> & x) const;
DataTypePtr operator() (const DecimalField<Decimal256> & x) const;
DataTypePtr operator() (const AggregateFunctionStateData & x) const;
DataTypePtr operator() (const UInt256 & x) const;
DataTypePtr operator() (const Int256 & x) const;
DataTypePtr operator() (const bool & x) const;
private:
bool allow_convertion_to_string;
};
}

View File

@ -126,19 +126,25 @@ DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const
DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const
{
SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr };
return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type);
return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type, true);
}
SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const
ColumnPtr IDataType::tryGetSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const
{
SubstreamData data = { serialization, nullptr, nullptr, nullptr };
return getForSubcolumn<SerializationPtr>(subcolumn_name, data, &SubstreamData::serialization);
SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr };
return getForSubcolumn<ColumnPtr>(subcolumn_name, data, &SubstreamData::column, false);
}
ColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const
{
SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr };
return getForSubcolumn<ColumnPtr>(subcolumn_name, data, &SubstreamData::column);
return getForSubcolumn<ColumnPtr>(subcolumn_name, data, &SubstreamData::column, true);
}
SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const
{
SubstreamData data = { serialization, nullptr, nullptr, nullptr };
return getForSubcolumn<SerializationPtr>(subcolumn_name, data, &SubstreamData::serialization, true);
}
Names IDataType::getSubcolumnNames() const

View File

@ -82,9 +82,11 @@ public:
DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const;
DataTypePtr getSubcolumnType(const String & subcolumn_name) const;
SerializationPtr getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const;
ColumnPtr tryGetSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const;
ColumnPtr getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const;
SerializationPtr getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const;
using SubstreamData = ISerialization::SubstreamData;
using SubstreamPath = ISerialization::SubstreamPath;
@ -309,7 +311,7 @@ private:
const String & subcolumn_name,
const SubstreamData & data,
Ptr SubstreamData::*member,
bool throw_if_null = true) const;
bool throw_if_null) const;
};
@ -373,11 +375,13 @@ struct WhichDataType
constexpr bool isMap() const {return idx == TypeIndex::Map; }
constexpr bool isSet() const { return idx == TypeIndex::Set; }
constexpr bool isInterval() const { return idx == TypeIndex::Interval; }
constexpr bool isObject() const { return idx == TypeIndex::Object; }
constexpr bool isNothing() const { return idx == TypeIndex::Nothing; }
constexpr bool isNullable() const { return idx == TypeIndex::Nullable; }
constexpr bool isFunction() const { return idx == TypeIndex::Function; }
constexpr bool isAggregateFunction() const { return idx == TypeIndex::AggregateFunction; }
constexpr bool isSimple() const { return isInt() || isUInt() || isFloat() || isString(); }
constexpr bool isLowCarnality() const { return idx == TypeIndex::LowCardinality; }
};
@ -399,10 +403,16 @@ inline bool isEnum(const DataTypePtr & data_type) { return WhichDataType(data_ty
inline bool isDecimal(const DataTypePtr & data_type) { return WhichDataType(data_type).isDecimal(); }
inline bool isTuple(const DataTypePtr & data_type) { return WhichDataType(data_type).isTuple(); }
inline bool isArray(const DataTypePtr & data_type) { return WhichDataType(data_type).isArray(); }
inline bool isMap(const DataTypePtr & data_type) { return WhichDataType(data_type).isMap(); }
inline bool isMap(const DataTypePtr & data_type) {return WhichDataType(data_type).isMap(); }
inline bool isNothing(const DataTypePtr & data_type) { return WhichDataType(data_type).isNothing(); }
inline bool isUUID(const DataTypePtr & data_type) { return WhichDataType(data_type).isUUID(); }
template <typename T>
inline bool isObject(const T & data_type)
{
return WhichDataType(data_type).isObject();
}
template <typename T>
inline bool isUInt8(const T & data_type)
{

View File

@ -30,6 +30,12 @@ namespace Nested
/// Joins nested table and field names with a dot.
/// If either part is empty, the other part is returned unchanged.
std::string concatenateName(const std::string & nested_table_name, const std::string & nested_field_name)
{
    if (nested_table_name.empty() || nested_field_name.empty())
        return nested_table_name.empty() ? nested_field_name : nested_table_name;

    return nested_table_name + "." + nested_field_name;
}

View File

@ -0,0 +1,703 @@
#include <DataTypes/ObjectUtils.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeNested.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/getLeastSupertype.h>
#include <DataTypes/NestedUtils.h>
#include <Columns/ColumnObject.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnNullable.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTFunction.h>
#include <IO/Operators.h>
namespace DB
{
namespace ErrorCodes
{
extern const int TYPE_MISMATCH;
extern const int LOGICAL_ERROR;
extern const int DUPLICATE_COLUMN;
}
/// Number of nested Array dimensions of a type; 0 for non-array types.
size_t getNumberOfDimensions(const IDataType & type)
{
    const auto * type_array = typeid_cast<const DataTypeArray *>(&type);
    return type_array ? type_array->getNumberOfDimensions() : 0;
}
/// Number of nested Array dimensions of a column; 0 for non-array columns.
size_t getNumberOfDimensions(const IColumn & column)
{
    const auto * column_array = checkAndGetColumn<ColumnArray>(column);
    return column_array ? column_array->getNumberOfDimensions() : 0;
}
/// Strips all Array layers and returns the innermost element type;
/// returns `type` itself when it is not an array type.
DataTypePtr getBaseTypeOfArray(const DataTypePtr & type)
{
    /// Walk raw pointers to avoid refcount churn on intermediate type pointers.
    const DataTypeArray * innermost_array = nullptr;
    const IDataType * current = type.get();

    while (true)
    {
        const auto * as_array = typeid_cast<const DataTypeArray *>(current);
        if (!as_array)
            break;
        innermost_array = as_array;
        current = as_array->getNestedType().get();
    }

    return innermost_array ? innermost_array->getNestedType() : type;
}
/// Strips all ColumnArray layers and returns the innermost data column;
/// returns `column` itself when it is not an array column.
/// Mirrors getBaseTypeOfArray for the column side.
ColumnPtr getBaseColumnOfArray(const ColumnPtr & column)
{
    /// Get raw pointers to avoid extra copying of column pointers.
    const ColumnArray * last_array = nullptr;
    const auto * current_column = column.get();
    while (const auto * column_array = checkAndGetColumn<ColumnArray>(current_column))
    {
        current_column = &column_array->getData();
        last_array = column_array;
    }

    return last_array ? last_array->getDataPtr() : column;
}
/// Wraps `type` into `num_dimensions` layers of Array; 0 dimensions returns it unchanged.
DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions)
{
    DataTypePtr result = std::move(type);
    for (size_t dim = 0; dim != num_dimensions; ++dim)
        result = std::make_shared<DataTypeArray>(std::move(result));
    return result;
}
/// Wraps `column` into `num_dimensions` layers of ColumnArray; 0 dimensions returns it unchanged.
ColumnPtr createArrayOfColumn(ColumnPtr column, size_t num_dimensions)
{
    ColumnPtr result = std::move(column);
    for (size_t dim = 0; dim != num_dimensions; ++dim)
        result = ColumnArray::create(result);
    return result;
}
/// Builds a Field value of `num_dimensions` nested empty arrays,
/// e.g. for 3 dimensions: [[[]]] minus the outermost brackets, i.e. [[ ]].
/// @throws LOGICAL_ERROR when num_dimensions == 0.
Array createEmptyArrayField(size_t num_dimensions)
{
    if (num_dimensions == 0)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create array field with 0 dimensions");

    Array result;
    Array * innermost = &result;

    /// The outermost level is `result` itself, so only num_dimensions - 1
    /// inner arrays need to be nested inside it.
    for (size_t depth = 1; depth < num_dimensions; ++depth)
    {
        innermost->push_back(Array());
        innermost = &innermost->back().get<Array &>();
    }

    return result;
}
/// Deduces a data type from a column: "simple" scalar columns are resolved
/// by the name of their TypeIndex enum value; Array and Nullable wrappers
/// recurse into the nested column.
/// @throws LOGICAL_ERROR for column kinds not covered here.
DataTypePtr getDataTypeByColumn(const IColumn & column)
{
    auto idx = column.getDataType();
    /// For simple types the TypeIndex enum name (e.g. "UInt64") matches the type name.
    if (WhichDataType(idx).isSimple())
        return DataTypeFactory::instance().get(String(magic_enum::enum_name(idx)));

    if (const auto * column_array = checkAndGetColumn<ColumnArray>(&column))
        return std::make_shared<DataTypeArray>(getDataTypeByColumn(column_array->getData()));

    if (const auto * column_nullable = checkAndGetColumn<ColumnNullable>(&column))
        return makeNullable(getDataTypeByColumn(column_nullable->getNestedColumn()));

    /// TODO: add more types.
    throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get data type of column {}", column.getFamilyName());
}
/// Collects the I-th element of every tuple in @vec into its own vector.
template <size_t I, typename Tuple>
static auto extractVector(const std::vector<Tuple> & vec)
{
    static_assert(I < std::tuple_size_v<Tuple>);
    std::vector<std::tuple_element_t<I, Tuple>> extracted;
    extracted.reserve(vec.size());
    std::transform(vec.begin(), vec.end(), std::back_inserter(extracted),
        [](const auto & element) { return std::get<I>(element); });
    return extracted;
}
/// Converts Object columns in @columns_list and @block to Tuple columns built
/// from their finalized subcolumns, and checks that the constructed Tuple types
/// are compatible with the corresponding types in @extended_storage_columns.
void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, const NamesAndTypesList & extended_storage_columns)
{
    /// Index storage columns by name for the compatibility check below.
    std::unordered_map<String, DataTypePtr> storage_columns_map;
    for (const auto & [name, type] : extended_storage_columns)
        storage_columns_map[name] = type;
    for (auto & name_type : columns_list)
    {
        if (!isObject(name_type.type))
            continue;
        auto & column = block.getByName(name_type.name);
        if (!isObject(column.type))
            throw Exception(ErrorCodes::TYPE_MISMATCH,
                "Type for column '{}' mismatch in columns list and in block. In list: {}, in block: {}",
                name_type.name, name_type.type->getName(), column.type->getName());
        const auto & column_object = assert_cast<const ColumnObject &>(*column.column);
        const auto & subcolumns = column_object.getSubcolumns();
        /// Only a finalized column exposes the least common type
        /// and the finalized column of each subcolumn.
        if (!column_object.isFinalized())
            throw Exception(ErrorCodes::LOGICAL_ERROR,
                "Cannot convert to tuple column '{}' from type {}. Column should be finalized first",
                name_type.name, name_type.type->getName());
        /// Gather one (path, type, column) triple per subcolumn.
        PathsInData tuple_paths;
        DataTypes tuple_types;
        Columns tuple_columns;
        for (const auto & entry : subcolumns)
        {
            tuple_paths.emplace_back(entry->path);
            tuple_types.emplace_back(entry->data.getLeastCommonType());
            tuple_columns.emplace_back(entry->data.getFinalizedColumnPtr());
        }
        auto it = storage_columns_map.find(name_type.name);
        if (it == storage_columns_map.end())
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' not found in storage", name_type.name);
        /// Replace both the block's column/type and the list entry with the Tuple.
        std::tie(column.column, column.type) = unflattenTuple(tuple_paths, tuple_types, tuple_columns);
        name_type.type = column.type;
        /// Check that constructed Tuple type and type in storage are compatible.
        getLeastCommonTypeForObject({column.type, it->second}, true);
    }
}
/// Returns true if all keys of @prefix match the leading keys of @parts.
static bool isPrefix(const PathInData::Parts & prefix, const PathInData::Parts & parts)
{
    if (prefix.size() > parts.size())
        return false;
    return std::equal(prefix.begin(), prefix.end(), parts.begin(),
        [](const auto & lhs, const auto & rhs) { return lhs.key == rhs.key; });
}
void checkObjectHasNoAmbiguosPaths(const PathsInData & paths)
{
size_t size = paths.size();
for (size_t i = 0; i < size; ++i)
{
for (size_t j = 0; j < i; ++j)
{
if (isPrefix(paths[i].getParts(), paths[j].getParts())
|| isPrefix(paths[j].getParts(), paths[i].getParts()))
throw Exception(ErrorCodes::DUPLICATE_COLUMN,
"Data in Object has ambiguous paths: '{}' and '{}'",
paths[i].getPath(), paths[j].getPath());
}
}
}
/// Receives several Tuple types (representations of Object columns)
/// and deduces the least common type among them.
/// If @check_ambiguos_paths is set, throws if one resulting path is a prefix of another.
DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool check_ambiguos_paths)
{
    if (types.empty())
        return nullptr;
    /// Fast path: if all types are equal, no merging is required.
    bool all_equal = true;
    for (size_t i = 1; i < types.size(); ++i)
    {
        if (!types[i]->equals(*types[0]))
        {
            all_equal = false;
            break;
        }
    }
    if (all_equal)
        return types[0];
    /// Types of subcolumns by path from all tuples.
    std::unordered_map<PathInData, DataTypes, PathInData::Hash> subcolumns_types;
    /// First we flatten tuples, then get common type for paths
    /// and finally unflatten paths and create new tuple type.
    for (const auto & type : types)
    {
        const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type.get());
        if (!type_tuple)
            throw Exception(ErrorCodes::LOGICAL_ERROR,
                "Least common type for object can be deduced only from tuples, but {} given", type->getName());
        auto [tuple_paths, tuple_types] = flattenTuple(type);
        assert(tuple_paths.size() == tuple_types.size());
        for (size_t i = 0; i < tuple_paths.size(); ++i)
            subcolumns_types[tuple_paths[i]].push_back(tuple_types[i]);
    }
    PathsInData tuple_paths;
    DataTypes tuple_types;
    /// Get the least common type for all paths.
    for (const auto & [key, subtypes] : subcolumns_types)
    {
        assert(!subtypes.empty());
        /// The dummy subcolumn (used for Object columns with no data)
        /// does not contribute to the result.
        if (key.getPath() == ColumnObject::COLUMN_NAME_DUMMY)
            continue;
        /// All types of one path must have the same number of array dimensions,
        /// otherwise the subcolumn cannot be represented by a single Array type.
        size_t first_dim = getNumberOfDimensions(*subtypes[0]);
        for (size_t i = 1; i < subtypes.size(); ++i)
            if (first_dim != getNumberOfDimensions(*subtypes[i]))
                throw Exception(ErrorCodes::TYPE_MISMATCH,
                    "Uncompatible types of subcolumn '{}': {} and {}",
                    key.getPath(), subtypes[0]->getName(), subtypes[i]->getName());
        tuple_paths.emplace_back(key);
        tuple_types.emplace_back(getLeastSupertype(subtypes, /*allow_conversion_to_string=*/ true));
    }
    /// If every path was the dummy one, keep a dummy subcolumn
    /// so the resulting Tuple is not empty.
    if (tuple_paths.empty())
    {
        tuple_paths.emplace_back(ColumnObject::COLUMN_NAME_DUMMY);
        tuple_types.emplace_back(std::make_shared<DataTypeUInt8>());
    }
    if (check_ambiguos_paths)
        checkObjectHasNoAmbiguosPaths(tuple_paths);
    return unflattenTuple(tuple_paths, tuple_types);
}
/// Returns the names of all columns in @columns_list that have Object type.
NameSet getNamesOfObjectColumns(const NamesAndTypesList & columns_list)
{
    NameSet object_names;
    for (const auto & column : columns_list)
        if (isObject(column.type))
            object_names.insert(column.name);
    return object_names;
}
bool hasObjectColumns(const ColumnsDescription & columns)
{
return std::any_of(columns.begin(), columns.end(), [](const auto & column) { return isObject(column.type); });
}
/// Replaces types of Object columns in @columns_list with the types stored in
/// @object_columns and, if @with_subcolumns is set, appends all subcolumns
/// of those columns to the end of the list.
void extendObjectColumns(NamesAndTypesList & columns_list, const ColumnsDescription & object_columns, bool with_subcolumns)
{
    NamesAndTypesList subcolumns_list;
    for (auto & column : columns_list)
    {
        auto object_column = object_columns.tryGetColumn(GetColumnsOptions::All, column.name);
        if (object_column)
        {
            column.type = object_column->type;
            if (with_subcolumns)
                subcolumns_list.splice(subcolumns_list.end(), object_columns.getSubcolumns(column.name));
        }
    }
    /// Append collected subcolumns after all the top-level columns.
    columns_list.splice(columns_list.end(), std::move(subcolumns_list));
}
/// Updates types in @object_columns inplace: for every column that also appears
/// in @new_columns with a different type, widens the stored type to the least
/// common type of the old and the new one.
void updateObjectColumns(ColumnsDescription & object_columns, const NamesAndTypesList & new_columns)
{
    for (const auto & new_column : new_columns)
    {
        auto object_column = object_columns.tryGetColumn(GetColumnsOptions::All, new_column.name);
        if (object_column && !object_column->type->equals(*new_column.type))
        {
            object_columns.modify(new_column.name, [&](auto & column)
            {
                column.type = getLeastCommonTypeForObject({object_column->type, new_column.type});
            });
        }
    }
}
namespace
{
/// Appends to @new_paths and @new_types all leaf paths and types of @type.
/// @builder accumulates the path of the currently visited element.
void flattenTupleImpl(
    PathInDataBuilder & builder,
    DataTypePtr type,
    std::vector<PathInData::Parts> & new_paths,
    DataTypes & new_types)
{
    if (const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type.get()))
    {
        /// Descend into every tuple element under its name.
        const auto & tuple_names = type_tuple->getElementNames();
        const auto & tuple_types = type_tuple->getElements();
        for (size_t i = 0; i < tuple_names.size(); ++i)
        {
            builder.append(tuple_names[i], false);
            flattenTupleImpl(builder, tuple_types[i], new_paths, new_types);
            builder.popBack();
        }
    }
    else if (const auto * type_array = typeid_cast<const DataTypeArray *>(type.get()))
    {
        /// Flatten the nested type separately, then re-append its paths
        /// as array elements and wrap each resulting type back into Array.
        PathInDataBuilder element_builder;
        std::vector<PathInData::Parts> element_paths;
        DataTypes element_types;
        flattenTupleImpl(element_builder, type_array->getNestedType(), element_paths, element_types);
        assert(element_paths.size() == element_types.size());
        for (size_t i = 0; i < element_paths.size(); ++i)
        {
            builder.append(element_paths[i], true);
            new_paths.emplace_back(builder.getParts());
            new_types.emplace_back(std::make_shared<DataTypeArray>(element_types[i]));
            builder.popBack(element_paths[i].size());
        }
    }
    else
    {
        /// Scalar leaf: record the accumulated path and the type itself.
        new_paths.emplace_back(builder.getParts());
        new_types.emplace_back(type);
    }
}
/// Appends to @new_columns the leaf columns of @column (a possibly nested Tuple).
/// @offsets_columns are used as stack of array offsets and allows to recreate Array columns.
void flattenTupleImpl(const ColumnPtr & column, Columns & new_columns, Columns & offsets_columns)
{
    if (const auto * column_tuple = checkAndGetColumn<ColumnTuple>(column.get()))
    {
        /// Flatten every tuple element with the same offsets stack.
        const auto & subcolumns = column_tuple->getColumns();
        for (const auto & subcolumn : subcolumns)
            flattenTupleImpl(subcolumn, new_columns, offsets_columns);
    }
    else if (const auto * column_array = checkAndGetColumn<ColumnArray>(column.get()))
    {
        /// Push this array's offsets for the duration of the descent.
        offsets_columns.push_back(column_array->getOffsetsPtr());
        flattenTupleImpl(column_array->getDataPtr(), new_columns, offsets_columns);
        offsets_columns.pop_back();
    }
    else
    {
        if (!offsets_columns.empty())
        {
            /// Rewrap the scalar column into Arrays using the collected
            /// offsets, innermost level first.
            auto new_column = ColumnArray::create(column, offsets_columns.back());
            for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it)
                new_column = ColumnArray::create(new_column, *it);
            new_columns.push_back(std::move(new_column));
        }
        else
        {
            new_columns.push_back(column);
        }
    }
}
/// Strips the requested number of Array levels from the type.
/// Throws LOGICAL_ERROR if the type has fewer dimensions than requested.
DataTypePtr reduceNumberOfDimensions(DataTypePtr type, size_t dimensions_to_reduce)
{
    for (size_t dimension = 0; dimension < dimensions_to_reduce; ++dimension)
    {
        const auto * as_array = typeid_cast<const DataTypeArray *>(type.get());
        if (!as_array)
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Not enough dimensions to reduce");
        type = as_array->getNestedType();
    }
    return type;
}
/// Strips the requested number of Array levels from the column.
/// Throws LOGICAL_ERROR if the column has fewer dimensions than requested.
ColumnPtr reduceNumberOfDimensions(ColumnPtr column, size_t dimensions_to_reduce)
{
    for (size_t dimension = 0; dimension < dimensions_to_reduce; ++dimension)
    {
        const auto * as_array = typeid_cast<const ColumnArray *>(column.get());
        if (!as_array)
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Not enough dimensions to reduce");
        column = as_array->getDataPtr();
    }
    return column;
}
/// We save intermediate column, type and number of array
/// dimensions for each intermediate node in path in subcolumns tree.
struct ColumnWithTypeAndDimensions
{
    ColumnPtr column;
    DataTypePtr type;
    /// Number of array dimensions consumed at this node
    /// (1 for Nested nodes, otherwise the anonymous array level of the path part).
    size_t array_dimensions;
};
using SubcolumnsTreeWithColumns = SubcolumnsTree<ColumnWithTypeAndDimensions>;
using Node = SubcolumnsTreeWithColumns::Node;
/// Creates data type and column from tree of subcolumns.
ColumnWithTypeAndDimensions createTypeFromNode(const Node * node)
{
    /// Recursively creates (name, column-with-type) tuples for all
    /// children and sorts them by name.
    auto collect_tuple_elemets = [](const auto & children)
    {
        std::vector<std::tuple<String, ColumnWithTypeAndDimensions>> tuple_elements;
        tuple_elements.reserve(children.size());
        for (const auto & [name, child] : children)
        {
            auto column = createTypeFromNode(child.get());
            tuple_elements.emplace_back(name, std::move(column));
        }
        /// Sort to always create the same type for the same set of subcolumns.
        std::sort(tuple_elements.begin(), tuple_elements.end(),
            [](const auto & lhs, const auto & rhs) { return std::get<0>(lhs) < std::get<0>(rhs); });
        auto tuple_names = extractVector<0>(tuple_elements);
        auto tuple_columns = extractVector<1>(tuple_elements);
        return std::make_tuple(std::move(tuple_names), std::move(tuple_columns));
    };
    if (node->kind == Node::SCALAR)
    {
        /// Leaf: stored column, type and dimensions are returned as-is.
        return node->data;
    }
    else if (node->kind == Node::NESTED)
    {
        auto [tuple_names, tuple_columns] = collect_tuple_elemets(node->children);
        /// Stack of offsets columns used to recreate the Array levels below.
        Columns offsets_columns;
        offsets_columns.reserve(tuple_columns[0].array_dimensions + 1);
        /// If we have a Nested node and child node with anonymous array levels
        /// we need to push a Nested type through all array levels.
        /// Example: { "k1": [[{"k2": 1, "k3": 2}] } should be parsed as
        /// `k1 Array(Nested(k2 Int, k3 Int))` and k1 is marked as Nested
        /// and `k2` and `k3` has anonymous_array_level = 1 in that case.
        const auto & current_array = assert_cast<const ColumnArray &>(*node->data.column);
        offsets_columns.push_back(current_array.getOffsetsPtr());
        /// Collect offsets of the extra (anonymous) array levels from the first
        /// child; all children are asserted below to have the same dimensions.
        auto first_column = tuple_columns[0].column;
        for (size_t i = 0; i < tuple_columns[0].array_dimensions; ++i)
        {
            const auto & column_array = assert_cast<const ColumnArray &>(*first_column);
            offsets_columns.push_back(column_array.getOffsetsPtr());
            first_column = column_array.getDataPtr();
        }
        size_t num_elements = tuple_columns.size();
        Columns tuple_elements_columns(num_elements);
        DataTypes tuple_elements_types(num_elements);
        /// Reduce extra array dimensions to get columns and types of Nested elements.
        for (size_t i = 0; i < num_elements; ++i)
        {
            assert(tuple_columns[i].array_dimensions == tuple_columns[0].array_dimensions);
            tuple_elements_columns[i] = reduceNumberOfDimensions(tuple_columns[i].column, tuple_columns[i].array_dimensions);
            tuple_elements_types[i] = reduceNumberOfDimensions(tuple_columns[i].type, tuple_columns[i].array_dimensions);
        }
        auto result_column = ColumnArray::create(ColumnTuple::create(tuple_elements_columns), offsets_columns.back());
        auto result_type = createNested(tuple_elements_types, tuple_names);
        /// Recreate result Array type and Array column.
        for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it)
        {
            result_column = ColumnArray::create(result_column, *it);
            result_type = std::make_shared<DataTypeArray>(result_type);
        }
        return {result_column, result_type, tuple_columns[0].array_dimensions};
    }
    else
    {
        /// Remaining kind: a plain Tuple node built from the children.
        auto [tuple_names, tuple_columns] = collect_tuple_elemets(node->children);
        size_t num_elements = tuple_columns.size();
        Columns tuple_elements_columns(num_elements);
        DataTypes tuple_elements_types(num_elements);
        for (size_t i = 0; i < tuple_columns.size(); ++i)
        {
            assert(tuple_columns[i].array_dimensions == tuple_columns[0].array_dimensions);
            tuple_elements_columns[i] = tuple_columns[i].column;
            tuple_elements_types[i] = tuple_columns[i].type;
        }
        auto result_column = ColumnTuple::create(tuple_elements_columns);
        auto result_type = std::make_shared<DataTypeTuple>(tuple_elements_types, tuple_names);
        return {result_column, result_type, tuple_columns[0].array_dimensions};
    }
}
}
std::pair<PathsInData, DataTypes> flattenTuple(const DataTypePtr & type)
{
std::vector<PathInData::Parts> new_path_parts;
DataTypes new_types;
PathInDataBuilder builder;
flattenTupleImpl(builder, type, new_path_parts, new_types);
PathsInData new_paths(new_path_parts.begin(), new_path_parts.end());
return {new_paths, new_types};
}
/// Flattens a nested Tuple column into a plain Tuple of its leaf columns.
ColumnPtr flattenTuple(const ColumnPtr & column)
{
    Columns flat_columns;
    Columns offsets_stack;
    flattenTupleImpl(column, flat_columns, offsets_stack);
    return ColumnTuple::create(flat_columns);
}
/// Type-only variant: delegates to the column-aware overload with freshly
/// created empty columns and returns only the resulting type.
DataTypePtr unflattenTuple(const PathsInData & paths, const DataTypes & tuple_types)
{
    assert(paths.size() == tuple_types.size());
    Columns dummy_columns;
    dummy_columns.reserve(tuple_types.size());
    for (size_t i = 0; i < tuple_types.size(); ++i)
        dummy_columns.emplace_back(tuple_types[i]->createColumn());
    return unflattenTuple(paths, tuple_types, dummy_columns).second;
}
/// The reverse of flattenTuple: builds a nested Tuple column and type
/// from flat lists of paths, types and columns.
std::pair<ColumnPtr, DataTypePtr> unflattenTuple(
    const PathsInData & paths,
    const DataTypes & tuple_types,
    const Columns & tuple_columns)
{
    assert(paths.size() == tuple_types.size());
    assert(paths.size() == tuple_columns.size());
    /// We add all paths to the subcolumn tree and then create a type from it.
    /// The tree stores column, type and number of array dimensions
    /// for each intermediate node.
    SubcolumnsTreeWithColumns tree;
    for (size_t i = 0; i < paths.size(); ++i)
    {
        /// column/type start as the full leaf column/type and are reduced
        /// (array dimensions stripped) while descending along the path parts.
        auto column = tuple_columns[i];
        auto type = tuple_types[i];
        const auto & parts = paths[i].getParts();
        size_t num_parts = parts.size();
        size_t pos = 0;
        tree.add(paths[i], [&](Node::Kind kind, bool exists) -> std::shared_ptr<Node>
        {
            if (pos >= num_parts)
                throw Exception(ErrorCodes::LOGICAL_ERROR,
                    "Not enough name parts for path {}. Expected at least {}, got {}",
                    paths[i].getPath(), pos + 1, num_parts);
            /// Nested nodes always consume one array dimension; other nodes
            /// consume the anonymous array levels of the current part.
            size_t array_dimensions = kind == Node::NESTED ? 1 : parts[pos].anonymous_array_level;
            ColumnWithTypeAndDimensions current_column{column, type, array_dimensions};
            /// Get type and column for next node.
            if (array_dimensions)
            {
                type = reduceNumberOfDimensions(type, array_dimensions);
                column = reduceNumberOfDimensions(column, array_dimensions);
            }
            ++pos;
            if (exists)
                return nullptr;
            return kind == Node::SCALAR
                ? std::make_shared<Node>(kind, current_column, paths[i])
                : std::make_shared<Node>(kind, current_column);
        });
    }
    auto [column, type, _] = createTypeFromNode(tree.getRoot());
    return std::make_pair(std::move(column), std::move(type));
}
/// Adds to the WITH clause of @query an aliased constant:
/// `materialize(CAST(<default>, '<type>')) AS <column_name>`.
static void addConstantToWithClause(const ASTPtr & query, const String & column_name, const DataTypePtr & data_type)
{
    auto & select = query->as<ASTSelectQuery &>();
    if (!select.with())
        select.setExpression(ASTSelectQuery::Expression::WITH, std::make_shared<ASTExpressionList>());
    /// TODO: avoid materialize
    auto node = makeASTFunction("materialize",
        makeASTFunction("CAST",
            std::make_shared<ASTLiteral>(data_type->getDefault()),
            std::make_shared<ASTLiteral>(data_type->getName())));
    /// The alias makes identifiers with the column's name resolve to this constant.
    node->alias = column_name;
    node->prefer_alias_to_column_name = true;
    select.with()->children.push_back(std::move(node));
}
/// @expected_columns and @available_columns contain descriptions
/// of extended Object columns.
/// Adds to the WITH clause of @query default-value constants for all subcolumns
/// that exist in @expected_columns, are missing in @available_columns
/// and are referenced by the query.
void replaceMissedSubcolumnsByConstants(
    const ColumnsDescription & expected_columns,
    const ColumnsDescription & available_columns,
    ASTPtr query)
{
    NamesAndTypes missed_names_types;
    /// Find all subcolumns that are in @expected_columns, but not in @available_columns.
    for (const auto & column : available_columns)
    {
        auto expected_column = expected_columns.getColumn(GetColumnsOptions::All, column.name);
        /// Extract all paths from both descriptions to easily check existence of subcolumns.
        auto [available_paths, available_types] = flattenTuple(column.type);
        auto [expected_paths, expected_types] = flattenTuple(expected_column.type);
        /// Builds a list of (full subcolumn name, type) pairs, sorted by name.
        auto extract_names_and_types = [&column](const auto & paths, const auto & types)
        {
            NamesAndTypes res;
            res.reserve(paths.size());
            for (size_t i = 0; i < paths.size(); ++i)
            {
                auto full_name = Nested::concatenateName(column.name, paths[i].getPath());
                res.emplace_back(full_name, types[i]);
            }
            std::sort(res.begin(), res.end());
            return res;
        };
        auto available_names_types = extract_names_and_types(available_paths, available_types);
        auto expected_names_types = extract_names_and_types(expected_paths, expected_types);
        /// Both lists are sorted, so set_difference finds the missing subcolumns.
        std::set_difference(
            expected_names_types.begin(), expected_names_types.end(),
            available_names_types.begin(), available_names_types.end(),
            std::back_inserter(missed_names_types),
            [](const auto & lhs, const auto & rhs) { return lhs.name < rhs.name; });
    }
    if (missed_names_types.empty())
        return;
    IdentifierNameSet identifiers;
    query->collectIdentifierNames(identifiers);
    /// Replace missed subcolumns to default literals of theirs type.
    for (const auto & [name, type] : missed_names_types)
        if (identifiers.count(name))
            addConstantToWithClause(query, name, type);
}
/// Finalizes all Object columns among @columns; other columns are left intact.
void finalizeObjectColumns(MutableColumns & columns)
{
    for (auto & column : columns)
    {
        auto * column_object = typeid_cast<ColumnObject *>(column.get());
        if (column_object)
            column_object->finalize();
    }
}
}

140
src/DataTypes/ObjectUtils.h Normal file
View File

@ -0,0 +1,140 @@
#pragma once
#include <Core/Block.h>
#include <Core/NamesAndTypes.h>
#include <Common/FieldVisitors.h>
#include <Storages/ColumnsDescription.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/Serializations/JSONDataParser.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnObject.h>
namespace DB
{
/// Returns number of dimensions in Array type. 0 if type is not array.
size_t getNumberOfDimensions(const IDataType & type);
/// Returns number of dimensions in Array column. 0 if column is not array.
size_t getNumberOfDimensions(const IColumn & column);
/// Returns type of scalars of Array of arbitrary dimensions.
DataTypePtr getBaseTypeOfArray(const DataTypePtr & type);
/// Returns Array type with requested scalar type and number of dimensions.
DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions);
/// Returns column of scalars of Array of arbitrary dimensions.
ColumnPtr getBaseColumnOfArray(const ColumnPtr & column);
/// Returns empty Array column with requested scalar column and number of dimensions.
ColumnPtr createArrayOfColumn(const ColumnPtr & column, size_t num_dimensions);
/// Returns Array with requested number of dimensions and no scalars.
Array createEmptyArrayField(size_t num_dimensions);
/// Tries to get data type by column. Only limited subset of types is supported
DataTypePtr getDataTypeByColumn(const IColumn & column);
/// Converts Object types and columns to Tuples in @columns_list and @block
/// and checks that types are consistent with types in @extended_storage_columns.
void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, const NamesAndTypesList & extended_storage_columns);
/// Checks that each path is not the prefix of any other path.
void checkObjectHasNoAmbiguosPaths(const PathsInData & paths);
/// Receives several Tuple types and deduces the least common type among them.
DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool check_ambiguos_paths = false);
/// Converts types of object columns to tuples in @columns_list
/// according to @object_columns and adds all tuple's subcolumns if needed.
void extendObjectColumns(NamesAndTypesList & columns_list, const ColumnsDescription & object_columns, bool with_subcolumns);
NameSet getNamesOfObjectColumns(const NamesAndTypesList & columns_list);
bool hasObjectColumns(const ColumnsDescription & columns);
void finalizeObjectColumns(MutableColumns & columns);
/// Updates types of objects in @object_columns inplace
/// according to types in new_columns.
void updateObjectColumns(ColumnsDescription & object_columns, const NamesAndTypesList & new_columns);
using DataTypeTuplePtr = std::shared_ptr<DataTypeTuple>;
/// Flattens nested Tuple to plain Tuple. I.e extracts all paths and types from tuple.
/// E.g. Tuple(t Tuple(c1 UInt32, c2 String), c3 UInt64) -> Tuple(t.c1 UInt32, t.c2 String, c3 UInt64)
std::pair<PathsInData, DataTypes> flattenTuple(const DataTypePtr & type);
/// Flattens nested Tuple column to plain Tuple column.
ColumnPtr flattenTuple(const ColumnPtr & column);
/// The reverse operation to 'flattenTuple'.
/// Creates nested Tuple from all paths and types.
/// E.g. Tuple(t.c1 UInt32, t.c2 String, c3 UInt64) -> Tuple(t Tuple(c1 UInt32, c2 String), c3 UInt64)
DataTypePtr unflattenTuple(
const PathsInData & paths,
const DataTypes & tuple_types);
std::pair<ColumnPtr, DataTypePtr> unflattenTuple(
const PathsInData & paths,
const DataTypes & tuple_types,
const Columns & tuple_columns);
/// For all columns which exist in @expected_columns and
/// don't exist in @available_columns adds to WITH clause
/// an alias with column name to literal of default value of column type.
void replaceMissedSubcolumnsByConstants(
const ColumnsDescription & expected_columns,
const ColumnsDescription & available_columns,
ASTPtr query);
/// Receives range of objects, which contains collections
/// of columns-like objects (e.g. ColumnsDescription or NamesAndTypesList)
/// and deduces the common types of object columns for all entries.
/// @entry_columns_getter should extract reference to collection of
/// columns-like objects from entry to which Iterator points.
/// columns-like object should have fields "name" and "type".
template <typename Iterator, typename EntryColumnsGetter>
ColumnsDescription getObjectColumns(
    Iterator begin, Iterator end,
    const ColumnsDescription & storage_columns,
    EntryColumnsGetter && entry_columns_getter)
{
    ColumnsDescription res;
    /// With no entries at all, return a dummy Tuple type for every Object
    /// column of the storage, so those columns still get a concrete type.
    if (begin == end)
    {
        for (const auto & column : storage_columns)
        {
            if (isObject(column.type))
            {
                auto tuple_type = std::make_shared<DataTypeTuple>(
                    DataTypes{std::make_shared<DataTypeUInt8>()},
                    Names{ColumnObject::COLUMN_NAME_DUMMY});
                res.add({column.name, std::move(tuple_type)});
            }
        }
        return res;
    }
    /// Collect, for every Object column, the types it has across all entries.
    std::unordered_map<String, DataTypes> types_in_entries;
    for (auto it = begin; it != end; ++it)
    {
        const auto & entry_columns = entry_columns_getter(*it);
        for (const auto & column : entry_columns)
        {
            auto storage_column = storage_columns.tryGetPhysical(column.name);
            if (storage_column && isObject(storage_column->type))
                types_in_entries[column.name].push_back(column.type);
        }
    }
    /// Deduce a single common type per Object column.
    for (const auto & [name, types] : types_in_entries)
        res.add({name, getLeastCommonTypeForObject(types)});
    return res;
}
}

View File

@ -0,0 +1,3 @@
# Build the unit tests of this directory only when tests are enabled.
if (ENABLE_TESTS)
    add_subdirectory (tests)
endif ()

View File

@ -172,6 +172,10 @@ String getNameForSubstreamPath(
else
stream_name += "." + it->tuple_element_name;
}
else if (it->type == Substream::ObjectElement)
{
stream_name += escapeForFileName(".") + escapeForFileName(it->object_key_name);
}
}
return stream_name;

View File

@ -125,6 +125,9 @@ public:
SparseElements,
SparseOffsets,
ObjectStructure,
ObjectElement,
Regular,
};
@ -133,6 +136,9 @@ public:
/// Index of tuple element, starting at 1 or name.
String tuple_element_name;
/// Name of subcolumn of object column.
String object_key_name;
/// Do we need to escape a dot in filenames for tuple elements.
bool escape_tuple_delimiter = true;

View File

@ -0,0 +1,183 @@
#pragma once
#include <IO/ReadHelpers.h>
#include <Common/HashTable/HashMap.h>
#include <Common/checkStackSize.h>
#include <DataTypes/Serializations/PathInData.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
class ReadBuffer;
class WriteBuffer;
/// Converts a scalar JSON element to a Field.
/// Objects and arrays are handled by the caller (JSONDataParser::traverse).
/// NOTE(review): the order of checks may matter — presumably a bool element
/// must be tested before the integer types; confirm for each ParserImpl.
template <typename Element>
static Field getValueAsField(const Element & element)
{
    if (element.isBool()) return element.getBool();
    if (element.isInt64()) return element.getInt64();
    if (element.isUInt64()) return element.getUInt64();
    if (element.isDouble()) return element.getDouble();
    if (element.isString()) return element.getString();
    if (element.isNull()) return Field();
    throw Exception(ErrorCodes::LOGICAL_ERROR, "Unsupported type of JSON field");
}
/// Parser of JSON documents for Object columns.
/// Extracts all leaf paths of a document together with their values.
/// Fix: removed the local `Arena strings_pool` in traverse(), which was
/// declared but never referenced anywhere in the method.
template <typename ParserImpl>
class JSONDataParser
{
public:
    using Element = typename ParserImpl::Element;

    /// Reads a whole (possibly invalid) JSON object from the buffer into @s.
    void readJSON(String & s, ReadBuffer & buf)
    {
        readJSONObjectPossiblyInvalid(s, buf);
    }

    /// Parses the document and returns all (path, value) pairs,
    /// or an empty optional if the document is not valid JSON.
    std::optional<ParseResult> parse(const char * begin, size_t length)
    {
        std::string_view json{begin, length};
        Element document;
        if (!parser.parse(json, document))
            return {};
        ParseResult result;
        PathInDataBuilder builder;
        std::vector<PathInData::Parts> paths;
        traverse(document, builder, paths, result.values);
        result.paths.reserve(paths.size());
        for (auto && path : paths)
            result.paths.emplace_back(std::move(path));
        return result;
    }

private:
    /// Recursively walks the document and appends (path, value) pairs to
    /// @paths and @values. @builder holds the path of the current element.
    void traverse(
        const Element & element,
        PathInDataBuilder & builder,
        std::vector<PathInData::Parts> & paths,
        std::vector<Field> & values)
    {
        /// Traversal is recursive, so guard against deeply nested documents.
        checkStackSize();
        if (element.isObject())
        {
            auto object = element.getObject();
            paths.reserve(paths.size() + object.size());
            values.reserve(values.size() + object.size());
            for (auto it = object.begin(); it != object.end(); ++it)
            {
                const auto & [key, value] = *it;
                traverse(value, builder.append(key, false), paths, values);
                builder.popBack();
            }
        }
        else if (element.isArray())
        {
            auto array = element.getArray();
            using PathPartsWithArray = std::pair<PathInData::Parts, Array>;
            using PathToArray = HashMapWithStackMemory<UInt128, PathPartsWithArray, UInt128TrivialHash, 5>;
            /// Traverse elements of array and collect an array
            /// of fields by each path.
            PathToArray arrays_by_path;
            size_t current_size = 0;
            for (auto it = array.begin(); it != array.end(); ++it)
            {
                std::vector<PathInData::Parts> element_paths;
                std::vector<Field> element_values;
                PathInDataBuilder element_builder;
                traverse(*it, element_builder, element_paths, element_values);
                size_t size = element_paths.size();
                size_t keys_to_update = arrays_by_path.size();
                for (size_t i = 0; i < size; ++i)
                {
                    UInt128 hash = PathInData::getPartsHash(element_paths[i]);
                    if (auto * found = arrays_by_path.find(hash))
                    {
                        auto & path_array = found->getMapped().second;
                        assert(path_array.size() == current_size);
                        path_array.push_back(std::move(element_values[i]));
                        --keys_to_update;
                    }
                    else
                    {
                        /// We found a new key. Add an empty array with current size.
                        Array path_array;
                        path_array.reserve(array.size());
                        path_array.resize(current_size);
                        path_array.push_back(std::move(element_values[i]));
                        auto & elem = arrays_by_path[hash];
                        elem.first = std::move(element_paths[i]);
                        elem.second = std::move(path_array);
                    }
                }
                /// If some of the keys are missed in current element,
                /// add default values for them.
                if (keys_to_update)
                {
                    for (auto & [_, value] : arrays_by_path)
                    {
                        auto & path_array = value.second;
                        assert(path_array.size() == current_size || path_array.size() == current_size + 1);
                        if (path_array.size() == current_size)
                            path_array.push_back(Field());
                    }
                }
                ++current_size;
            }
            if (arrays_by_path.empty())
            {
                /// An empty array is recorded as an empty Array value at the current path.
                paths.push_back(builder.getParts());
                values.push_back(Array());
            }
            else
            {
                paths.reserve(paths.size() + arrays_by_path.size());
                values.reserve(values.size() + arrays_by_path.size());
                for (auto && [_, value] : arrays_by_path)
                {
                    auto && [path, path_array] = value;
                    /// Merge prefix path and path of array element.
                    paths.push_back(builder.append(path, true).getParts());
                    values.push_back(std::move(path_array));
                    builder.popBack(path.size());
                }
            }
        }
        else
        {
            /// Scalar (or null) leaf: store it at the current path.
            paths.push_back(builder.getParts());
            values.push_back(getValueAsField(element));
        }
    }

    ParserImpl parser;
};
}

View File

@ -0,0 +1,199 @@
#include <DataTypes/Serializations/PathInData.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeArray.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnArray.h>
#include <Common/SipHash.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string.hpp>
namespace DB
{
/// Splits the dotted path into parts. Each Part's key is a string_view
/// pointing into our own `path` member, so no extra allocations are needed.
/// Fix: the trailing part passed the double literal `0.` where the UInt8
/// anonymous_array_level is expected — replaced with `0` for consistency
/// with the loop above.
PathInData::PathInData(std::string_view path_)
    : path(path_)
{
    const char * begin = path.data();
    const char * end = path.data() + path.size();
    for (const char * it = path.data(); it != end; ++it)
    {
        if (*it == '.')
        {
            size_t size = static_cast<size_t>(it - begin);
            parts.emplace_back(std::string_view{begin, size}, false, 0);
            begin = it + 1;
        }
    }
    /// The last (or the only) part after the final dot.
    size_t size = static_cast<size_t>(end - begin);
    parts.emplace_back(std::string_view{begin, size}, false, 0);
}
/// Builds the path string from the given parts, then re-creates the parts
/// so that their string_view keys point into our own `path` member.
PathInData::PathInData(const Parts & parts_)
    : path(buildPath(parts_))
    , parts(buildParts(path, parts_))
{
}
/// Copying rebuilds the parts over our own copy of the path, because the
/// parts' keys are string_views that must not point into the other object.
PathInData::PathInData(const PathInData & other)
    : path(other.path)
    , parts(buildParts(path, other.getParts()))
{
}
PathInData & PathInData::operator=(const PathInData & other)
{
    /// Same as the copy constructor: the parts are rebuilt so their
    /// string_view keys point into our own copy of the path.
    if (this != &other)
    {
        path = other.path;
        parts = buildParts(path, other.parts);
    }
    return *this;
}
/// 128-bit SipHash over the number of parts and, for each part,
/// its key, is_nested flag and anonymous array level.
UInt128 PathInData::getPartsHash(const Parts & parts_)
{
    SipHash hash;
    hash.update(parts_.size());
    for (const auto & part : parts_)
    {
        hash.update(part.key.data(), part.key.length());
        hash.update(part.is_nested);
        hash.update(part.anonymous_array_level);
    }
    UInt128 res;
    hash.get128(res);
    return res;
}
/// Serializes the path as: number of parts, then for each part
/// its key, is_nested flag and anonymous array level.
void PathInData::writeBinary(WriteBuffer & out) const
{
    writeVarUInt(parts.size(), out);
    for (const auto & part : parts)
    {
        writeStringBinary(part.key, out);
        writeVarUInt(part.is_nested, out);
        writeVarUInt(part.anonymous_array_level, out);
    }
}
/// Deserializes the path written by writeBinary.
void PathInData::readBinary(ReadBuffer & in)
{
    size_t num_parts;
    readVarUInt(num_parts, in);
    /// Keys are first read into a temporary arena, because the final
    /// string_views must point into our own `path`, which is built below.
    Arena arena;
    Parts temp_parts;
    temp_parts.reserve(num_parts);
    for (size_t i = 0; i < num_parts; ++i)
    {
        bool is_nested;
        UInt8 anonymous_array_level;
        auto ref = readStringBinaryInto(arena, in);
        readVarUInt(is_nested, in);
        readVarUInt(anonymous_array_level, in);
        temp_parts.emplace_back(static_cast<std::string_view>(ref), is_nested, anonymous_array_level);
    }
    /// Recreate path and parts.
    /// buildPath copies the keys into `path` and buildParts rebinds the
    /// string_views into it, so the arena can be safely destroyed afterwards.
    path = buildPath(temp_parts);
    parts = buildParts(path, temp_parts);
}
/// Joins the keys of all parts into a dot-separated path string.
String PathInData::buildPath(const Parts & other_parts)
{
    if (other_parts.empty())
        return "";
    String result(other_parts.front().key);
    for (size_t i = 1; i < other_parts.size(); ++i)
    {
        result += ".";
        result += other_parts[i].key;
    }
    return result;
}
/// Re-creates parts so that their keys are string_views into @other_path.
/// Assumes @other_path was produced by buildPath from parts with the same keys.
PathInData::Parts PathInData::buildParts(const String & other_path, const Parts & other_parts)
{
    if (other_parts.empty())
        return {};
    Parts res;
    const char * begin = other_path.data();
    for (const auto & part : other_parts)
    {
        res.emplace_back(std::string_view{begin, part.key.length()}, part.is_nested, part.anonymous_array_level);
        /// Skip the key and the dot separator that follows it.
        begin += part.key.length() + 1;
    }
    return res;
}
/// Folds the 128-bit parts hash into a size_t for use in hash containers.
size_t PathInData::Hash::operator()(const PathInData & value) const
{
    const auto parts_hash = getPartsHash(value.parts);
    return parts_hash.items[0] ^ parts_hash.items[1];
}
/// Appends one key to the path being built.
/// @is_array indicates that the element was reached through an array.
PathInDataBuilder & PathInDataBuilder::append(std::string_view key, bool is_array)
{
    /// Arrays visited before the first key only increase the anonymous
    /// array level that will be attached to the first part.
    if (parts.empty())
        current_anonymous_array_level += is_array;
    if (!key.empty())
    {
        /// An array between two keys marks the previous part as Nested.
        if (!parts.empty())
            parts.back().is_nested = is_array;
        parts.emplace_back(key, false, current_anonymous_array_level);
        current_anonymous_array_level = 0;
    }
    return *this;
}
/// Appends a whole sequence of parts to the path being built.
/// @is_array indicates that the appended path was reached through an array.
PathInDataBuilder & PathInDataBuilder::append(const PathInData::Parts & path, bool is_array)
{
    /// Arrays visited before the first key only increase the anonymous
    /// array level that will be attached to the first appended parts.
    if (parts.empty())
        current_anonymous_array_level += is_array;
    if (!path.empty())
    {
        /// An array between two keys marks the previous part as Nested.
        if (!parts.empty())
            parts.back().is_nested = is_array;
        /// Propagate the accumulated anonymous array level to all appended parts.
        auto it = parts.insert(parts.end(), path.begin(), path.end());
        for (; it != parts.end(); ++it)
            it->anonymous_array_level += current_anonymous_array_level;
        current_anonymous_array_level = 0;
    }
    return *this;
}
/// Removes the last part of the path.
void PathInDataBuilder::popBack()
{
    /// Match the precondition check of popBack(size_t):
    /// pop_back() on an empty vector is undefined behaviour.
    assert(!parts.empty());
    parts.pop_back();
}
/// Removes the last @n parts of the path.
/// Precondition: n must not exceed the current number of parts.
void PathInDataBuilder::popBack(size_t n)
{
    assert(n <= parts.size());
    parts.resize(parts.size() - n);
}
}

View File

@ -0,0 +1,112 @@
#pragma once
#include <Core/Types.h>
#include <Core/Field.h>
#include <bitset>
namespace DB
{
class ReadBuffer;
class WriteBuffer;
/// Class that represents path in document, e.g. JSON.
/// Stores the full dot-separated path plus structured parts
/// (key, Nested flag, anonymous array level) that view into it.
class PathInData
{
public:
    struct Part
    {
        Part() = default;
        Part(std::string_view key_, bool is_nested_, UInt8 anonymous_array_level_)
            : key(key_), is_nested(is_nested_), anonymous_array_level(anonymous_array_level_)
        {
        }

        /// Name of part of path.
        std::string_view key;

        /// If this part is Nested, i.e. element
        /// related to this key is the array of objects.
        bool is_nested = false;

        /// Number of array levels between current key and previous key.
        /// E.g. in JSON {"k1": [[[{"k2": 1, "k3": 2}]]]}
        /// "k1" is nested and has anonymous_array_level = 0.
        /// "k2" and "k3" are not nested and have anonymous_array_level = 2.
        UInt8 anonymous_array_level = 0;

        bool operator==(const Part & other) const = default;
    };

    using Parts = std::vector<Part>;

    PathInData() = default;
    explicit PathInData(std::string_view path_);
    explicit PathInData(const Parts & parts_);

    /// Copy operations are user-defined because the string_views in @parts
    /// must be re-pointed into this object's own @path (see buildParts).
    PathInData(const PathInData & other);
    PathInData & operator=(const PathInData & other);

    /// 128-bit hash of the parts (keys, flags and array levels).
    static UInt128 getPartsHash(const Parts & parts_);

    bool empty() const { return parts.empty(); }

    const String & getPath() const { return path; }
    const Parts & getParts() const  { return parts; }

    /// Whether the i-th part is Nested. No bounds check is performed.
    bool isNested(size_t i) const { return parts[i].is_nested; }
    bool hasNested() const { return std::any_of(parts.begin(), parts.end(), [](const auto & part) { return part.is_nested; }); }

    void writeBinary(WriteBuffer & out) const;
    void readBinary(ReadBuffer & in);

    /// Equality compares the structured parts, not just the path string.
    bool operator==(const PathInData & other) const { return parts == other.parts; }
    struct Hash { size_t operator()(const PathInData & value) const; };

private:
    /// Creates full path from parts.
    static String buildPath(const Parts & other_parts);

    /// Creates new parts full from full path with correct string pointers.
    static Parts buildParts(const String & other_path, const Parts & other_parts);

    /// The full path. Parts are separated by dots.
    String path;

    /// Parts of the path. All string_view-s in parts must point to the @path.
    Parts parts;
};
/// Incrementally builds PathInData::Parts while traversing a document.
class PathInDataBuilder
{
public:
    const PathInData::Parts & getParts() const { return parts; }

    /// Append one key or a whole sub-path.
    /// @is_array is true if the appended piece was found inside an array.
    PathInDataBuilder & append(std::string_view key, bool is_array);
    PathInDataBuilder & append(const PathInData::Parts & path, bool is_array);

    /// Remove the last part (or the last @n parts) when leaving a scope.
    void popBack();
    void popBack(size_t n);

private:
    PathInData::Parts parts;

    /// Number of array levels without key to which
    /// next non-empty key will be nested.
    /// Example: for JSON { "k1": [[{"k2": 1, "k3": 2}]] }
    /// `k2` and `k3` has anonymous_array_level = 1 in that case.
    size_t current_anonymous_array_level = 0;
};
using PathsInData = std::vector<PathInData>;

/// Result of parsing of a document.
/// Contains all paths extracted from document
/// and values which are related to them.
struct ParseResult
{
    /// paths[i] corresponds to values[i].
    std::vector<PathInData> paths;
    std::vector<Field> values;
};
}

View File

@ -0,0 +1,460 @@
#include <DataTypes/Serializations/SerializationObject.h>
#include <DataTypes/Serializations/JSONDataParser.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/ObjectUtils.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/NestedUtils.h>
#include <Common/JSONParsers/SimdJSONParser.h>
#include <Common/JSONParsers/RapidJSONParser.h>
#include <Common/HashTable/HashSet.h>
#include <Columns/ColumnObject.h>
#include <Common/FieldVisitorToString.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/VarInt.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
extern const int INCORRECT_DATA;
extern const int CANNOT_READ_ALL_DATA;
extern const int LOGICAL_ERROR;
}
namespace
{
/// Visitor that keeps @num_dimensions_to_keep dimensions in arrays
/// and replaces all scalars or nested arrays to @replacement at that level.
class FieldVisitorReplaceScalars : public StaticVisitor<Field>
{
public:
    FieldVisitorReplaceScalars(const Field & replacement_, size_t num_dimensions_to_keep_)
        : replacement(replacement_), num_dimensions_to_keep(num_dimensions_to_keep_)
    {
    }

    template <typename T>
    Field operator()(const T & x) const
    {
        /// Any non-array value, and any array below the kept dimensions,
        /// is substituted by the replacement value.
        if constexpr (!std::is_same_v<T, Array>)
        {
            return replacement;
        }
        else
        {
            if (num_dimensions_to_keep == 0)
                return replacement;

            Array replaced(x.size());
            for (size_t i = 0; i < x.size(); ++i)
                replaced[i] = applyVisitor(FieldVisitorReplaceScalars(replacement, num_dimensions_to_keep - 1), x[i]);
            return replaced;
        }
    }

private:
    const Field & replacement;
    size_t num_dimensions_to_keep;
};
using Node = typename ColumnObject::SubcolumnsTree::Node;

/// Finds a subcolumn from the same Nested type as @entry and inserts
/// an array with default values with consistent sizes as in Nested type.
/// Returns false if no suitable sibling subcolumn was found and a plain
/// default should be inserted instead.
bool tryInsertDefaultFromNested(
    std::shared_ptr<Node> entry, const ColumnObject::SubcolumnsTree & subcolumns)
{
    if (!entry->path.hasNested())
        return false;

    const Node * current_node = subcolumns.findLeaf(entry->path);
    const Node * leaf = nullptr;
    size_t num_skipped_nested = 0;

    /// Walk upwards through enclosing Nested nodes, looking for a leaf that
    /// already has one more row than @entry: its last row serves as the
    /// size template for the default value.
    while (current_node)
    {
        /// Try to find the first Nested up to the current node.
        const auto * node_nested = subcolumns.findParent(current_node,
            [](const auto & candidate) { return candidate.isNested(); });

        if (!node_nested)
            break;

        /// If there are no leaves, skip current node and find
        /// the next node up to the current.
        leaf = subcolumns.findLeaf(node_nested,
            [&](const auto & candidate)
            {
                return candidate.data.size() == entry->data.size() + 1;
            });

        if (leaf)
            break;

        current_node = node_nested->parent;
        ++num_skipped_nested;
    }

    if (!leaf)
        return false;

    auto last_field = leaf->data.getLastField();
    if (last_field.isNull())
        return false;

    const auto & least_common_type = entry->data.getLeastCommonType();
    size_t num_dimensions = getNumberOfDimensions(*least_common_type);
    assert(num_skipped_nested < num_dimensions);

    /// Replace scalars to default values with consistent array sizes.
    /// Each skipped Nested level contributes one dimension that is filled
    /// with empty arrays rather than scalar defaults.
    size_t num_dimensions_to_keep = num_dimensions - num_skipped_nested;
    auto default_scalar = num_skipped_nested
        ? createEmptyArrayField(num_skipped_nested)
        : getBaseTypeOfArray(least_common_type)->getDefault();

    auto default_field = applyVisitor(FieldVisitorReplaceScalars(default_scalar, num_dimensions_to_keep), last_field);
    entry->data.insert(std::move(default_field));
    return true;
}
}
/// Reads one row of text with @reader, parses it as an object and appends
/// the extracted values to the matching subcolumns of the ColumnObject.
/// Subcolumns absent from this row receive default values so that all
/// subcolumns stay equally sized.
template <typename Parser>
template <typename Reader>
void SerializationObject<Parser>::deserializeTextImpl(IColumn & column, Reader && reader) const
{
    auto & column_object = assert_cast<ColumnObject &>(column);

    String buf;
    reader(buf);

    auto result = parser.parse(buf.data(), buf.size());
    if (!result)
        throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse object");

    auto & [paths, values] = *result;
    assert(paths.size() == values.size());

    HashSet<StringRef, StringRefHash> paths_set;
    size_t column_size = column_object.size();

    for (size_t i = 0; i < paths.size(); ++i)
    {
        auto field_info = getFieldInfo(values[i]);

        /// Skip values without concrete type information (scalar type Nothing).
        if (isNothing(field_info.scalar_type))
            continue;

        /// Each path may occur at most once per row.
        if (!paths_set.insert(paths[i].getPath()).second)
            throw Exception(ErrorCodes::INCORRECT_DATA,
                "Object has ambiguous path: {}", paths[i].getPath());

        /// Create the subcolumn on first occurrence, backfilled to @column_size.
        if (!column_object.hasSubcolumn(paths[i]))
        {
            if (paths[i].hasNested())
                column_object.addNestedSubcolumn(paths[i], field_info, column_size);
            else
                column_object.addSubcolumn(paths[i], column_size);
        }

        auto & subcolumn = column_object.getSubcolumn(paths[i]);
        assert(subcolumn.size() == column_size);
        subcolumn.insert(std::move(values[i]), std::move(field_info));
    }

    /// Insert default values to missed subcolumns.
    const auto & subcolumns = column_object.getSubcolumns();
    for (const auto & entry : subcolumns)
    {
        if (!paths_set.has(entry->path.getPath()))
        {
            /// Prefer a default consistent with sibling Nested subcolumns.
            bool inserted = tryInsertDefaultFromNested(entry, subcolumns);
            if (!inserted)
                entry->data.insertDefault();
        }
    }

    column_object.incrementNumRows();
}
/// Deserializes a row from the whole remaining text of the buffer.
template <typename Parser>
void SerializationObject<Parser>::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
    auto read_whole = [&istr](String & s) { readStringInto(s, istr); };
    deserializeTextImpl(column, read_whole);
}
/// Deserializes a row from an escaped (TSV-style) string.
template <typename Parser>
void SerializationObject<Parser>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
    auto read_escaped = [&istr](String & s) { readEscapedStringInto(s, istr); };
    deserializeTextImpl(column, read_escaped);
}
/// Deserializes a row from a quoted string.
template <typename Parser>
void SerializationObject<Parser>::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
    auto read_quoted = [&istr](String & s) { readQuotedStringInto<true>(s, istr); };
    deserializeTextImpl(column, read_quoted);
}
/// Deserializes a row from a JSON value; extraction of the raw JSON text
/// is delegated to the parser itself.
template <typename Parser>
void SerializationObject<Parser>::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
    auto read_json = [this, &istr](String & s) { parser.readJSON(s, istr); };
    deserializeTextImpl(column, read_json);
}
/// Deserializes a row from a CSV field, honouring the CSV format settings.
template <typename Parser>
void SerializationObject<Parser>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    auto read_csv = [&istr, &settings](String & s) { readCSVStringInto(s, istr, settings.csv); };
    deserializeTextImpl(column, read_csv);
}
/// Object supports only plain (Native-format) bulk serialization:
/// position independent encoding and non-trivial state are both rejected.
template <typename Parser>
template <typename TSettings, typename TStatePtr>
void SerializationObject<Parser>::checkSerializationIsSupported(const TSettings & settings, const TStatePtr & state) const
{
    if (settings.position_independent_encoding)
        throw Exception(ErrorCodes::NOT_IMPLEMENTED,
            "DataTypeObject doesn't support serialization with position independent encoding");

    if (state)
        throw Exception(ErrorCodes::NOT_IMPLEMENTED,
            "DataTypeObject doesn't support serialization with non-trivial state");
}
/// No bulk state is kept: only validates that the settings are supported.
template <typename Parser>
void SerializationObject<Parser>::serializeBinaryBulkStatePrefix(
    SerializeBinaryBulkSettings & settings,
    SerializeBinaryBulkStatePtr & state) const
{
    checkSerializationIsSupported(settings, state);
}
/// No bulk state is kept: only validates that the settings are supported.
template <typename Parser>
void SerializationObject<Parser>::serializeBinaryBulkStateSuffix(
    SerializeBinaryBulkSettings & settings,
    SerializeBinaryBulkStatePtr & state) const
{
    checkSerializationIsSupported(settings, state);
}
/// No bulk state is kept: only validates that the settings are supported.
template <typename Parser>
void SerializationObject<Parser>::deserializeBinaryBulkStatePrefix(
    DeserializeBinaryBulkSettings & settings,
    DeserializeBinaryBulkStatePtr & state) const
{
    checkSerializationIsSupported(settings, state);
}
/// Writes the structure (number of subcolumns, then path + type name of
/// each) into ObjectStructure substreams, and the data of each subcolumn
/// into its ObjectElement substream.
template <typename Parser>
void SerializationObject<Parser>::serializeBinaryBulkWithMultipleStreams(
    const IColumn & column,
    size_t offset,
    size_t limit,
    SerializeBinaryBulkSettings & settings,
    SerializeBinaryBulkStatePtr & state) const
{
    checkSerializationIsSupported(settings, state);
    const auto & column_object = assert_cast<const ColumnObject &>(column);

    /// Only a finalized column has per-subcolumn data ready for writing.
    if (!column_object.isFinalized())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write non-finalized ColumnObject");

    settings.path.push_back(Substream::ObjectStructure);
    if (auto * stream = settings.getter(settings.path))
        writeVarUInt(column_object.getSubcolumns().size(), *stream);

    const auto & subcolumns = column_object.getSubcolumns();
    for (const auto & entry : subcolumns)
    {
        /// Structure substream is keyed by the subcolumn's path.
        settings.path.back() = Substream::ObjectStructure;
        settings.path.back().object_key_name = entry->path.getPath();

        const auto & type = entry->data.getLeastCommonType();
        if (auto * stream = settings.getter(settings.path))
        {
            entry->path.writeBinary(*stream);
            writeStringBinary(type->getName(), *stream);
        }

        settings.path.back() = Substream::ObjectElement;
        if (auto * stream = settings.getter(settings.path))
        {
            /// Subcolumn data is written with the default serialization
            /// of its least common type.
            auto serialization = type->getDefaultSerialization();
            serialization->serializeBinaryBulkWithMultipleStreams(
                entry->data.getFinalizedColumn(), offset, limit, settings, state);
        }
    }

    settings.path.pop_back();
}
/// Reads the structure (paths and type names) from ObjectStructure
/// substreams, then reads each subcolumn from its ObjectElement substream.
/// The column must be empty: appending to existing data is not supported.
template <typename Parser>
void SerializationObject<Parser>::deserializeBinaryBulkWithMultipleStreams(
    ColumnPtr & column,
    size_t limit,
    DeserializeBinaryBulkSettings & settings,
    DeserializeBinaryBulkStatePtr & state,
    SubstreamsCache * cache) const
{
    checkSerializationIsSupported(settings, state);
    if (!column->empty())
        throw Exception(ErrorCodes::NOT_IMPLEMENTED,
            "DataTypeObject cannot be deserialized to non-empty column");

    auto mutable_column = column->assumeMutable();
    auto & column_object = typeid_cast<ColumnObject &>(*mutable_column);

    size_t num_subcolumns = 0;
    settings.path.push_back(Substream::ObjectStructure);
    if (auto * stream = settings.getter(settings.path))
        readVarUInt(num_subcolumns, *stream);

    settings.path.back() = Substream::ObjectElement;
    for (size_t i = 0; i < num_subcolumns; ++i)
    {
        PathInData key;
        String type_name;

        /// Read path and type name of the next subcolumn.
        settings.path.back() = Substream::ObjectStructure;
        if (auto * stream = settings.getter(settings.path))
        {
            key.readBinary(*stream);
            readStringBinary(type_name, *stream);
        }
        else
        {
            throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA,
                "Cannot read structure of DataTypeObject, because its stream is missing");
        }

        settings.path.back() = Substream::ObjectElement;
        settings.path.back().object_key_name = key.getPath();

        if (auto * stream = settings.getter(settings.path))
        {
            /// Recreate the subcolumn's type by name and deserialize its data
            /// with the default serialization of that type.
            auto type = DataTypeFactory::instance().get(type_name);
            auto serialization = type->getDefaultSerialization();
            ColumnPtr subcolumn_data = type->createColumn();
            serialization->deserializeBinaryBulkWithMultipleStreams(subcolumn_data, limit, settings, state, cache);
            column_object.addSubcolumn(key, subcolumn_data->assumeMutable());
        }
        else
        {
            throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA,
                "Cannot read subcolumn '{}' of DataTypeObject, because its stream is missing", key.getPath());
        }
    }

    settings.path.pop_back();
    column_object.checkConsistency();
    column_object.finalize();
    column = std::move(mutable_column);
}
/// Single-value binary serialization is intentionally not supported for Object.
template <typename Parser>
void SerializationObject<Parser>::serializeBinary(const Field &, WriteBuffer &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject");
}
/// Single-value binary deserialization is intentionally not supported for Object.
template <typename Parser>
void SerializationObject<Parser>::deserializeBinary(Field &, ReadBuffer &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject");
}
/// Per-row binary serialization is intentionally not supported for Object.
template <typename Parser>
void SerializationObject<Parser>::serializeBinary(const IColumn &, size_t, WriteBuffer &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject");
}
/// Per-row binary deserialization is intentionally not supported for Object.
template <typename Parser>
void SerializationObject<Parser>::deserializeBinary(IColumn &, ReadBuffer &) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject");
}
/// TODO: use format different of JSON in serializations.

/// Renders row @row_num as a JSON object: {"path":value,...},
/// one member per subcolumn.
template <typename Parser>
void SerializationObject<Parser>::serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    const auto & column_object = assert_cast<const ColumnObject &>(column);
    const auto & subcolumns = column_object.getSubcolumns();

    writeChar('{', ostr);
    bool first = true;
    for (const auto & entry : subcolumns)
    {
        if (!first)
            writeCString(",", ostr);
        first = false;

        writeDoubleQuoted(entry->path.getPath(), ostr);
        writeChar(':', ostr);

        auto serialization = entry->data.getLeastCommonType()->getDefaultSerialization();
        serialization->serializeTextJSON(entry->data.getFinalizedColumn(), row_num, ostr, settings);
    }
    writeChar('}', ostr);
}
/// Plain text output is the JSON rendering of the row.
template <typename Parser>
void SerializationObject<Parser>::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    serializeTextImpl(column, row_num, ostr, settings);
}
/// Renders the row as JSON into a temporary buffer, then escapes it as a whole.
template <typename Parser>
void SerializationObject<Parser>::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString tmp_buf;
    serializeTextImpl(column, row_num, tmp_buf, settings);
    writeEscapedString(tmp_buf.str(), ostr);
}
/// Renders the row as JSON into a temporary buffer, then quotes it as a whole.
template <typename Parser>
void SerializationObject<Parser>::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString tmp_buf;
    serializeTextImpl(column, row_num, tmp_buf, settings);
    writeQuotedString(tmp_buf.str(), ostr);
}
/// JSON output is the native text rendering — no extra wrapping needed.
template <typename Parser>
void SerializationObject<Parser>::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    serializeTextImpl(column, row_num, ostr, settings);
}
/// Renders the row as JSON into a temporary buffer, then writes it as one CSV field.
template <typename Parser>
void SerializationObject<Parser>::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
    WriteBufferFromOwnString tmp_buf;
    serializeTextImpl(column, row_num, tmp_buf, settings);
    writeCSVString(tmp_buf.str(), ostr);
}
/// Creates the serialization for type Object with the given schema format.
/// Currently only "json" is supported, and only when ClickHouse is built
/// with Simdjson or Rapidjson.
SerializationPtr getObjectSerialization(const String & schema_format)
{
    if (schema_format == "json")
    {
#if USE_SIMDJSON
        return std::make_shared<SerializationObject<JSONDataParser<SimdJSONParser>>>();
#elif USE_RAPIDJSON
        return std::make_shared<SerializationObject<JSONDataParser<RapidJSONParser>>>();
#else
        throw Exception(ErrorCodes::NOT_IMPLEMENTED,
            "To use data type Object with JSON format ClickHouse should be built with Simdjson or Rapidjson");
#endif
    }

    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unknown schema format '{}'", schema_format);
}
}

View File

@ -0,0 +1,73 @@
#pragma once
#include <DataTypes/Serializations/SimpleTextSerialization.h>
namespace DB
{
/// Serialization for data type Object.
/// Supports only text serialization/deserialization
/// and binary bulk serialization/deserialization without position independent
/// encoding, i.e. serialization/deserialization into Native format.
template <typename Parser>
class SerializationObject : public ISerialization
{
public:
    /// Binary bulk (de)serialization. Settings with position independent
    /// encoding or non-trivial state are rejected (see the .cpp checks).
    void serializeBinaryBulkStatePrefix(
        SerializeBinaryBulkSettings & settings,
        SerializeBinaryBulkStatePtr & state) const override;

    void serializeBinaryBulkStateSuffix(
        SerializeBinaryBulkSettings & settings,
        SerializeBinaryBulkStatePtr & state) const override;

    void deserializeBinaryBulkStatePrefix(
        DeserializeBinaryBulkSettings & settings,
        DeserializeBinaryBulkStatePtr & state) const override;

    void serializeBinaryBulkWithMultipleStreams(
        const IColumn & column,
        size_t offset,
        size_t limit,
        SerializeBinaryBulkSettings & settings,
        SerializeBinaryBulkStatePtr & state) const override;

    void deserializeBinaryBulkWithMultipleStreams(
        ColumnPtr & column,
        size_t limit,
        DeserializeBinaryBulkSettings & settings,
        DeserializeBinaryBulkStatePtr & state,
        SubstreamsCache * cache) const override;

    /// Single-value/per-row binary (de)serialization: all four throw NOT_IMPLEMENTED.
    void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
    void deserializeBinary(Field & field, ReadBuffer & istr) const override;
    void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override;
    void deserializeBinary(IColumn & column, ReadBuffer & istr) const override;

    /// Text serialization: the row is rendered as a JSON object.
    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
    void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
    void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
    void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;

    void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
    void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
    void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;

private:
    template <typename TSettings, typename TStatePtr>
    void checkSerializationIsSupported(const TSettings & settings, const TStatePtr & state) const;

    template <typename Reader>
    void deserializeTextImpl(IColumn & column, Reader && reader) const;

    void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const;

    /// Mutable because parsing happens inside const deserialize* methods.
    mutable Parser parser;
};
SerializationPtr getObjectSerialization(const String & schema_format);
}

View File

@ -0,0 +1,209 @@
#pragma once
#include <DataTypes/Serializations/PathInData.h>
#include <DataTypes/IDataType.h>
#include <Columns/IColumn.h>
#include <unordered_map>
namespace DB
{
/// Tree that represents paths in document
/// with additional data in nodes.
template <typename NodeData>
class SubcolumnsTree
{
public:
    struct Node
    {
        enum Kind
        {
            TUPLE,
            NESTED,
            SCALAR,
        };

        explicit Node(Kind kind_) : kind(kind_) {}
        Node(Kind kind_, const NodeData & data_) : kind(kind_), data(data_) {}
        Node(Kind kind_, const NodeData & data_, const PathInData & path_)
            : kind(kind_), data(data_), path(path_) {}

        Kind kind = TUPLE;
        const Node * parent = nullptr;

        /// Ordered children; std::less<> enables lookup by string_view keys.
        std::map<String, std::shared_ptr<Node>, std::less<>> children;

        NodeData data;
        PathInData path;

        bool isNested() const { return kind == NESTED; }
        bool isScalar() const { return kind == SCALAR; }

        void addChild(const String & key, std::shared_ptr<Node> next_node)
        {
            next_node->parent = this;
            children[key] = std::move(next_node);
        }
    };

    using NodeKind = typename Node::Kind;
    using NodePtr = std::shared_ptr<Node>;

    /// Add a leaf without any data in other nodes.
    bool add(const PathInData & path, const NodeData & leaf_data)
    {
        return add(path, [&](NodeKind kind, bool exists) -> NodePtr
        {
            if (exists)
                return nullptr;

            if (kind == Node::SCALAR)
                return std::make_shared<Node>(kind, leaf_data, path);

            return std::make_shared<Node>(kind);
        });
    }

    /// Callback for creation of node. Receives kind of node and
    /// flag, which is true if node already exists.
    using NodeCreator = std::function<NodePtr(NodeKind, bool)>;

    /// Returns false when the path is empty, an intermediate node disagrees
    /// on Nested-ness with the path, or the leaf already exists.
    bool add(const PathInData & path, const NodeCreator & node_creator)
    {
        const auto & parts = path.getParts();
        if (parts.empty())
            return false;

        if (!root)
            root = std::make_shared<Node>(Node::TUPLE);

        Node * current_node = root.get();
        for (size_t i = 0; i < parts.size() - 1; ++i)
        {
            assert(current_node->kind != Node::SCALAR);
            auto it = current_node->children.find(parts[i].key);
            if (it != current_node->children.end())
            {
                current_node = it->second.get();
                /// Notify the creator that the intermediate node already exists.
                node_creator(current_node->kind, true);

                if (current_node->isNested() != parts[i].is_nested)
                    return false;
            }
            else
            {
                auto next_kind = parts[i].is_nested ? Node::NESTED : Node::TUPLE;
                auto next_node = node_creator(next_kind, false);
                current_node->addChild(String(parts[i].key), next_node);
                current_node = next_node.get();
            }
        }

        auto it = current_node->children.find(parts.back().key);
        if (it != current_node->children.end())
            return false;

        auto next_node = node_creator(Node::SCALAR, false);
        current_node->addChild(String(parts.back().key), next_node);
        leaves.push_back(std::move(next_node));

        return true;
    }

    /// Find node that matches the path the best.
    const Node * findBestMatch(const PathInData & path) const
    {
        return findImpl(path, false);
    }

    /// Find node that matches the path exactly.
    const Node * findExact(const PathInData & path) const
    {
        return findImpl(path, true);
    }

    /// Find leaf by path.
    const Node * findLeaf(const PathInData & path) const
    {
        const auto * candidate = findExact(path);
        if (!candidate || !candidate->isScalar())
            return nullptr;
        return candidate;
    }

    using NodePredicate = std::function<bool(const Node &)>;

    /// Finds leaf that satisfies the predicate.
    const Node * findLeaf(const NodePredicate & predicate)
    {
        return findLeaf(root.get(), predicate);
    }

    /// Depth-first search for the first leaf under @node satisfying @predicate.
    static const Node * findLeaf(const Node * node, const NodePredicate & predicate)
    {
        if (!node)
            return nullptr;

        if (node->isScalar())
            return predicate(*node) ? node : nullptr;

        for (const auto & [_, child] : node->children)
            if (const auto * leaf = findLeaf(child.get(), predicate))
                return leaf;

        return nullptr;
    }

    /// Find first parent node that satisfies the predicate.
    /// The node itself is checked first; returns nullptr if nothing matches.
    static const Node * findParent(const Node * node, const NodePredicate & predicate)
    {
        while (node && !predicate(*node))
            node = node->parent;
        return node;
    }

    bool empty() const { return root == nullptr; }
    size_t size() const { return leaves.size(); }

    using Nodes = std::vector<NodePtr>;

    const Nodes & getLeaves() const { return leaves; }
    const Node * getRoot() const { return root.get(); }

    /// Iteration visits leaves only, in insertion order.
    using iterator = typename Nodes::iterator;
    using const_iterator = typename Nodes::const_iterator;

    iterator begin() { return leaves.begin(); }
    iterator end() { return leaves.end(); }

    const_iterator begin() const { return leaves.begin(); }
    const_iterator end() const { return leaves.end(); }

private:
    /// Walks the tree along @path. With @find_exact == false returns the
    /// deepest node reached; otherwise nullptr unless the full path matched.
    const Node * findImpl(const PathInData & path, bool find_exact) const
    {
        if (!root)
            return nullptr;

        const auto & parts = path.getParts();
        const Node * current_node = root.get();

        for (const auto & part : parts)
        {
            auto it = current_node->children.find(part.key);
            if (it == current_node->children.end())
                return find_exact ? nullptr : current_node;

            current_node = it->second.get();
        }

        return current_node;
    }

    NodePtr root;
    Nodes leaves;
};
}

View File

@ -0,0 +1,216 @@
#include <DataTypes/Serializations/JSONDataParser.h>
#include <Common/JSONParsers/SimdJSONParser.h>
#include <IO/ReadBufferFromString.h>
#include <Common/FieldVisitorToString.h>
#include <ostream>
#include <gtest/gtest.h>
#if USE_SIMDJSON
using namespace DB;
/// Flat object with one nested sub-object: two top-level paths under "k2".
const String json1 = R"({"k1" : 1, "k2" : {"k3" : "aa", "k4" : 2}})";

/// Nested(k2 String, k3 Nested(k4 String))
const String json2 =
    R"({"k1" : [
        {
            "k2" : "aaa",
            "k3" : [{ "k4" : "bbb" }, { "k4" : "ccc" }]
        },
        {
            "k2" : "ddd",
            "k3" : [{ "k4" : "eee" }, { "k4" : "fff" }]
        }
    ]
    })";
/// readJSON must extract exactly one complete JSON object from the buffer,
/// leaving any trailing garbage unread.
TEST(JSONDataParser, ReadJSON)
{
    {
        String json_bad = json1 + "aaaaaaa";
        JSONDataParser<SimdJSONParser> parser;
        ReadBufferFromString buf(json_bad);
        String res;
        parser.readJSON(res, buf);
        ASSERT_EQ(json1, res);
    }
    {
        String json_bad = json2 + "aaaaaaa";
        JSONDataParser<SimdJSONParser> parser;
        ReadBufferFromString buf(json_bad);
        String res;
        parser.readJSON(res, buf);
        ASSERT_EQ(json2, res);
    }
}
/// A (path, value) pair extracted by the parser, comparable for test expectations.
struct JSONPathAndValue
{
    PathInData path;
    Field value;

    JSONPathAndValue(const PathInData & path_, const Field & value_)
        : path(path_), value(value_)
    {
    }

    bool operator==(const JSONPathAndValue & other) const = default;
    /// Order by textual path so expected and actual lists can be sorted and compared.
    bool operator<(const JSONPathAndValue & other) const { return path.getPath() < other.path.getPath(); }
};
/// Pretty-printer used by gtest on assertion failures.
static std::ostream & operator<<(std::ostream & ostr, const JSONPathAndValue & path_and_value)
{
    ostr << "{ PathInData{";
    bool first = true;
    for (const auto & part : path_and_value.path.getParts())
    {
        /// Cast to int: UInt8 would otherwise be streamed as a raw
        /// (mostly unprintable) character instead of a number.
        ostr << (first ? "{" : ", {") << part.key << ", " << part.is_nested
            << ", " << static_cast<int>(part.anonymous_array_level) << "}";
        first = false;
    }

    ostr << "}, Field{" << applyVisitor(FieldVisitorToString(), path_and_value.value) << "} }";
    return ostr;
}
using JSONValues = std::vector<JSONPathAndValue>;

/// Parses @json_str and checks that the extracted (path, value) pairs
/// are exactly @expected_values, order-insensitively.
/// @tag is appended to failure messages to identify the case.
static void check(
    const String & json_str,
    const String & tag,
    JSONValues expected_values)
{
    JSONDataParser<SimdJSONParser> parser;
    auto res = parser.parse(json_str.data(), json_str.size());
    ASSERT_TRUE(res.has_value()) << tag;

    const auto & [paths, values] = *res;

    ASSERT_EQ(paths.size(), expected_values.size()) << tag;
    ASSERT_EQ(values.size(), expected_values.size()) << tag;

    JSONValues result_values;
    for (size_t i = 0; i < paths.size(); ++i)
        result_values.emplace_back(paths[i], values[i]);

    /// Compare as sorted sequences: parser output order is not part of the contract.
    std::sort(expected_values.begin(), expected_values.end());
    std::sort(result_values.begin(), result_values.end());

    ASSERT_EQ(result_values, expected_values) << tag;
}
TEST(JSONDataParser, Parse)
{
    {
        check(json1, "json1",
        {
            { PathInData{{{"k1", false, 0}}}, 1 },
            { PathInData{{{"k2", false, 0}, {"k3", false, 0}}}, "aa" },
            { PathInData{{{"k2", false, 0}, {"k4", false, 0}}}, 2 },
        });
    }
    {
        check(json2, "json2",
        {
            { PathInData{{{"k1", true, 0}, {"k2", false, 0}}}, Array{"aaa", "ddd"} },
            { PathInData{{{"k1", true, 0}, {"k3", true, 0}, {"k4", false, 0}}}, Array{Array{"bbb", "ccc"}, Array{"eee", "fff"}} },
        });
    }
    {
        /// Nested(k2 Tuple(k3 Array(Int), k4 Array(Int)), k5 String)
        const String json3 =
            R"({"k1": [
                {
                    "k2": {
                        "k3": [1, 2],
                        "k4": [3, 4]
                    },
                    "k5": "foo"
                },
                {
                    "k2": {
                        "k3": [5, 6],
                        "k4": [7, 8]
                    },
                    "k5": "bar"
                }
            ]})";

        check(json3, "json3",
        {
            { PathInData{{{"k1", true, 0}, {"k5", false, 0}}}, Array{"foo", "bar"} },
            { PathInData{{{"k1", true, 0}, {"k2", false, 0}, {"k3", false, 0}}}, Array{Array{1, 2}, Array{5, 6}} },
            { PathInData{{{"k1", true, 0}, {"k2", false, 0}, {"k4", false, 0}}}, Array{Array{3, 4}, Array{7, 8}} },
        });
    }
    {
        /// Nested(k2 Nested(k3 Int, k4 Int), k5 String)
        const String json4 =
            R"({"k1": [
                {
                    "k2": [{"k3": 1, "k4": 3}, {"k3": 2, "k4": 4}],
                    "k5": "foo"
                },
                {
                    "k2": [{"k3": 5, "k4": 7}, {"k3": 6, "k4": 8}],
                    "k5": "bar"
                }
            ]})";

        check(json4, "json4",
        {
            { PathInData{{{"k1", true, 0}, {"k5", false, 0}}}, Array{"foo", "bar"} },
            { PathInData{{{"k1", true, 0}, {"k2", true, 0}, {"k3", false, 0}}}, Array{Array{1, 2}, Array{5, 6}} },
            { PathInData{{{"k1", true, 0}, {"k2", true, 0}, {"k4", false, 0}}}, Array{Array{3, 4}, Array{7, 8}} },
        });
    }
    {
        /// Array of arrays of scalars is a single non-nested path
        /// with a 2-dimensional array value.
        const String json5 = R"({"k1": [[1, 2, 3], [4, 5], [6]]})";
        check(json5, "json5",
        {
            { PathInData{{{"k1", false, 0}}}, Array{Array{1, 2, 3}, Array{4, 5}, Array{6}} }
        });
    }
    {
        /// Array(Nested(k2 Int, k3 Int)):
        /// the extra anonymous array gives k2/k3 anonymous_array_level = 1.
        const String json6 = R"({
            "k1": [
                [{"k2": 1, "k3": 2}, {"k2": 3, "k3": 4}],
                [{"k2": 5, "k3": 6}]
            ]
        })";

        check(json6, "json6",
        {
            { PathInData{{{"k1", true, 0}, {"k2", false, 1}}}, Array{Array{1, 3}, Array{5}} },
            { PathInData{{{"k1", true, 0}, {"k3", false, 1}}}, Array{Array{2, 4}, Array{6}} },
        });
    }
    {
        /// Nested(k2 Array(Int), k3 Array(Int))
        const String json7 = R"({
            "k1": [
                {"k2": [1, 3], "k3": [2, 4]},
                {"k2": [5], "k3": [6]}
            ]
        })";

        check(json7, "json7",
        {
            { PathInData{{{"k1", true, 0}, {"k2", false, 0}}}, Array{Array{1, 3}, Array{5}} },
            { PathInData{{{"k1", true, 0}, {"k3", false, 0}}}, Array{Array{2, 4}, Array{6}} },
        });
    }
}
#endif

View File

@ -18,6 +18,8 @@
#include <DataTypes/DataTypeEnum.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeFactory.h>
#include <base/EnumReflection.h>
namespace DB
@ -30,28 +32,181 @@ namespace ErrorCodes
namespace
{
String getExceptionMessagePrefix(const DataTypes & types)
String typeToString(const DataTypePtr & type) { return type->getName(); }
String typeToString(const TypeIndex & type) { return String(magic_enum::enum_name(type)); }
template <typename DataTypes>
String getExceptionMessagePrefix(const DataTypes & types)
{
WriteBufferFromOwnString res;
res << "There is no supertype for types ";
bool first = true;
for (const auto & type : types)
{
WriteBufferFromOwnString res;
res << "There is no supertype for types ";
if (!first)
res << ", ";
first = false;
bool first = true;
for (const auto & type : types)
{
if (!first)
res << ", ";
first = false;
res << type->getName();
}
return res.str();
res << typeToString(type);
}
return res.str();
}
DataTypePtr getLeastSupertype(const DataTypes & types)
DataTypePtr getNumericType(const TypeIndexSet & types, bool allow_conversion_to_string)
{
auto throw_or_return = [&](std::string_view message, int error_code)
{
if (allow_conversion_to_string)
return std::make_shared<DataTypeString>();
throw Exception(String(message), error_code);
};
bool all_numbers = true;
size_t max_bits_of_signed_integer = 0;
size_t max_bits_of_unsigned_integer = 0;
size_t max_mantissa_bits_of_floating = 0;
auto maximize = [](size_t & what, size_t value)
{
if (value > what)
what = value;
};
for (const auto & type : types)
{
if (type == TypeIndex::UInt8)
maximize(max_bits_of_unsigned_integer, 8);
else if (type == TypeIndex::UInt16)
maximize(max_bits_of_unsigned_integer, 16);
else if (type == TypeIndex::UInt32)
maximize(max_bits_of_unsigned_integer, 32);
else if (type == TypeIndex::UInt64)
maximize(max_bits_of_unsigned_integer, 64);
else if (type == TypeIndex::UInt128)
maximize(max_bits_of_unsigned_integer, 128);
else if (type == TypeIndex::UInt256)
maximize(max_bits_of_unsigned_integer, 256);
else if (type == TypeIndex::Int8 || type == TypeIndex::Enum8)
maximize(max_bits_of_signed_integer, 8);
else if (type == TypeIndex::Int16 || type == TypeIndex::Enum16)
maximize(max_bits_of_signed_integer, 16);
else if (type == TypeIndex::Int32)
maximize(max_bits_of_signed_integer, 32);
else if (type == TypeIndex::Int64)
maximize(max_bits_of_signed_integer, 64);
else if (type == TypeIndex::Int128)
maximize(max_bits_of_signed_integer, 128);
else if (type == TypeIndex::Int256)
maximize(max_bits_of_signed_integer, 256);
else if (type == TypeIndex::Float32)
maximize(max_mantissa_bits_of_floating, 24);
else if (type == TypeIndex::Float64)
maximize(max_mantissa_bits_of_floating, 53);
else
all_numbers = false;
}
if (max_bits_of_signed_integer || max_bits_of_unsigned_integer || max_mantissa_bits_of_floating)
{
if (!all_numbers)
return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are numbers and some of them are not", ErrorCodes::NO_COMMON_TYPE);
/// If there are signed and unsigned types of same bit-width, the result must be signed number with at least one more bit.
/// Example, common of Int32, UInt32 = Int64.
size_t min_bit_width_of_integer = std::max(max_bits_of_signed_integer, max_bits_of_unsigned_integer);
/// If unsigned is not covered by signed.
if (max_bits_of_signed_integer && max_bits_of_unsigned_integer >= max_bits_of_signed_integer) //-V1051
{
// Because 128 and 256 bit integers are significantly slower, we should not promote to them.
// But if we already have wide numbers, promotion is necessary.
if (min_bit_width_of_integer != 64)
++min_bit_width_of_integer;
else
return throw_or_return(
getExceptionMessagePrefix(types)
+ " because some of them are signed integers and some are unsigned integers,"
" but there is no signed integer type, that can exactly represent all required unsigned integer values",
ErrorCodes::NO_COMMON_TYPE);
}
/// If the result must be floating.
if (max_mantissa_bits_of_floating)
{
size_t min_mantissa_bits = std::max(min_bit_width_of_integer, max_mantissa_bits_of_floating);
if (min_mantissa_bits <= 24)
return std::make_shared<DataTypeFloat32>();
else if (min_mantissa_bits <= 53)
return std::make_shared<DataTypeFloat64>();
else
return throw_or_return(getExceptionMessagePrefix(types)
+ " because some of them are integers and some are floating point,"
" but there is no floating point type, that can exactly represent all required integers", ErrorCodes::NO_COMMON_TYPE);
}
/// If the result must be signed integer.
if (max_bits_of_signed_integer)
{
if (min_bit_width_of_integer <= 8)
return std::make_shared<DataTypeInt8>();
else if (min_bit_width_of_integer <= 16)
return std::make_shared<DataTypeInt16>();
else if (min_bit_width_of_integer <= 32)
return std::make_shared<DataTypeInt32>();
else if (min_bit_width_of_integer <= 64)
return std::make_shared<DataTypeInt64>();
else if (min_bit_width_of_integer <= 128)
return std::make_shared<DataTypeInt128>();
else if (min_bit_width_of_integer <= 256)
return std::make_shared<DataTypeInt256>();
else
return throw_or_return(getExceptionMessagePrefix(types)
+ " because some of them are signed integers and some are unsigned integers,"
" but there is no signed integer type, that can exactly represent all required unsigned integer values", ErrorCodes::NO_COMMON_TYPE);
}
/// All unsigned.
{
if (min_bit_width_of_integer <= 8)
return std::make_shared<DataTypeUInt8>();
else if (min_bit_width_of_integer <= 16)
return std::make_shared<DataTypeUInt16>();
else if (min_bit_width_of_integer <= 32)
return std::make_shared<DataTypeUInt32>();
else if (min_bit_width_of_integer <= 64)
return std::make_shared<DataTypeUInt64>();
else if (min_bit_width_of_integer <= 128)
return std::make_shared<DataTypeUInt128>();
else if (min_bit_width_of_integer <= 256)
return std::make_shared<DataTypeUInt256>();
else
return throw_or_return("Logical error: " + getExceptionMessagePrefix(types)
+ " but as all data types are unsigned integers, we must have found maximum unsigned integer type", ErrorCodes::NO_COMMON_TYPE);
}
}
return {};
}
}
DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_conversion_to_string)
{
auto throw_or_return = [&](std::string_view message, int error_code)
{
if (allow_conversion_to_string)
return std::make_shared<DataTypeString>();
throw Exception(String(message), error_code);
};
/// Trivial cases
if (types.empty())
@ -88,7 +243,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
non_nothing_types.emplace_back(type);
if (non_nothing_types.size() < types.size())
return getLeastSupertype(non_nothing_types);
return getLeastSupertype(non_nothing_types, allow_conversion_to_string);
}
/// For Arrays
@ -113,9 +268,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
if (have_array)
{
if (!all_arrays)
throw Exception(getExceptionMessagePrefix(types) + " because some of them are Array and some of them are not", ErrorCodes::NO_COMMON_TYPE);
return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are Array and some of them are not", ErrorCodes::NO_COMMON_TYPE);
return std::make_shared<DataTypeArray>(getLeastSupertype(nested_types));
return std::make_shared<DataTypeArray>(getLeastSupertype(nested_types, allow_conversion_to_string));
}
}
@ -139,7 +294,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
nested_types[elem_idx].reserve(types.size());
}
else if (tuple_size != type_tuple->getElements().size())
throw Exception(getExceptionMessagePrefix(types) + " because Tuples have different sizes", ErrorCodes::NO_COMMON_TYPE);
return throw_or_return(getExceptionMessagePrefix(types) + " because Tuples have different sizes", ErrorCodes::NO_COMMON_TYPE);
have_tuple = true;
@ -153,11 +308,11 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
if (have_tuple)
{
if (!all_tuples)
throw Exception(getExceptionMessagePrefix(types) + " because some of them are Tuple and some of them are not", ErrorCodes::NO_COMMON_TYPE);
return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are Tuple and some of them are not", ErrorCodes::NO_COMMON_TYPE);
DataTypes common_tuple_types(tuple_size);
for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx)
common_tuple_types[elem_idx] = getLeastSupertype(nested_types[elem_idx]);
common_tuple_types[elem_idx] = getLeastSupertype(nested_types[elem_idx], allow_conversion_to_string);
return std::make_shared<DataTypeTuple>(common_tuple_types);
}
@ -187,9 +342,11 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
if (have_maps)
{
if (!all_maps)
throw Exception(getExceptionMessagePrefix(types) + " because some of them are Maps and some of them are not", ErrorCodes::NO_COMMON_TYPE);
return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are Maps and some of them are not", ErrorCodes::NO_COMMON_TYPE);
return std::make_shared<DataTypeMap>(getLeastSupertype(key_types), getLeastSupertype(value_types));
return std::make_shared<DataTypeMap>(
getLeastSupertype(key_types, allow_conversion_to_string),
getLeastSupertype(value_types, allow_conversion_to_string));
}
}
@ -220,9 +377,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
if (have_low_cardinality)
{
if (have_not_low_cardinality)
return getLeastSupertype(nested_types);
return getLeastSupertype(nested_types, allow_conversion_to_string);
else
return std::make_shared<DataTypeLowCardinality>(getLeastSupertype(nested_types));
return std::make_shared<DataTypeLowCardinality>(getLeastSupertype(nested_types, allow_conversion_to_string));
}
}
@ -248,13 +405,13 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
if (have_nullable)
{
return std::make_shared<DataTypeNullable>(getLeastSupertype(nested_types));
return std::make_shared<DataTypeNullable>(getLeastSupertype(nested_types, allow_conversion_to_string));
}
}
/// Non-recursive rules
std::unordered_set<TypeIndex> type_ids;
TypeIndexSet type_ids;
for (const auto & type : types)
type_ids.insert(type->getTypeId());
@ -268,7 +425,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
{
bool all_strings = type_ids.size() == (have_string + have_fixed_string);
if (!all_strings)
throw Exception(getExceptionMessagePrefix(types) + " because some of them are String/FixedString and some of them are not", ErrorCodes::NO_COMMON_TYPE);
return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are String/FixedString and some of them are not", ErrorCodes::NO_COMMON_TYPE);
return std::make_shared<DataTypeString>();
}
@ -285,7 +442,8 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
{
bool all_date_or_datetime = type_ids.size() == (have_date + have_date32 + have_datetime + have_datetime64);
if (!all_date_or_datetime)
throw Exception(getExceptionMessagePrefix(types) + " because some of them are Date/Date32/DateTime/DateTime64 and some of them are not",
return throw_or_return(getExceptionMessagePrefix(types)
+ " because some of them are Date/Date32/DateTime/DateTime64 and some of them are not",
ErrorCodes::NO_COMMON_TYPE);
if (have_datetime64 == 0 && have_date32 == 0)
@ -362,7 +520,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
}
if (num_supported != type_ids.size())
throw Exception(getExceptionMessagePrefix(types) + " because some of them have no lossless conversion to Decimal",
return throw_or_return(getExceptionMessagePrefix(types) + " because some of them have no lossless conversion to Decimal",
ErrorCodes::NO_COMMON_TYPE);
UInt32 max_scale = 0;
@ -385,7 +543,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
}
if (min_precision > DataTypeDecimal<Decimal128>::maxPrecision())
throw Exception(getExceptionMessagePrefix(types) + " because the least supertype is Decimal("
return throw_or_return(getExceptionMessagePrefix(types) + " because the least supertype is Decimal("
+ toString(min_precision) + ',' + toString(max_scale) + ')',
ErrorCodes::NO_COMMON_TYPE);
@ -399,135 +557,56 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
/// For numeric types, the most complicated part.
{
bool all_numbers = true;
size_t max_bits_of_signed_integer = 0;
size_t max_bits_of_unsigned_integer = 0;
size_t max_mantissa_bits_of_floating = 0;
auto maximize = [](size_t & what, size_t value)
{
if (value > what)
what = value;
};
for (const auto & type : types)
{
if (typeid_cast<const DataTypeUInt8 *>(type.get()))
maximize(max_bits_of_unsigned_integer, 8);
else if (typeid_cast<const DataTypeUInt16 *>(type.get()))
maximize(max_bits_of_unsigned_integer, 16);
else if (typeid_cast<const DataTypeUInt32 *>(type.get()))
maximize(max_bits_of_unsigned_integer, 32);
else if (typeid_cast<const DataTypeUInt64 *>(type.get()))
maximize(max_bits_of_unsigned_integer, 64);
else if (typeid_cast<const DataTypeUInt128 *>(type.get()))
maximize(max_bits_of_unsigned_integer, 128);
else if (typeid_cast<const DataTypeUInt256 *>(type.get()))
maximize(max_bits_of_unsigned_integer, 256);
else if (typeid_cast<const DataTypeInt8 *>(type.get()) || typeid_cast<const DataTypeEnum8 *>(type.get()))
maximize(max_bits_of_signed_integer, 8);
else if (typeid_cast<const DataTypeInt16 *>(type.get()) || typeid_cast<const DataTypeEnum16 *>(type.get()))
maximize(max_bits_of_signed_integer, 16);
else if (typeid_cast<const DataTypeInt32 *>(type.get()))
maximize(max_bits_of_signed_integer, 32);
else if (typeid_cast<const DataTypeInt64 *>(type.get()))
maximize(max_bits_of_signed_integer, 64);
else if (typeid_cast<const DataTypeInt128 *>(type.get()))
maximize(max_bits_of_signed_integer, 128);
else if (typeid_cast<const DataTypeInt256 *>(type.get()))
maximize(max_bits_of_signed_integer, 256);
else if (typeid_cast<const DataTypeFloat32 *>(type.get()))
maximize(max_mantissa_bits_of_floating, 24);
else if (typeid_cast<const DataTypeFloat64 *>(type.get()))
maximize(max_mantissa_bits_of_floating, 53);
else
all_numbers = false;
}
if (max_bits_of_signed_integer || max_bits_of_unsigned_integer || max_mantissa_bits_of_floating)
{
if (!all_numbers)
throw Exception(getExceptionMessagePrefix(types) + " because some of them are numbers and some of them are not", ErrorCodes::NO_COMMON_TYPE);
/// If there are signed and unsigned types of same bit-width, the result must be signed number with at least one more bit.
/// Example, common of Int32, UInt32 = Int64.
size_t min_bit_width_of_integer = std::max(max_bits_of_signed_integer, max_bits_of_unsigned_integer);
/// If unsigned is not covered by signed.
if (max_bits_of_signed_integer && max_bits_of_unsigned_integer >= max_bits_of_signed_integer) //-V1051
{
// Because 128 and 256 bit integers are significantly slower, we should not promote to them.
// But if we already have wide numbers, promotion is necessary.
if (min_bit_width_of_integer != 64)
++min_bit_width_of_integer;
else
throw Exception(
getExceptionMessagePrefix(types)
+ " because some of them are signed integers and some are unsigned integers,"
" but there is no signed integer type, that can exactly represent all required unsigned integer values",
ErrorCodes::NO_COMMON_TYPE);
}
/// If the result must be floating.
if (max_mantissa_bits_of_floating)
{
size_t min_mantissa_bits = std::max(min_bit_width_of_integer, max_mantissa_bits_of_floating);
if (min_mantissa_bits <= 24)
return std::make_shared<DataTypeFloat32>();
else if (min_mantissa_bits <= 53)
return std::make_shared<DataTypeFloat64>();
else
throw Exception(getExceptionMessagePrefix(types)
+ " because some of them are integers and some are floating point,"
" but there is no floating point type, that can exactly represent all required integers", ErrorCodes::NO_COMMON_TYPE);
}
/// If the result must be signed integer.
if (max_bits_of_signed_integer)
{
if (min_bit_width_of_integer <= 8)
return std::make_shared<DataTypeInt8>();
else if (min_bit_width_of_integer <= 16)
return std::make_shared<DataTypeInt16>();
else if (min_bit_width_of_integer <= 32)
return std::make_shared<DataTypeInt32>();
else if (min_bit_width_of_integer <= 64)
return std::make_shared<DataTypeInt64>();
else if (min_bit_width_of_integer <= 128)
return std::make_shared<DataTypeInt128>();
else if (min_bit_width_of_integer <= 256)
return std::make_shared<DataTypeInt256>();
else
throw Exception(getExceptionMessagePrefix(types)
+ " because some of them are signed integers and some are unsigned integers,"
" but there is no signed integer type, that can exactly represent all required unsigned integer values", ErrorCodes::NO_COMMON_TYPE);
}
/// All unsigned.
{
if (min_bit_width_of_integer <= 8)
return std::make_shared<DataTypeUInt8>();
else if (min_bit_width_of_integer <= 16)
return std::make_shared<DataTypeUInt16>();
else if (min_bit_width_of_integer <= 32)
return std::make_shared<DataTypeUInt32>();
else if (min_bit_width_of_integer <= 64)
return std::make_shared<DataTypeUInt64>();
else if (min_bit_width_of_integer <= 128)
return std::make_shared<DataTypeUInt128>();
else if (min_bit_width_of_integer <= 256)
return std::make_shared<DataTypeUInt256>();
else
throw Exception("Logical error: " + getExceptionMessagePrefix(types)
+ " but as all data types are unsigned integers, we must have found maximum unsigned integer type", ErrorCodes::NO_COMMON_TYPE);
}
}
auto numeric_type = getNumericType(type_ids, allow_conversion_to_string);
if (numeric_type)
return numeric_type;
}
/// All other data types (UUID, AggregateFunction, Enum...) are compatible only if they are the same (checked in trivial cases).
throw Exception(getExceptionMessagePrefix(types), ErrorCodes::NO_COMMON_TYPE);
return throw_or_return(getExceptionMessagePrefix(types), ErrorCodes::NO_COMMON_TYPE);
}
DataTypePtr getLeastSupertype(const TypeIndexSet & types, bool allow_conversion_to_string)
{
auto throw_or_return = [&](std::string_view message, int error_code)
{
if (allow_conversion_to_string)
return std::make_shared<DataTypeString>();
throw Exception(String(message), error_code);
};
TypeIndexSet types_set;
for (const auto & type : types)
{
if (WhichDataType(type).isNothing())
continue;
if (!WhichDataType(type).isSimple())
throw Exception(ErrorCodes::NO_COMMON_TYPE,
"Cannot get common type by type ids with parametric type {}", typeToString(type));
types_set.insert(type);
}
if (types_set.empty())
return std::make_shared<DataTypeNothing>();
if (types.count(TypeIndex::String))
{
if (types.size() != 1)
return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are String and some of them are not", ErrorCodes::NO_COMMON_TYPE);
return std::make_shared<DataTypeString>();
}
/// For numeric types, the most complicated part.
auto numeric_type = getNumericType(types, allow_conversion_to_string);
if (numeric_type)
return numeric_type;
/// All other data types (UUID, AggregateFunction, Enum...) are compatible only if they are the same (checked in trivial cases).
return throw_or_return(getExceptionMessagePrefix(types), ErrorCodes::NO_COMMON_TYPE);
}
DataTypePtr tryGetLeastSupertype(const DataTypes & types)

View File

@ -7,12 +7,16 @@ namespace DB
{
/** Get data type that covers all possible values of passed data types.
* If there is no such data type, throws an exception.
* If there is no such data type, throws an exception
* or if 'allow_conversion_to_string' is true returns String as common type.
*
* Examples: least common supertype for UInt8, Int8 - Int16.
* Examples: there is no least common supertype for Array(UInt8), Int8.
*/
DataTypePtr getLeastSupertype(const DataTypes & types);
DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_conversion_to_string = false);
using TypeIndexSet = std::unordered_set<TypeIndex>;
DataTypePtr getLeastSupertype(const TypeIndexSet & types, bool allow_conversion_to_string = false);
/// Same as above but return nullptr instead of throwing exception.
DataTypePtr tryGetLeastSupertype(const DataTypes & types);

View File

@ -406,13 +406,24 @@ ASTPtr DatabasePostgreSQL::getCreateTableQueryImpl(const String & table_name, Co
ASTs storage_children = ast_storage->children;
auto storage_engine_arguments = ast_storage->engine->arguments;
/// Remove extra engine argument (`schema` and `use_table_cache`)
if (storage_engine_arguments->children.size() >= 5)
storage_engine_arguments->children.resize(4);
if (storage_engine_arguments->children.empty())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected number of arguments: {}", storage_engine_arguments->children.size());
/// Add table_name to engine arguments
assert(storage_engine_arguments->children.size() >= 2);
storage_engine_arguments->children.insert(storage_engine_arguments->children.begin() + 2, std::make_shared<ASTLiteral>(table_id.table_name));
/// Check for named collection.
if (typeid_cast<ASTIdentifier *>(storage_engine_arguments->children[0].get()))
{
storage_engine_arguments->children.push_back(makeASTFunction("equals", std::make_shared<ASTIdentifier>("table"), std::make_shared<ASTLiteral>(table_id.table_name)));
}
else
{
/// Remove extra engine argument (`schema` and `use_table_cache`)
if (storage_engine_arguments->children.size() >= 5)
storage_engine_arguments->children.resize(4);
/// Add table_name to engine arguments.
if (storage_engine_arguments->children.size() >= 2)
storage_engine_arguments->children.insert(storage_engine_arguments->children.begin() + 2, std::make_shared<ASTLiteral>(table_id.table_name));
}
return create_table_query;
}

View File

@ -9,9 +9,9 @@
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeMap.h>
#include <Functions/SimdJSONParser.h>
#include <Functions/RapidJSONParser.h>
#include <Functions/DummyJSONParser.h>
#include <Common/JSONParsers/SimdJSONParser.h>
#include <Common/JSONParsers/RapidJSONParser.h>
#include <Common/JSONParsers/DummyJSONParser.h>
#include <base/find_symbols.h>
@ -169,6 +169,10 @@ DataTypePtr getDataTypeFromJSONFieldImpl(const Element & field)
value_type = type;
}
if (!value_type)
return nullptr;
return std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), value_type);
}

View File

@ -74,6 +74,7 @@ void registerOutputFormatCapnProto(FormatFactory & factory);
void registerInputFormatRegexp(FormatFactory & factory);
void registerInputFormatJSONAsString(FormatFactory & factory);
void registerInputFormatJSONAsObject(FormatFactory & factory);
void registerInputFormatLineAsString(FormatFactory & factory);
void registerInputFormatCapnProto(FormatFactory & factory);
@ -84,6 +85,7 @@ void registerInputFormatHiveText(FormatFactory & factory);
/// Non trivial prefix and suffix checkers for disabling parallel parsing.
void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory);
void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factory);
void registerNonTrivialPrefixAndSuffixCheckerJSONAsObject(FormatFactory & factory);
void registerArrowSchemaReader(FormatFactory & factory);
void registerParquetSchemaReader(FormatFactory & factory);
@ -175,6 +177,7 @@ void registerFormats()
registerInputFormatRegexp(factory);
registerInputFormatJSONAsString(factory);
registerInputFormatLineAsString(factory);
registerInputFormatJSONAsObject(factory);
#if USE_HIVE
registerInputFormatHiveText(factory);
#endif
@ -183,6 +186,7 @@ void registerFormats()
registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(factory);
registerNonTrivialPrefixAndSuffixCheckerJSONAsString(factory);
registerNonTrivialPrefixAndSuffixCheckerJSONAsObject(factory);
registerArrowSchemaReader(factory);
registerParquetSchemaReader(factory);

View File

@ -33,22 +33,27 @@ public:
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
explicit CastOverloadResolverImpl(std::optional<Diagnostic> diagnostic_, bool keep_nullable_)
: diagnostic(std::move(diagnostic_)), keep_nullable(keep_nullable_)
explicit CastOverloadResolverImpl(std::optional<Diagnostic> diagnostic_, bool keep_nullable_, bool cast_ipv4_ipv6_default_on_conversion_error_)
: diagnostic(std::move(diagnostic_))
, keep_nullable(keep_nullable_)
, cast_ipv4_ipv6_default_on_conversion_error(cast_ipv4_ipv6_default_on_conversion_error_)
{
}
static FunctionOverloadResolverPtr create(ContextPtr context)
{
const auto & settings_ref = context->getSettingsRef();
if constexpr (internal)
return createImpl();
return createImpl({}, context->getSettingsRef().cast_keep_nullable);
return createImpl({}, false /*keep_nullable*/, false /*cast_ipv4_ipv6_default_on_conversion_error*/);
return createImpl({}, settings_ref.cast_keep_nullable, settings_ref.cast_ipv4_ipv6_default_on_conversion_error);
}
static FunctionOverloadResolverPtr createImpl(std::optional<Diagnostic> diagnostic = {}, bool keep_nullable = false)
static FunctionOverloadResolverPtr createImpl(std::optional<Diagnostic> diagnostic = {}, bool keep_nullable = false, bool cast_ipv4_ipv6_default_on_conversion_error = false)
{
assert(!internal || !keep_nullable);
return std::make_unique<CastOverloadResolverImpl>(std::move(diagnostic), keep_nullable);
return std::make_unique<CastOverloadResolverImpl>(std::move(diagnostic), keep_nullable, cast_ipv4_ipv6_default_on_conversion_error);
}
protected:
@ -61,7 +66,7 @@ protected:
data_types[i] = arguments[i].type;
auto monotonicity = MonotonicityHelper::getMonotonicityInformation(arguments.front().type, return_type.get());
return std::make_unique<FunctionCast<FunctionName>>(name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type);
return std::make_unique<FunctionCast<FunctionName>>(name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type, cast_ipv4_ipv6_default_on_conversion_error);
}
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
@ -98,6 +103,7 @@ protected:
private:
std::optional<Diagnostic> diagnostic;
bool keep_nullable;
bool cast_ipv4_ipv6_default_on_conversion_error;
};
@ -115,7 +121,10 @@ struct CastInternalOverloadName
static constexpr auto accurate_cast_or_null_name = "accurate_CastOrNull";
};
template <CastType cast_type> using CastOverloadResolver = CastOverloadResolverImpl<cast_type, false, CastOverloadName, CastName>;
template <CastType cast_type> using CastInternalOverloadResolver = CastOverloadResolverImpl<cast_type, true, CastInternalOverloadName, CastInternalName>;
template <CastType cast_type>
using CastOverloadResolver = CastOverloadResolverImpl<cast_type, false, CastOverloadName, CastName>;
template <CastType cast_type>
using CastInternalOverloadResolver = CastOverloadResolverImpl<cast_type, true, CastInternalOverloadName, CastInternalName>;
}

View File

@ -8,13 +8,13 @@
#include <Core/Settings.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/DummyJSONParser.h>
#include <Common/JSONParsers/DummyJSONParser.h>
#include <Functions/IFunction.h>
#include <Functions/JSONPath/ASTs/ASTJSONPath.h>
#include <Functions/JSONPath/Generator/GeneratorJSONPath.h>
#include <Functions/JSONPath/Parsers/ParserJSONPath.h>
#include <Functions/RapidJSONParser.h>
#include <Functions/SimdJSONParser.h>
#include <Common/JSONParsers/RapidJSONParser.h>
#include <Common/JSONParsers/SimdJSONParser.h>
#include <Interpreters/Context.h>
#include <Parsers/IParser.h>
#include <Parsers/Lexer.h>

View File

@ -8,26 +8,21 @@
namespace DB
{
/* Determine language and charset of text data. For each text, we build the distribution of bigrams bytes.
* Then we use marked-up dictionaries with distributions of bigram bytes of various languages and charsets.
* Using a naive Bayesian classifier, find the most likely charset and language and return it
*/
template <bool detect_language>
struct CharsetClassificationImpl
namespace
{
/* We need to solve zero-frequency problem for Naive Bayes Classifier
* If the bigram is not found in the text, we assume that the probability of its meeting is 1e-06.
* 1e-06 is minimal value in our marked-up dictionary.
*/
static constexpr Float64 zero_frequency = 1e-06;
constexpr Float64 zero_frequency = 1e-06;
/// If the data size is bigger than this, behaviour is unspecified for this function.
static constexpr size_t max_string_size = 1u << 15;
constexpr size_t max_string_size = 1UL << 15;
static ALWAYS_INLINE inline Float64 naiveBayes(
template <typename ModelMap>
ALWAYS_INLINE inline Float64 naiveBayes(
const FrequencyHolder::EncodingMap & standard,
const HashMap<UInt16, UInt64> & model,
const ModelMap & model,
Float64 max_result)
{
Float64 res = 0;
@ -52,10 +47,11 @@ struct CharsetClassificationImpl
}
/// Сount how many times each bigram occurs in the text.
static ALWAYS_INLINE inline void calculateStats(
template <typename ModelMap>
ALWAYS_INLINE inline void calculateStats(
const UInt8 * data,
const size_t size,
HashMap<UInt16, UInt64> & model)
ModelMap & model)
{
UInt16 hash = 0;
for (size_t i = 0; i < size; ++i)
@ -65,7 +61,15 @@ struct CharsetClassificationImpl
++model[hash];
}
}
}
/* Determine language and charset of text data. For each text, we build the distribution of bigrams bytes.
* Then we use marked-up dictionaries with distributions of bigram bytes of various languages and charsets.
* Using a naive Bayesian classifier, find the most likely charset and language and return it
*/
template <bool detect_language>
struct CharsetClassificationImpl
{
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
@ -74,7 +78,7 @@ struct CharsetClassificationImpl
{
const auto & encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
if (detect_language)
if constexpr (detect_language)
/// 2 chars for ISO code + 1 zero byte
res_data.reserve(offsets.size() * 3);
else
@ -83,37 +87,43 @@ struct CharsetClassificationImpl
res_offsets.resize(offsets.size());
size_t res_offset = 0;
size_t current_result_offset = 0;
double zero_frequency_log = log(zero_frequency);
for (size_t i = 0; i < offsets.size(); ++i)
{
const UInt8 * str = data.data() + offsets[i - 1];
const size_t str_len = offsets[i] - offsets[i - 1] - 1;
std::string_view res;
HashMap<UInt16, UInt64> model;
HashMapWithStackMemory<UInt16, UInt64, DefaultHash<UInt16>, 4> model;
calculateStats(str, str_len, model);
std::string_view result_value;
/// Go through the dictionary and find the charset with the highest weight
Float64 max_result = log(zero_frequency) * (max_string_size);
Float64 max_result = zero_frequency_log * (max_string_size);
for (const auto & item : encodings_freq)
{
Float64 score = naiveBayes(item.map, model, max_result);
if (max_result < score)
{
max_result = score;
res = detect_language ? item.lang : item.name;
if constexpr (detect_language)
result_value = item.lang;
else
result_value = item.name;
}
}
res_data.resize(res_offset + res.size() + 1);
memcpy(&res_data[res_offset], res.data(), res.size());
size_t result_value_size = result_value.size();
res_data.resize(current_result_offset + result_value_size + 1);
memcpy(&res_data[current_result_offset], result_value.data(), result_value_size);
res_data[current_result_offset + result_value_size] = '\0';
current_result_offset += result_value_size + 1;
res_data[res_offset + res.size()] = 0;
res_offset += res.size() + 1;
res_offsets[i] = res_offset;
res_offsets[i] = current_result_offset;
}
}
};

Some files were not shown because too many files have changed in this diff Show More