mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 08:32:02 +00:00
Merge branch 'master' into parallel-downloading-url-engine
This commit is contained in:
commit
103a3fa140
1
.gitattributes
vendored
1
.gitattributes
vendored
@ -1,2 +1,3 @@
|
||||
contrib/* linguist-vendored
|
||||
*.h linguist-language=C++
|
||||
tests/queries/0_stateless/data_json/* binary
|
||||
|
@ -60,5 +60,5 @@ clientPort=2181 \n\
|
||||
maxClientCnxns=80' > /opt/zookeeper/conf/zoo.cfg
|
||||
RUN mkdir /zookeeper && chmod -R 777 /zookeeper
|
||||
|
||||
ENV TZ=Europe/Moscow
|
||||
ENV TZ=Etc/UTC
|
||||
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
|
||||
|
@ -40,7 +40,7 @@ RUN apt-get update \
|
||||
/tmp/* \
|
||||
&& apt-get clean
|
||||
|
||||
ENV TZ=Europe/Moscow
|
||||
ENV TZ=Etc/UTC
|
||||
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
|
||||
|
||||
ENV DOCKER_CHANNEL stable
|
||||
|
@ -13,10 +13,18 @@ Alias: `INET_NTOA`.
|
||||
|
||||
## IPv4StringToNum(s) {#ipv4stringtonums}
|
||||
|
||||
The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it returns 0.
|
||||
The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it throws an exception.
|
||||
|
||||
Alias: `INET_ATON`.
|
||||
|
||||
## IPv4StringToNumOrDefault(s) {#ipv4stringtonumordefaults}
|
||||
|
||||
Same as `IPv4StringToNum`, but if the IPv4 address has an invalid format, it returns 0.
|
||||
|
||||
## IPv4StringToNumOrNull(s) {#ipv4stringtonumornulls}
|
||||
|
||||
Same as `IPv4StringToNum`, but if the IPv4 address has an invalid format, it returns null.
|
||||
|
||||
## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum}
|
||||
|
||||
Similar to IPv4NumToString, but using xxx instead of the last octet.
|
||||
@ -123,7 +131,7 @@ LIMIT 10
|
||||
|
||||
## IPv6StringToNum {#ipv6stringtonums}
|
||||
|
||||
The reverse function of [IPv6NumToString](#ipv6numtostringx). If the IPv6 address has an invalid format, it returns a string of null bytes.
|
||||
The reverse function of [IPv6NumToString](#ipv6numtostringx). If the IPv6 address has an invalid format, it throws an exception.
|
||||
|
||||
If the input string contains a valid IPv4 address, returns its IPv6 equivalent.
|
||||
HEX can be uppercase or lowercase.
|
||||
@ -168,6 +176,14 @@ Result:
|
||||
|
||||
- [cutIPv6](#cutipv6x-bytestocutforipv6-bytestocutforipv4).
|
||||
|
||||
## IPv6StringToNumOrDefault(s) {#ipv6stringtonumordefaults}
|
||||
|
||||
Same as `IPv6StringToNum`, but if the IPv6 address has an invalid format, it returns 0.
|
||||
|
||||
## IPv6StringToNumOrNull(s) {#ipv6stringtonumornulls}
|
||||
|
||||
Same as `IPv6StringToNum`, but if the IPv6 address has an invalid format, it returns null.
|
||||
|
||||
## IPv4ToIPv6(x) {#ipv4toipv6x}
|
||||
|
||||
Takes a `UInt32` number. Interprets it as an IPv4 address in [big endian](https://en.wikipedia.org/wiki/Endianness). Returns a `FixedString(16)` value containing the IPv6 address in binary format. Examples:
|
||||
@ -261,6 +277,14 @@ SELECT
|
||||
└───────────────────────────────────┴──────────────────────────┘
|
||||
```
|
||||
|
||||
## toIPv4OrDefault(string) {#toipv4ordefaultstring}
|
||||
|
||||
Same as `toIPv4`, but if the IPv4 address has an invalid format, it returns 0.
|
||||
|
||||
## toIPv4OrNull(string) {#toipv4ornullstring}
|
||||
|
||||
Same as `toIPv4`, but if the IPv4 address has an invalid format, it returns null.
|
||||
|
||||
## toIPv6 {#toipv6string}
|
||||
|
||||
Converts a string form of IPv6 address to [IPv6](../../sql-reference/data-types/domains/ipv6.md) type. If the IPv6 address has an invalid format, returns an empty value.
|
||||
@ -317,6 +341,14 @@ Result:
|
||||
└─────────────────────┘
|
||||
```
|
||||
|
||||
## toIPv6OrDefault(string) {#toipv6ordefaultstring}
|
||||
|
||||
Same as `toIPv6`, but if the IPv6 address has an invalid format, it returns 0.
|
||||
|
||||
## toIPv6OrNull(string) {#toipv6ornullstring}
|
||||
|
||||
Same as `toIPv6`, but if the IPv6 address has an invalid format, it returns null.
|
||||
|
||||
## isIPv4String {#isipv4string}
|
||||
|
||||
Determines whether the input string is an IPv4 address or not. If `string` is an IPv6 address, returns `0`.
|
||||
|
48
docs/en/sql-reference/functions/statistics.md
Normal file
48
docs/en/sql-reference/functions/statistics.md
Normal file
@ -0,0 +1,48 @@
|
||||
---
|
||||
toc_priority: 69
|
||||
toc_title: Statistics
|
||||
---
|
||||
|
||||
# Functions for Working with Statistics {#functions-for-working-with-statistics}
|
||||
|
||||
## proportionsZTest {#proportionsztest}
|
||||
|
||||
Applies proportion z-test to samples from two populations (X and Y). The alternative is 'two-sided'.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
proportionsZTest(successes_x, successes_y, trials_x, trials_y, significance_level, usevar)
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `successes_x` — The number of successes for X in trials.
|
||||
- `successes_y` — The number of successes for Y in trials.
|
||||
- `trials_x` — The number of trials for X.
|
||||
- `trials_y` — The number of trials for Y.
|
||||
- `significance_level`
|
||||
- `usevar` - It can be `'pooled'` or `'unpooled'`.
|
||||
- `'pooled'` - The variances of the two populations are assumed to be equal.
|
||||
- `'unpooled'` - The assumption of equal variances is dropped.
|
||||
|
||||
**Returned value**
|
||||
|
||||
- A tuple with the (z-statistic, p-value, confidence-interval-lower, confidence-interval-upper).
|
||||
|
||||
Type: [Tuple](../../sql-reference/data-types/tuple.md).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled');
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
(-0.20656724435948853,0.8363478437079654,-0.09345975390115283,0.07563797172293502)
|
||||
```
|
||||
|
@ -1,4 +1,4 @@
|
||||
Babel==2.8.0
|
||||
Babel==2.9.1
|
||||
backports-abc==0.5
|
||||
backports.functools-lru-cache==1.6.1
|
||||
beautifulsoup4==4.9.1
|
||||
@ -10,22 +10,22 @@ cssmin==0.2.0
|
||||
future==0.18.2
|
||||
htmlmin==0.1.12
|
||||
idna==2.10
|
||||
Jinja2>=2.11.3
|
||||
Jinja2>=3.0.3
|
||||
jinja2-highlight==0.6.1
|
||||
jsmin==3.0.0
|
||||
livereload==2.6.2
|
||||
livereload==2.6.3
|
||||
Markdown==3.3.2
|
||||
MarkupSafe==1.1.1
|
||||
MarkupSafe==2.1.0
|
||||
mkdocs==1.1.2
|
||||
mkdocs-htmlproofer-plugin==0.0.3
|
||||
mkdocs-macros-plugin==0.4.20
|
||||
nltk==3.5
|
||||
nltk==3.7
|
||||
nose==1.3.7
|
||||
protobuf==3.14.0
|
||||
numpy==1.21.2
|
||||
pymdown-extensions==8.0
|
||||
python-slugify==4.0.1
|
||||
PyYAML==5.4.1
|
||||
PyYAML==6.0
|
||||
repackage==0.7.3
|
||||
requests==2.25.1
|
||||
singledispatch==3.4.0.3
|
||||
@ -34,5 +34,6 @@ soupsieve==2.0.1
|
||||
termcolor==1.1.0
|
||||
tornado==6.1
|
||||
Unidecode==1.1.1
|
||||
urllib3>=1.26.5
|
||||
Pygments>=2.7.4
|
||||
urllib3>=1.26.8
|
||||
Pygments>=2.11.2
|
||||
|
||||
|
@ -787,6 +787,7 @@ void Client::printHelpMessage(const OptionsDescription & options_description)
|
||||
{
|
||||
std::cout << options_description.main_description.value() << "\n";
|
||||
std::cout << options_description.external_description.value() << "\n";
|
||||
std::cout << options_description.hosts_and_ports_description.value() << "\n";
|
||||
std::cout << "In addition, --param_name=value can be specified for substitution of parameters for parametrized queries.\n";
|
||||
}
|
||||
|
||||
|
@ -304,8 +304,8 @@ void LocalServer::setupUsers()
|
||||
|
||||
ConfigurationPtr users_config;
|
||||
auto & access_control = global_context->getAccessControl();
|
||||
access_control.setPlaintextPasswordSetting(config().getBool("allow_plaintext_password", true));
|
||||
access_control.setNoPasswordSetting(config().getBool("allow_no_password", true));
|
||||
access_control.setNoPasswordAllowed(config().getBool("allow_no_password", true));
|
||||
access_control.setPlaintextPasswordAllowed(config().getBool("allow_plaintext_password", true));
|
||||
if (config().has("users_config") || config().has("config-file") || fs::exists("config.xml"))
|
||||
{
|
||||
const auto users_config_path = config().getString("users_config", config().getString("config-file", "config.xml"));
|
||||
|
@ -1074,9 +1074,10 @@ if (ThreadFuzzer::instance().isEffective())
|
||||
auto & access_control = global_context->getAccessControl();
|
||||
if (config().has("custom_settings_prefixes"))
|
||||
access_control.setCustomSettingsPrefixes(config().getString("custom_settings_prefixes"));
|
||||
///set the allow_plaintext_and_no_password setting in context.
|
||||
access_control.setPlaintextPasswordSetting(config().getBool("allow_plaintext_password", true));
|
||||
access_control.setNoPasswordSetting(config().getBool("allow_no_password", true));
|
||||
|
||||
access_control.setNoPasswordAllowed(config().getBool("allow_no_password", true));
|
||||
access_control.setPlaintextPasswordAllowed(config().getBool("allow_plaintext_password", true));
|
||||
|
||||
/// Initialize access storages.
|
||||
try
|
||||
{
|
||||
|
@ -368,7 +368,7 @@
|
||||
<!-- Path to temporary data for processing hard queries. -->
|
||||
<tmp_path>/var/lib/clickhouse/tmp/</tmp_path>
|
||||
|
||||
<!-- Disable AuthType Plaintext_password and No_password for ACL. -->
|
||||
<!-- Disable AuthType plaintext_password and no_password for ACL. -->
|
||||
<!-- <allow_plaintext_password>0</allow_plaintext_password> -->
|
||||
<!-- <allow_no_password>0</allow_no_password> -->
|
||||
|
||||
|
@ -173,7 +173,8 @@ void AccessControl::addUsersConfigStorage(const String & storage_name_, const Po
|
||||
auto check_setting_name_function = [this](const std::string_view & setting_name) { checkSettingNameIsAllowed(setting_name); };
|
||||
auto is_no_password_allowed_function = [this]() -> bool { return isNoPasswordAllowed(); };
|
||||
auto is_plaintext_password_allowed_function = [this]() -> bool { return isPlaintextPasswordAllowed(); };
|
||||
auto new_storage = std::make_shared<UsersConfigAccessStorage>(storage_name_, check_setting_name_function,is_no_password_allowed_function,is_plaintext_password_allowed_function);
|
||||
auto new_storage = std::make_shared<UsersConfigAccessStorage>(storage_name_, check_setting_name_function,
|
||||
is_no_password_allowed_function, is_plaintext_password_allowed_function);
|
||||
new_storage->setConfig(users_config_);
|
||||
addStorage(new_storage);
|
||||
LOG_DEBUG(getLogger(), "Added {} access storage '{}', path: {}",
|
||||
@ -209,7 +210,8 @@ void AccessControl::addUsersConfigStorage(
|
||||
auto check_setting_name_function = [this](const std::string_view & setting_name) { checkSettingNameIsAllowed(setting_name); };
|
||||
auto is_no_password_allowed_function = [this]() -> bool { return isNoPasswordAllowed(); };
|
||||
auto is_plaintext_password_allowed_function = [this]() -> bool { return isPlaintextPasswordAllowed(); };
|
||||
auto new_storage = std::make_shared<UsersConfigAccessStorage>(storage_name_, check_setting_name_function,is_no_password_allowed_function,is_plaintext_password_allowed_function);
|
||||
auto new_storage = std::make_shared<UsersConfigAccessStorage>(storage_name_, check_setting_name_function,
|
||||
is_no_password_allowed_function, is_plaintext_password_allowed_function);
|
||||
new_storage->load(users_config_path_, include_from_path_, preprocessed_dir_, get_zookeeper_function_);
|
||||
addStorage(new_storage);
|
||||
LOG_DEBUG(getLogger(), "Added {} access storage '{}', path: {}", String(new_storage->getStorageType()), new_storage->getStorageName(), new_storage->getPath());
|
||||
@ -411,7 +413,8 @@ UUID AccessControl::authenticate(const Credentials & credentials, const Poco::Ne
|
||||
{
|
||||
try
|
||||
{
|
||||
return MultipleAccessStorage::authenticate(credentials, address, *external_authenticators,allow_no_password, allow_plaintext_password);
|
||||
return MultipleAccessStorage::authenticate(credentials, address, *external_authenticators, allow_no_password,
|
||||
allow_plaintext_password);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
@ -447,26 +450,38 @@ void AccessControl::setCustomSettingsPrefixes(const String & comma_separated_pre
|
||||
setCustomSettingsPrefixes(prefixes);
|
||||
}
|
||||
|
||||
void AccessControl::setPlaintextPasswordSetting(bool allow_plaintext_password_)
|
||||
{
|
||||
allow_plaintext_password = allow_plaintext_password_;
|
||||
}
|
||||
void AccessControl::setNoPasswordSetting(bool allow_no_password_)
|
||||
{
|
||||
allow_no_password = allow_no_password_;
|
||||
}
|
||||
|
||||
bool AccessControl::isSettingNameAllowed(const std::string_view & setting_name) const
|
||||
bool AccessControl::isSettingNameAllowed(const std::string_view setting_name) const
|
||||
{
|
||||
return custom_settings_prefixes->isSettingNameAllowed(setting_name);
|
||||
}
|
||||
|
||||
void AccessControl::checkSettingNameIsAllowed(const std::string_view & setting_name) const
|
||||
void AccessControl::checkSettingNameIsAllowed(const std::string_view setting_name) const
|
||||
{
|
||||
custom_settings_prefixes->checkSettingNameIsAllowed(setting_name);
|
||||
}
|
||||
|
||||
|
||||
void AccessControl::setNoPasswordAllowed(bool allow_no_password_)
|
||||
{
|
||||
allow_no_password = allow_no_password_;
|
||||
}
|
||||
|
||||
bool AccessControl::isNoPasswordAllowed() const
|
||||
{
|
||||
return allow_no_password;
|
||||
}
|
||||
|
||||
void AccessControl::setPlaintextPasswordAllowed(bool allow_plaintext_password_)
|
||||
{
|
||||
allow_plaintext_password = allow_plaintext_password_;
|
||||
}
|
||||
|
||||
bool AccessControl::isPlaintextPasswordAllowed() const
|
||||
{
|
||||
return allow_plaintext_password;
|
||||
}
|
||||
|
||||
|
||||
std::shared_ptr<const ContextAccess> AccessControl::getContextAccess(
|
||||
const UUID & user_id,
|
||||
const std::vector<UUID> & current_roles,
|
||||
@ -550,15 +565,6 @@ std::vector<QuotaUsage> AccessControl::getAllQuotasUsage() const
|
||||
return quota_cache->getAllQuotasUsage();
|
||||
}
|
||||
|
||||
bool AccessControl::isPlaintextPasswordAllowed() const
|
||||
{
|
||||
return allow_plaintext_password;
|
||||
}
|
||||
|
||||
bool AccessControl::isNoPasswordAllowed() const
|
||||
{
|
||||
return allow_no_password;
|
||||
}
|
||||
|
||||
std::shared_ptr<const EnabledSettings> AccessControl::getEnabledSettings(
|
||||
const UUID & user_id,
|
||||
|
@ -49,8 +49,6 @@ class AccessControl : public MultipleAccessStorage
|
||||
public:
|
||||
AccessControl();
|
||||
~AccessControl() override;
|
||||
std::atomic_bool allow_plaintext_password;
|
||||
std::atomic_bool allow_no_password;
|
||||
|
||||
/// Parses access entities from a configuration loaded from users.xml.
|
||||
/// This function adds UsersConfigAccessStorage if it wasn't added before.
|
||||
@ -113,12 +111,16 @@ public:
|
||||
/// This function also enables custom prefixes to be used.
|
||||
void setCustomSettingsPrefixes(const Strings & prefixes);
|
||||
void setCustomSettingsPrefixes(const String & comma_separated_prefixes);
|
||||
bool isSettingNameAllowed(const std::string_view & name) const;
|
||||
void checkSettingNameIsAllowed(const std::string_view & name) const;
|
||||
bool isSettingNameAllowed(const std::string_view name) const;
|
||||
void checkSettingNameIsAllowed(const std::string_view name) const;
|
||||
|
||||
//sets allow_plaintext_password and allow_no_password setting
|
||||
void setPlaintextPasswordSetting(const bool allow_plaintext_password_);
|
||||
void setNoPasswordSetting(const bool allow_no_password_);
|
||||
/// Allows users without password (by default it's allowed).
|
||||
void setNoPasswordAllowed(const bool allow_no_password_);
|
||||
bool isNoPasswordAllowed() const;
|
||||
|
||||
/// Allows users with plaintext password (by default it's allowed).
|
||||
void setPlaintextPasswordAllowed(const bool allow_plaintext_password_);
|
||||
bool isPlaintextPasswordAllowed() const;
|
||||
|
||||
UUID authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address) const;
|
||||
void setExternalAuthenticatorsConfig(const Poco::Util::AbstractConfiguration & config);
|
||||
@ -153,9 +155,6 @@ public:
|
||||
|
||||
std::vector<QuotaUsage> getAllQuotasUsage() const;
|
||||
|
||||
bool isPlaintextPasswordAllowed() const;
|
||||
bool isNoPasswordAllowed() const;
|
||||
|
||||
std::shared_ptr<const EnabledSettings> getEnabledSettings(
|
||||
const UUID & user_id,
|
||||
const SettingsProfileElements & settings_from_user,
|
||||
@ -177,6 +176,8 @@ private:
|
||||
std::unique_ptr<SettingsProfilesCache> settings_profiles_cache;
|
||||
std::unique_ptr<ExternalAuthenticators> external_authenticators;
|
||||
std::unique_ptr<CustomSettingsPrefixes> custom_settings_prefixes;
|
||||
std::atomic_bool allow_plaintext_password = true;
|
||||
std::atomic_bool allow_no_password = true;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -120,7 +120,7 @@ AccessEntityPtr deserializeAccessEntityImpl(const String & definition)
|
||||
if (res)
|
||||
throw Exception("Two access entities attached in the same file", ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
|
||||
res = user = std::make_unique<User>();
|
||||
InterpreterCreateUserQuery::updateUserFromQuery(*user, *create_user_query);
|
||||
InterpreterCreateUserQuery::updateUserFromQuery(*user, *create_user_query, /* allow_no_password = */ true, /* allow_plaintext_password = */ true);
|
||||
}
|
||||
else if (auto * create_role_query = query->as<ASTCreateRoleQuery>())
|
||||
{
|
||||
|
@ -441,7 +441,9 @@ void IAccessStorage::notify(const Notifications & notifications)
|
||||
UUID IAccessStorage::authenticate(
|
||||
const Credentials & credentials,
|
||||
const Poco::Net::IPAddress & address,
|
||||
const ExternalAuthenticators & external_authenticators, bool allow_no_password, bool allow_plaintext_password) const
|
||||
const ExternalAuthenticators & external_authenticators,
|
||||
bool allow_no_password,
|
||||
bool allow_plaintext_password) const
|
||||
{
|
||||
return *authenticateImpl(credentials, address, external_authenticators, /* throw_if_user_not_exists = */ true, allow_no_password, allow_plaintext_password);
|
||||
}
|
||||
@ -451,7 +453,9 @@ std::optional<UUID> IAccessStorage::authenticate(
|
||||
const Credentials & credentials,
|
||||
const Poco::Net::IPAddress & address,
|
||||
const ExternalAuthenticators & external_authenticators,
|
||||
bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const
|
||||
bool throw_if_user_not_exists,
|
||||
bool allow_no_password,
|
||||
bool allow_plaintext_password) const
|
||||
{
|
||||
return authenticateImpl(credentials, address, external_authenticators, throw_if_user_not_exists, allow_no_password, allow_plaintext_password);
|
||||
}
|
||||
@ -461,7 +465,9 @@ std::optional<UUID> IAccessStorage::authenticateImpl(
|
||||
const Credentials & credentials,
|
||||
const Poco::Net::IPAddress & address,
|
||||
const ExternalAuthenticators & external_authenticators,
|
||||
bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const
|
||||
bool throw_if_user_not_exists,
|
||||
bool allow_no_password,
|
||||
bool allow_plaintext_password) const
|
||||
{
|
||||
if (auto id = find<User>(credentials.getUserName()))
|
||||
{
|
||||
@ -469,8 +475,11 @@ std::optional<UUID> IAccessStorage::authenticateImpl(
|
||||
{
|
||||
if (!isAddressAllowed(*user, address))
|
||||
throwAddressNotAllowed(address);
|
||||
if (isNoPasswordAllowed(*user, allow_no_password) || isPlaintextPasswordAllowed(*user, allow_plaintext_password))
|
||||
throwPasswordTypeNotAllowed();
|
||||
|
||||
auto auth_type = user->auth_data.getType();
|
||||
if (((auth_type == AuthenticationType::NO_PASSWORD) && !allow_no_password) ||
|
||||
((auth_type == AuthenticationType::PLAINTEXT_PASSWORD) && !allow_plaintext_password))
|
||||
throwAuthenticationTypeNotAllowed(auth_type);
|
||||
|
||||
if (!areCredentialsValid(*user, credentials, external_authenticators))
|
||||
throwInvalidCredentials();
|
||||
@ -506,15 +515,6 @@ bool IAccessStorage::isAddressAllowed(const User & user, const Poco::Net::IPAddr
|
||||
return user.allowed_client_hosts.contains(address);
|
||||
}
|
||||
|
||||
bool IAccessStorage::isPlaintextPasswordAllowed(const User & user, bool allow_plaintext_password)
|
||||
{
|
||||
return !allow_plaintext_password && user.auth_data.getType() == AuthenticationType::PLAINTEXT_PASSWORD;
|
||||
}
|
||||
|
||||
bool IAccessStorage::isNoPasswordAllowed(const User & user, bool allow_no_password)
|
||||
{
|
||||
return !allow_no_password && user.auth_data.getType() == AuthenticationType::NO_PASSWORD;
|
||||
}
|
||||
|
||||
UUID IAccessStorage::generateRandomID()
|
||||
{
|
||||
@ -610,11 +610,12 @@ void IAccessStorage::throwAddressNotAllowed(const Poco::Net::IPAddress & address
|
||||
throw Exception("Connections from " + address.toString() + " are not allowed", ErrorCodes::IP_ADDRESS_NOT_ALLOWED);
|
||||
}
|
||||
|
||||
void IAccessStorage::throwPasswordTypeNotAllowed()
|
||||
void IAccessStorage::throwAuthenticationTypeNotAllowed(AuthenticationType auth_type)
|
||||
{
|
||||
throw Exception(
|
||||
"Authentication denied for users configured with AuthType PLAINTEXT_PASSWORD and NO_PASSWORD. Please check with Clickhouse admin to allow allow PLAINTEXT_PASSWORD and NO_PASSWORD through server configuration ",
|
||||
ErrorCodes::AUTHENTICATION_FAILED);
|
||||
ErrorCodes::AUTHENTICATION_FAILED,
|
||||
"Authentication type {} is not allowed, check the setting allow_{} in the server configuration",
|
||||
toString(auth_type), AuthenticationTypeInfo::get(auth_type).name);
|
||||
}
|
||||
void IAccessStorage::throwInvalidCredentials()
|
||||
{
|
||||
|
@ -18,6 +18,7 @@ namespace DB
|
||||
struct User;
|
||||
class Credentials;
|
||||
class ExternalAuthenticators;
|
||||
enum class AuthenticationType;
|
||||
|
||||
/// Contains entities, i.e. instances of classes derived from IAccessEntity.
|
||||
/// The implementations of this class MUST be thread-safe.
|
||||
@ -148,7 +149,7 @@ public:
|
||||
|
||||
/// Finds a user, checks the provided credentials and returns the ID of the user if they are valid.
|
||||
/// Throws an exception if there is no such user or the credentials are invalid.
|
||||
UUID authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool allow_no_password=true, bool allow_plaintext_password=true) const;
|
||||
UUID authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool allow_no_password, bool allow_plaintext_password) const;
|
||||
std::optional<UUID> authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const;
|
||||
|
||||
protected:
|
||||
@ -164,8 +165,6 @@ protected:
|
||||
virtual std::optional<UUID> authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const;
|
||||
virtual bool areCredentialsValid(const User & user, const Credentials & credentials, const ExternalAuthenticators & external_authenticators) const;
|
||||
virtual bool isAddressAllowed(const User & user, const Poco::Net::IPAddress & address) const;
|
||||
static bool isPlaintextPasswordAllowed(const User & user, bool allow_plaintext_password) ;
|
||||
static bool isNoPasswordAllowed(const User & user, bool allow_no_password);
|
||||
static UUID generateRandomID();
|
||||
Poco::Logger * getLogger() const;
|
||||
static String formatEntityTypeWithName(AccessEntityType type, const String & name) { return AccessEntityTypeInfo::get(type).formatEntityNameWithType(name); }
|
||||
@ -181,7 +180,7 @@ protected:
|
||||
[[noreturn]] void throwReadonlyCannotRemove(AccessEntityType type, const String & name) const;
|
||||
[[noreturn]] static void throwAddressNotAllowed(const Poco::Net::IPAddress & address);
|
||||
[[noreturn]] static void throwInvalidCredentials();
|
||||
[[noreturn]] static void throwPasswordTypeNotAllowed();
|
||||
[[noreturn]] static void throwAuthenticationTypeNotAllowed(AuthenticationType auth_type);
|
||||
using Notification = std::tuple<OnChangedHandler, UUID, AccessEntityPtr>;
|
||||
using Notifications = std::vector<Notification>;
|
||||
static void notify(const Notifications & notifications);
|
||||
|
@ -481,7 +481,9 @@ std::optional<UUID> LDAPAccessStorage::authenticateImpl(
|
||||
const Credentials & credentials,
|
||||
const Poco::Net::IPAddress & address,
|
||||
const ExternalAuthenticators & external_authenticators,
|
||||
bool throw_if_user_not_exists,bool allow_no_password __attribute__((unused)), bool allow_plaintext_password __attribute__((unused))) const
|
||||
bool throw_if_user_not_exists,
|
||||
bool /* allow_no_password */,
|
||||
bool /* allow_plaintext_password */) const
|
||||
{
|
||||
std::scoped_lock lock(mutex);
|
||||
auto id = memory_storage.find<User>(credentials.getUserName());
|
||||
|
@ -449,14 +449,20 @@ void MultipleAccessStorage::updateSubscriptionsToNestedStorages(std::unique_lock
|
||||
}
|
||||
|
||||
|
||||
std::optional<UUID> MultipleAccessStorage::authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists,bool allow_no_password, bool allow_plaintext_password) const
|
||||
std::optional<UUID>
|
||||
MultipleAccessStorage::authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address,
|
||||
const ExternalAuthenticators & external_authenticators,
|
||||
bool throw_if_user_not_exists,
|
||||
bool allow_no_password, bool allow_plaintext_password) const
|
||||
{
|
||||
auto storages = getStoragesInternal();
|
||||
for (size_t i = 0; i != storages->size(); ++i)
|
||||
{
|
||||
const auto & storage = (*storages)[i];
|
||||
bool is_last_storage = (i == storages->size() - 1);
|
||||
auto id = storage->authenticate(credentials, address, external_authenticators, (throw_if_user_not_exists && is_last_storage), allow_no_password, allow_plaintext_password);
|
||||
auto id = storage->authenticate(credentials, address, external_authenticators,
|
||||
(throw_if_user_not_exists && is_last_storage),
|
||||
allow_no_password, allow_plaintext_password);
|
||||
if (id)
|
||||
{
|
||||
std::lock_guard lock{mutex};
|
||||
|
@ -28,8 +28,6 @@ namespace ErrorCodes
|
||||
extern const int BAD_ARGUMENTS;
|
||||
extern const int UNKNOWN_ADDRESS_PATTERN_TYPE;
|
||||
extern const int NOT_IMPLEMENTED;
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
|
||||
}
|
||||
|
||||
namespace
|
||||
@ -50,7 +48,7 @@ namespace
|
||||
UUID generateID(const IAccessEntity & entity) { return generateID(entity.getType(), entity.getName()); }
|
||||
|
||||
|
||||
UserPtr parseUser(const Poco::Util::AbstractConfiguration & config, const String & user_name)
|
||||
UserPtr parseUser(const Poco::Util::AbstractConfiguration & config, const String & user_name, bool allow_no_password, bool allow_plaintext_password)
|
||||
{
|
||||
auto user = std::make_shared<User>();
|
||||
user->setName(user_name);
|
||||
@ -130,6 +128,15 @@ namespace
|
||||
user->auth_data.setSSLCertificateCommonNames(std::move(common_names));
|
||||
}
|
||||
|
||||
auto auth_type = user->auth_data.getType();
|
||||
if (((auth_type == AuthenticationType::NO_PASSWORD) && !allow_no_password) ||
|
||||
((auth_type == AuthenticationType::PLAINTEXT_PASSWORD) && !allow_plaintext_password))
|
||||
{
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Authentication type {} is not allowed, check the setting allow_{} in the server configuration",
|
||||
toString(auth_type), AuthenticationTypeInfo::get(auth_type).name);
|
||||
}
|
||||
|
||||
const auto profile_name_config = user_config + ".profile";
|
||||
if (config.has(profile_name_config))
|
||||
{
|
||||
@ -225,24 +232,18 @@ namespace
|
||||
}
|
||||
|
||||
|
||||
std::vector<AccessEntityPtr> parseUsers(const Poco::Util::AbstractConfiguration & config, Fn<bool()> auto && is_no_password_allowed_function, Fn<bool()> auto && is_plaintext_password_allowed_function)
|
||||
std::vector<AccessEntityPtr> parseUsers(const Poco::Util::AbstractConfiguration & config, bool allow_no_password, bool allow_plaintext_password)
|
||||
{
|
||||
Poco::Util::AbstractConfiguration::Keys user_names;
|
||||
config.keys("users", user_names);
|
||||
|
||||
std::vector<AccessEntityPtr> users;
|
||||
users.reserve(user_names.size());
|
||||
bool allow_plaintext_password = is_plaintext_password_allowed_function();
|
||||
bool allow_no_password = is_no_password_allowed_function();
|
||||
for (const auto & user_name : user_names)
|
||||
{
|
||||
try
|
||||
{
|
||||
String user_config = "users." + user_name;
|
||||
if ((config.has(user_config + ".password") && !allow_plaintext_password) || (config.has(user_config + ".no_password") && !allow_no_password))
|
||||
throw Exception("Incorrect User configuration. User is not allowed to configure PLAINTEXT_PASSWORD or NO_PASSWORD. Please configure User with authtype SHA256_PASSWORD_HASH, SHA256_PASSWORD, DOUBLE_SHA1_PASSWORD OR enable setting allow_plaintext_and_no_password in server configuration to configure user with plaintext and no password Auth_Type"
|
||||
" Though it is not recommended to use plaintext_password and No_password for user authentication.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
users.push_back(parseUser(config, user_name));
|
||||
users.push_back(parseUser(config, user_name, allow_no_password, allow_plaintext_password));
|
||||
}
|
||||
catch (Exception & e)
|
||||
{
|
||||
@ -562,8 +563,10 @@ void UsersConfigAccessStorage::parseFromConfig(const Poco::Util::AbstractConfigu
|
||||
{
|
||||
try
|
||||
{
|
||||
bool no_password_allowed = is_no_password_allowed_function();
|
||||
bool plaintext_password_allowed = is_plaintext_password_allowed_function();
|
||||
std::vector<std::pair<UUID, AccessEntityPtr>> all_entities;
|
||||
for (const auto & entity : parseUsers(config,is_no_password_allowed_function, is_plaintext_password_allowed_function))
|
||||
for (const auto & entity : parseUsers(config, no_password_allowed, plaintext_password_allowed))
|
||||
all_entities.emplace_back(generateID(*entity), entity);
|
||||
for (const auto & entity : parseQuotas(config))
|
||||
all_entities.emplace_back(generateID(*entity), entity);
|
||||
|
@ -38,7 +38,8 @@ struct AggregateFunctionWithProperties
|
||||
AggregateFunctionWithProperties(const AggregateFunctionWithProperties &) = default;
|
||||
AggregateFunctionWithProperties & operator = (const AggregateFunctionWithProperties &) = default;
|
||||
|
||||
template <typename Creator, std::enable_if_t<!std::is_same_v<Creator, AggregateFunctionWithProperties>> * = nullptr>
|
||||
template <typename Creator>
|
||||
requires (!std::is_same_v<Creator, AggregateFunctionWithProperties>)
|
||||
AggregateFunctionWithProperties(Creator creator_, AggregateFunctionProperties properties_ = {}) /// NOLINT
|
||||
: creator(std::forward<Creator>(creator_)), properties(std::move(properties_))
|
||||
{
|
||||
|
@ -569,6 +569,14 @@ if (ENABLE_TESTS)
|
||||
clickhouse_common_zookeeper
|
||||
string_utils)
|
||||
|
||||
if (TARGET ch_contrib::simdjson)
|
||||
target_link_libraries(unit_tests_dbms PRIVATE ch_contrib::simdjson)
|
||||
endif()
|
||||
|
||||
if(TARGET ch_contrib::rapidjson)
|
||||
target_include_directories(unit_tests_dbms PRIVATE ch_contrib::rapidjson)
|
||||
endif()
|
||||
|
||||
if (TARGET ch_contrib::yaml_cpp)
|
||||
target_link_libraries(unit_tests_dbms PRIVATE ch_contrib::yaml_cpp)
|
||||
endif()
|
||||
|
@ -1092,10 +1092,11 @@ void ClientBase::sendData(Block & sample, const ColumnsDescription & columns_des
|
||||
|
||||
try
|
||||
{
|
||||
auto metadata = storage->getInMemoryMetadataPtr();
|
||||
sendDataFromPipe(
|
||||
storage->read(
|
||||
sample.getNames(),
|
||||
storage->getInMemoryMetadataPtr(),
|
||||
storage->getStorageSnapshot(metadata),
|
||||
query_info,
|
||||
global_context,
|
||||
{},
|
||||
|
@ -297,7 +297,7 @@ ColumnPtr ColumnAggregateFunction::filter(const Filter & filter, ssize_t result_
|
||||
{
|
||||
size_t size = data.size();
|
||||
if (size != filter.size())
|
||||
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
|
||||
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filter.size(), size);
|
||||
|
||||
if (size == 0)
|
||||
return cloneEmpty();
|
||||
|
@ -608,7 +608,7 @@ ColumnPtr ColumnArray::filterString(const Filter & filt, ssize_t result_size_hin
|
||||
{
|
||||
size_t col_size = getOffsets().size();
|
||||
if (col_size != filt.size())
|
||||
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
|
||||
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), col_size);
|
||||
|
||||
if (0 == col_size)
|
||||
return ColumnArray::create(data);
|
||||
@ -676,7 +676,7 @@ ColumnPtr ColumnArray::filterGeneric(const Filter & filt, ssize_t result_size_hi
|
||||
{
|
||||
size_t size = getOffsets().size();
|
||||
if (size != filt.size())
|
||||
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
|
||||
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size);
|
||||
|
||||
if (size == 0)
|
||||
return ColumnArray::create(data);
|
||||
@ -1189,4 +1189,12 @@ void ColumnArray::gather(ColumnGathererStream & gatherer)
|
||||
gatherer.gather(*this);
|
||||
}
|
||||
|
||||
size_t ColumnArray::getNumberOfDimensions() const
|
||||
{
|
||||
const auto * nested_array = checkAndGetColumn<ColumnArray>(*data);
|
||||
if (!nested_array)
|
||||
return 1;
|
||||
return 1 + nested_array->getNumberOfDimensions(); /// Every modern C++ compiler optimizes tail recursion.
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -169,6 +169,8 @@ public:
|
||||
|
||||
bool isCollationSupported() const override { return getData().isCollationSupported(); }
|
||||
|
||||
size_t getNumberOfDimensions() const;
|
||||
|
||||
private:
|
||||
WrappedPtr data;
|
||||
WrappedPtr offsets;
|
||||
|
@ -266,7 +266,7 @@ ColumnPtr ColumnDecimal<T>::filter(const IColumn::Filter & filt, ssize_t result_
|
||||
{
|
||||
size_t size = data.size();
|
||||
if (size != filt.size())
|
||||
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
|
||||
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size);
|
||||
|
||||
auto res = this->create(0, scale);
|
||||
Container & res_data = res->getData();
|
||||
|
@ -207,7 +207,7 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result
|
||||
{
|
||||
size_t col_size = size();
|
||||
if (col_size != filt.size())
|
||||
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
|
||||
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), col_size);
|
||||
|
||||
auto res = ColumnFixedString::create(n);
|
||||
|
||||
|
@ -144,15 +144,15 @@ public:
|
||||
|
||||
double getRatioOfDefaultRows(double sample_ratio) const override
|
||||
{
|
||||
return null_map->getRatioOfDefaultRows(sample_ratio);
|
||||
return getRatioOfDefaultRowsImpl<ColumnNullable>(sample_ratio);
|
||||
}
|
||||
|
||||
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override
|
||||
{
|
||||
null_map->getIndicesOfNonDefaultRows(indices, from, limit);
|
||||
getIndicesOfNonDefaultRowsImpl<ColumnNullable>(indices, from, limit);
|
||||
}
|
||||
|
||||
ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override;
|
||||
ColumnPtr createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override;
|
||||
|
||||
bool isNullable() const override { return true; }
|
||||
bool isFixedAndContiguous() const override { return false; }
|
||||
|
780
src/Columns/ColumnObject.cpp
Normal file
780
src/Columns/ColumnObject.cpp
Normal file
@ -0,0 +1,780 @@
|
||||
#include <Core/Field.h>
|
||||
#include <Columns/ColumnObject.h>
|
||||
#include <Columns/ColumnsNumber.h>
|
||||
#include <Columns/ColumnArray.h>
|
||||
#include <DataTypes/ObjectUtils.h>
|
||||
#include <DataTypes/getLeastSupertype.h>
|
||||
#include <DataTypes/DataTypeNothing.h>
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <DataTypes/DataTypeFactory.h>
|
||||
#include <DataTypes/NestedUtils.h>
|
||||
#include <Interpreters/castColumn.h>
|
||||
#include <Interpreters/convertFieldToType.h>
|
||||
#include <Common/HashTable/HashSet.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
extern const int DUPLICATE_COLUMN;
|
||||
extern const int NUMBER_OF_DIMENSIONS_MISMATHED;
|
||||
extern const int NOT_IMPLEMENTED;
|
||||
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
/// Recreates column with default scalar values and keeps sizes of arrays.
|
||||
ColumnPtr recreateColumnWithDefaultValues(
|
||||
const ColumnPtr & column, const DataTypePtr & scalar_type, size_t num_dimensions)
|
||||
{
|
||||
const auto * column_array = checkAndGetColumn<ColumnArray>(column.get());
|
||||
if (column_array && num_dimensions)
|
||||
{
|
||||
return ColumnArray::create(
|
||||
recreateColumnWithDefaultValues(
|
||||
column_array->getDataPtr(), scalar_type, num_dimensions - 1),
|
||||
IColumn::mutate(column_array->getOffsetsPtr()));
|
||||
}
|
||||
|
||||
return createArrayOfType(scalar_type, num_dimensions)->createColumn()->cloneResized(column->size());
|
||||
}
|
||||
|
||||
/// Replaces NULL fields to given field or empty array.
|
||||
class FieldVisitorReplaceNull : public StaticVisitor<Field>
|
||||
{
|
||||
public:
|
||||
explicit FieldVisitorReplaceNull(
|
||||
const Field & replacement_, size_t num_dimensions_)
|
||||
: replacement(replacement_)
|
||||
, num_dimensions(num_dimensions_)
|
||||
{
|
||||
}
|
||||
|
||||
Field operator()(const Null &) const
|
||||
{
|
||||
return num_dimensions
|
||||
? createEmptyArrayField(num_dimensions)
|
||||
: replacement;
|
||||
}
|
||||
|
||||
Field operator()(const Array & x) const
|
||||
{
|
||||
assert(num_dimensions > 0);
|
||||
const size_t size = x.size();
|
||||
Array res(size);
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
res[i] = applyVisitor(FieldVisitorReplaceNull(replacement, num_dimensions - 1), x[i]);
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Field operator()(const T & x) const { return x; }
|
||||
|
||||
private:
|
||||
const Field & replacement;
|
||||
size_t num_dimensions;
|
||||
};
|
||||
|
||||
/// Calculates number of dimensions in array field.
|
||||
/// Returns 0 for scalar fields.
|
||||
class FieldVisitorToNumberOfDimensions : public StaticVisitor<size_t>
|
||||
{
|
||||
public:
|
||||
size_t operator()(const Array & x) const
|
||||
{
|
||||
const size_t size = x.size();
|
||||
std::optional<size_t> dimensions;
|
||||
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
/// Do not count Nulls, because they will be replaced by default
|
||||
/// values with proper number of dimensions.
|
||||
if (x[i].isNull())
|
||||
continue;
|
||||
|
||||
size_t current_dimensions = applyVisitor(*this, x[i]);
|
||||
if (!dimensions)
|
||||
dimensions = current_dimensions;
|
||||
else if (current_dimensions != *dimensions)
|
||||
throw Exception(ErrorCodes::NUMBER_OF_DIMENSIONS_MISMATHED,
|
||||
"Number of dimensions mismatched among array elements");
|
||||
}
|
||||
|
||||
return 1 + dimensions.value_or(0);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
size_t operator()(const T &) const { return 0; }
|
||||
};
|
||||
|
||||
/// Visitor that allows to get type of scalar field
|
||||
/// or least common type of scalars in array.
|
||||
/// More optimized version of FieldToDataType.
|
||||
class FieldVisitorToScalarType : public StaticVisitor<>
|
||||
{
|
||||
public:
|
||||
using FieldType = Field::Types::Which;
|
||||
|
||||
void operator()(const Array & x)
|
||||
{
|
||||
size_t size = x.size();
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
applyVisitor(*this, x[i]);
|
||||
}
|
||||
|
||||
void operator()(const UInt64 & x)
|
||||
{
|
||||
field_types.insert(FieldType::UInt64);
|
||||
if (x <= std::numeric_limits<UInt8>::max())
|
||||
type_indexes.insert(TypeIndex::UInt8);
|
||||
else if (x <= std::numeric_limits<UInt16>::max())
|
||||
type_indexes.insert(TypeIndex::UInt16);
|
||||
else if (x <= std::numeric_limits<UInt32>::max())
|
||||
type_indexes.insert(TypeIndex::UInt32);
|
||||
else
|
||||
type_indexes.insert(TypeIndex::UInt64);
|
||||
}
|
||||
|
||||
void operator()(const Int64 & x)
|
||||
{
|
||||
field_types.insert(FieldType::Int64);
|
||||
if (x <= std::numeric_limits<Int8>::max() && x >= std::numeric_limits<Int8>::min())
|
||||
type_indexes.insert(TypeIndex::Int8);
|
||||
else if (x <= std::numeric_limits<Int16>::max() && x >= std::numeric_limits<Int16>::min())
|
||||
type_indexes.insert(TypeIndex::Int16);
|
||||
else if (x <= std::numeric_limits<Int32>::max() && x >= std::numeric_limits<Int32>::min())
|
||||
type_indexes.insert(TypeIndex::Int32);
|
||||
else
|
||||
type_indexes.insert(TypeIndex::Int64);
|
||||
}
|
||||
|
||||
void operator()(const Null &)
|
||||
{
|
||||
have_nulls = true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void operator()(const T &)
|
||||
{
|
||||
field_types.insert(Field::TypeToEnum<NearestFieldType<T>>::value);
|
||||
type_indexes.insert(TypeToTypeIndex<NearestFieldType<T>>);
|
||||
}
|
||||
|
||||
DataTypePtr getScalarType() const { return getLeastSupertype(type_indexes, true); }
|
||||
bool haveNulls() const { return have_nulls; }
|
||||
bool needConvertField() const { return field_types.size() > 1; }
|
||||
|
||||
private:
|
||||
TypeIndexSet type_indexes;
|
||||
std::unordered_set<FieldType> field_types;
|
||||
bool have_nulls = false;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
/// Decomposes @field into its scalar type, presence of NULLs, need for
/// conversion and number of array dimensions.
FieldInfo getFieldInfo(const Field & field)
{
    FieldVisitorToScalarType scalar_type_visitor;
    applyVisitor(scalar_type_visitor, field);

    /// Values are computed in the order of FieldInfo members.
    auto scalar_type = scalar_type_visitor.getScalarType();
    const bool have_nulls = scalar_type_visitor.haveNulls();
    const bool need_convert = scalar_type_visitor.needConvertField();
    const size_t num_dimensions = applyVisitor(FieldVisitorToNumberOfDimensions(), field);

    return FieldInfo{std::move(scalar_type), have_nulls, need_convert, num_dimensions};
}
|
||||
|
||||
/// Creates a subcolumn whose only part is @data_.
/// The least common type is derived from that column with getDataTypeByColumn.
ColumnObject::Subcolumn::Subcolumn(MutableColumnPtr && data_, bool is_nullable_)
    : least_common_type(getDataTypeByColumn(*data_)), is_nullable(is_nullable_)
{
    data.push_back(std::move(data_));
}
|
||||
|
||||
/// Creates a subcolumn without parts: its type is Nothing and
/// @size_ rows are accounted as defaults in the prefix.
ColumnObject::Subcolumn::Subcolumn(size_t size_, bool is_nullable_)
    : least_common_type(std::make_shared<DataTypeNothing>())
    , is_nullable(is_nullable_)
    , num_of_defaults_in_prefix(size_)
{
}
|
||||
|
||||
size_t ColumnObject::Subcolumn::Subcolumn::size() const
|
||||
{
|
||||
size_t res = num_of_defaults_in_prefix;
|
||||
for (const auto & part : data)
|
||||
res += part->size();
|
||||
return res;
|
||||
}
|
||||
|
||||
size_t ColumnObject::Subcolumn::Subcolumn::byteSize() const
|
||||
{
|
||||
size_t res = 0;
|
||||
for (const auto & part : data)
|
||||
res += part->byteSize();
|
||||
return res;
|
||||
}
|
||||
|
||||
size_t ColumnObject::Subcolumn::Subcolumn::allocatedBytes() const
|
||||
{
|
||||
size_t res = 0;
|
||||
for (const auto & part : data)
|
||||
res += part->allocatedBytes();
|
||||
return res;
|
||||
}
|
||||
|
||||
void ColumnObject::Subcolumn::checkTypes() const
|
||||
{
|
||||
DataTypes prefix_types;
|
||||
prefix_types.reserve(data.size());
|
||||
for (size_t i = 0; i < data.size(); ++i)
|
||||
{
|
||||
auto current_type = getDataTypeByColumn(*data[i]);
|
||||
prefix_types.push_back(current_type);
|
||||
auto prefix_common_type = getLeastSupertype(prefix_types);
|
||||
if (!prefix_common_type->equals(*current_type))
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR,
|
||||
"Data type {} of column at position {} cannot represent all columns from i-th prefix",
|
||||
current_type->getName(), i);
|
||||
}
|
||||
}
|
||||
|
||||
/// Inserts @field after computing its FieldInfo.
void ColumnObject::Subcolumn::insert(Field field)
{
    /// The info has to be taken before the field is moved from.
    FieldInfo field_info = getFieldInfo(field);
    insert(std::move(field), std::move(field_info));
}
|
||||
|
||||
/// Inserts @field described by @info.
/// A scalar of type Nothing is inserted as a default value.
/// Throws NUMBER_OF_DIMENSIONS_MISMATHED if the value and the column disagree on
/// the number of array dimensions. If the value type differs from the current
/// least common type, the least supertype of both is taken and, when it differs
/// from the current one, a new part of that type is started.
void ColumnObject::Subcolumn::insert(Field field, FieldInfo info)
{
    auto scalar_type = info.scalar_type;

    if (isNothing(scalar_type) && info.num_dimensions == 0)
    {
        insertDefault();
        return;
    }

    auto column_dimensions = getNumberOfDimensions(*least_common_type);
    auto value_dimensions = info.num_dimensions;

    /// A column without a real type yet adopts the dimensions of the value.
    if (isNothing(least_common_type))
        column_dimensions = value_dimensions;

    /// A NULL field adopts the dimensions of the column.
    if (field.isNull())
        value_dimensions = column_dimensions;

    if (value_dimensions != column_dimensions)
        throw Exception(ErrorCodes::NUMBER_OF_DIMENSIONS_MISMATHED,
            "Dimension of types mismatched between inserted value and column. "
            "Dimension of value: {}. Dimension of column: {}",
            value_dimensions, column_dimensions);

    if (is_nullable)
        scalar_type = makeNullable(scalar_type);
    else if (info.have_nulls)
        field = applyVisitor(FieldVisitorReplaceNull(scalar_type->getDefault(), value_dimensions), std::move(field));

    auto value_type = createArrayOfType(scalar_type, value_dimensions);
    bool need_convert = info.need_convert;

    if (data.empty())
    {
        data.push_back(value_type->createColumn());
        least_common_type = value_type;
    }
    else if (!least_common_type->equals(*value_type))
    {
        /// The field must be converted even if the common type stays the same.
        need_convert = true;
        value_type = getLeastSupertype(DataTypes{value_type, least_common_type}, true);

        if (!least_common_type->equals(*value_type))
        {
            data.push_back(value_type->createColumn());
            least_common_type = value_type;
        }
    }

    if (need_convert)
        field = convertFieldToTypeOrThrow(field, *value_type);

    data.back()->insert(field);
}
|
||||
|
||||
void ColumnObject::Subcolumn::insertRangeFrom(const Subcolumn & src, size_t start, size_t length)
|
||||
{
|
||||
assert(src.isFinalized());
|
||||
|
||||
const auto & src_column = src.data.back();
|
||||
const auto & src_type = src.least_common_type;
|
||||
|
||||
if (data.empty())
|
||||
{
|
||||
least_common_type = src_type;
|
||||
data.push_back(src_type->createColumn());
|
||||
data.back()->insertRangeFrom(*src_column, start, length);
|
||||
}
|
||||
else if (least_common_type->equals(*src_type))
|
||||
{
|
||||
data.back()->insertRangeFrom(*src_column, start, length);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto new_least_common_type = getLeastSupertype(DataTypes{least_common_type, src_type}, true);
|
||||
auto casted_column = castColumn({src_column, src_type, ""}, new_least_common_type);
|
||||
|
||||
if (!least_common_type->equals(*new_least_common_type))
|
||||
{
|
||||
least_common_type = new_least_common_type;
|
||||
data.push_back(least_common_type->createColumn());
|
||||
}
|
||||
|
||||
data.back()->insertRangeFrom(*casted_column, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
void ColumnObject::Subcolumn::finalize()
|
||||
{
|
||||
if (isFinalized() || data.empty())
|
||||
return;
|
||||
|
||||
const auto & to_type = least_common_type;
|
||||
auto result_column = to_type->createColumn();
|
||||
|
||||
if (num_of_defaults_in_prefix)
|
||||
result_column->insertManyDefaults(num_of_defaults_in_prefix);
|
||||
|
||||
for (auto & part : data)
|
||||
{
|
||||
auto from_type = getDataTypeByColumn(*part);
|
||||
size_t part_size = part->size();
|
||||
|
||||
if (!from_type->equals(*to_type))
|
||||
{
|
||||
auto offsets = ColumnUInt64::create();
|
||||
auto & offsets_data = offsets->getData();
|
||||
|
||||
/// We need to convert only non-default values and then recreate column
|
||||
/// with default value of new type, because default values (which represents misses in data)
|
||||
/// may be inconsistent between types (e.g "0" in UInt64 and empty string in String).
|
||||
|
||||
part->getIndicesOfNonDefaultRows(offsets_data, 0, part_size);
|
||||
|
||||
if (offsets->size() == part_size)
|
||||
{
|
||||
part = castColumn({part, from_type, ""}, to_type);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto values = part->index(*offsets, offsets->size());
|
||||
values = castColumn({values, from_type, ""}, to_type);
|
||||
part = values->createWithOffsets(offsets_data, to_type->getDefault(), part_size, /*shift=*/ 0);
|
||||
}
|
||||
}
|
||||
|
||||
result_column->insertRangeFrom(*part, 0, part_size);
|
||||
}
|
||||
|
||||
data = { std::move(result_column) };
|
||||
num_of_defaults_in_prefix = 0;
|
||||
}
|
||||
|
||||
void ColumnObject::Subcolumn::insertDefault()
|
||||
{
|
||||
if (data.empty())
|
||||
++num_of_defaults_in_prefix;
|
||||
else
|
||||
data.back()->insertDefault();
|
||||
}
|
||||
|
||||
void ColumnObject::Subcolumn::insertManyDefaults(size_t length)
|
||||
{
|
||||
if (data.empty())
|
||||
num_of_defaults_in_prefix += length;
|
||||
else
|
||||
data.back()->insertManyDefaults(length);
|
||||
}
|
||||
|
||||
void ColumnObject::Subcolumn::popBack(size_t n)
|
||||
{
|
||||
assert(n <= size());
|
||||
|
||||
size_t num_removed = 0;
|
||||
for (auto it = data.rbegin(); it != data.rend(); ++it)
|
||||
{
|
||||
if (n == 0)
|
||||
break;
|
||||
|
||||
auto & column = *it;
|
||||
if (n < column->size())
|
||||
{
|
||||
column->popBack(n);
|
||||
n = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
++num_removed;
|
||||
n -= column->size();
|
||||
}
|
||||
}
|
||||
|
||||
data.resize(data.size() - num_removed);
|
||||
num_of_defaults_in_prefix -= n;
|
||||
}
|
||||
|
||||
/// Returns the last value of the last part, or an empty Field if there are no parts.
Field ColumnObject::Subcolumn::getLastField() const
{
    if (data.empty())
        return Field();

    const auto & last_part = data.back();
    assert(!last_part->empty());

    const size_t last_row = last_part->size() - 1;
    return (*last_part)[last_row];
}
|
||||
|
||||
/// Creates a subcolumn with the same nullability, prefix of defaults and number of
/// parts as this one. Its type is built from @field_info, and every part is recreated
/// with default values by recreateColumnWithDefaultValues.
ColumnObject::Subcolumn ColumnObject::Subcolumn::recreateWithDefaultValues(const FieldInfo & field_info) const
{
    const auto scalar_type = is_nullable ? makeNullable(field_info.scalar_type) : field_info.scalar_type;
    const size_t num_dimensions = field_info.num_dimensions;

    Subcolumn result;
    result.is_nullable = is_nullable;
    result.num_of_defaults_in_prefix = num_of_defaults_in_prefix;
    result.least_common_type = createArrayOfType(scalar_type, num_dimensions);

    result.data.reserve(data.size());
    for (const auto & part : data)
        result.data.push_back(recreateColumnWithDefaultValues(part, scalar_type, num_dimensions));

    return result;
}
|
||||
|
||||
/// Returns the single part of a finalized subcolumn (finalization is only asserted).
IColumn & ColumnObject::Subcolumn::getFinalizedColumn()
{
    assert(isFinalized());
    return *data.front();
}
|
||||
|
||||
/// Const access to the single part of a finalized subcolumn (finalization is only asserted).
const IColumn & ColumnObject::Subcolumn::getFinalizedColumn() const
{
    assert(isFinalized());
    return *data.front();
}
|
||||
|
||||
/// Returns the pointer to the single part of a finalized subcolumn (finalization is only asserted).
const ColumnPtr & ColumnObject::Subcolumn::getFinalizedColumnPtr() const
{
    assert(isFinalized());
    return data.front();
}
|
||||
|
||||
/// Creates an empty column: zero rows and no subcolumns.
ColumnObject::ColumnObject(bool is_nullable_)
    : is_nullable(is_nullable_), num_rows(0)
{
}
|
||||
|
||||
/// Takes ownership of @subcolumns_. The number of rows is the size of the first
/// subcolumn (0 if there are none); the sizes of all subcolumns are then checked.
ColumnObject::ColumnObject(SubcolumnsTree && subcolumns_, bool is_nullable_)
    : is_nullable(is_nullable_)
    , subcolumns(std::move(subcolumns_))
    , num_rows(subcolumns.empty() ? 0 : (*subcolumns.begin())->data.size())
{
    checkConsistency();
}
|
||||
|
||||
void ColumnObject::checkConsistency() const
|
||||
{
|
||||
if (subcolumns.empty())
|
||||
return;
|
||||
|
||||
for (const auto & leaf : subcolumns)
|
||||
{
|
||||
if (num_rows != leaf->data.size())
|
||||
{
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Sizes of subcolumns are inconsistent in ColumnObject."
|
||||
" Subcolumn '{}' has {} rows, but expected size is {}",
|
||||
leaf->path.getPath(), leaf->data.size(), num_rows);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t ColumnObject::size() const
|
||||
{
|
||||
#ifndef NDEBUG
|
||||
checkConsistency();
|
||||
#endif
|
||||
return num_rows;
|
||||
}
|
||||
|
||||
/// Only new_size == 0 is supported (it is what cloneEmpty() uses): an empty column
/// with the same nullability is returned. Any other size throws NOT_IMPLEMENTED.
MutableColumnPtr ColumnObject::cloneResized(size_t new_size) const
{
    if (new_size == 0)
        return ColumnObject::create(is_nullable);

    throw Exception(ErrorCodes::NOT_IMPLEMENTED,
        "ColumnObject doesn't support resize to non-zero length");
}
|
||||
|
||||
size_t ColumnObject::byteSize() const
|
||||
{
|
||||
size_t res = 0;
|
||||
for (const auto & entry : subcolumns)
|
||||
res += entry->data.byteSize();
|
||||
return res;
|
||||
}
|
||||
|
||||
size_t ColumnObject::allocatedBytes() const
|
||||
{
|
||||
size_t res = 0;
|
||||
for (const auto & entry : subcolumns)
|
||||
res += entry->data.allocatedBytes();
|
||||
return res;
|
||||
}
|
||||
|
||||
/// Invokes @callback on the last part of every subcolumn.
/// Throws LOGICAL_ERROR if the column is not finalized.
void ColumnObject::forEachSubcolumn(ColumnCallback callback)
{
    if (!isFinalized())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot iterate over non-finalized ColumnObject");

    for (auto & leaf : subcolumns)
    {
        auto & parts = leaf->data.data;
        callback(parts.back());
    }
}
|
||||
|
||||
void ColumnObject::insert(const Field & field)
|
||||
{
|
||||
const auto & object = field.get<const Object &>();
|
||||
|
||||
HashSet<StringRef, StringRefHash> inserted;
|
||||
size_t old_size = size();
|
||||
for (const auto & [key_str, value] : object)
|
||||
{
|
||||
PathInData key(key_str);
|
||||
inserted.insert(key_str);
|
||||
if (!hasSubcolumn(key))
|
||||
addSubcolumn(key, old_size);
|
||||
|
||||
auto & subcolumn = getSubcolumn(key);
|
||||
subcolumn.insert(value);
|
||||
}
|
||||
|
||||
for (auto & entry : subcolumns)
|
||||
if (!inserted.has(entry->path.getPath()))
|
||||
entry->data.insertDefault();
|
||||
|
||||
++num_rows;
|
||||
}
|
||||
|
||||
void ColumnObject::insertDefault()
|
||||
{
|
||||
for (auto & entry : subcolumns)
|
||||
entry->data.insertDefault();
|
||||
|
||||
++num_rows;
|
||||
}
|
||||
|
||||
/// Returns row @n as an Object that maps every subcolumn path to its value in that row.
/// Throws LOGICAL_ERROR if the column is not finalized.
Field ColumnObject::operator[](size_t n) const
{
    if (!isFinalized())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get Field from non-finalized ColumnObject");

    Object row;
    for (const auto & leaf : subcolumns)
    {
        const auto & column = leaf->data.data.back();
        row[leaf->path.getPath()] = (*column)[n];
    }

    return row;
}
|
||||
|
||||
void ColumnObject::get(size_t n, Field & res) const
|
||||
{
|
||||
if (!isFinalized())
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get Field from non-finalized ColumnObject");
|
||||
|
||||
auto & object = res.get<Object &>();
|
||||
for (const auto & entry : subcolumns)
|
||||
{
|
||||
auto it = object.try_emplace(entry->path.getPath()).first;
|
||||
entry->data.data.back()->get(n, it->second);
|
||||
}
|
||||
}
|
||||
|
||||
void ColumnObject::insertRangeFrom(const IColumn & src, size_t start, size_t length)
|
||||
{
|
||||
const auto & src_object = assert_cast<const ColumnObject &>(src);
|
||||
|
||||
for (auto & entry : subcolumns)
|
||||
{
|
||||
if (src_object.hasSubcolumn(entry->path))
|
||||
entry->data.insertRangeFrom(src_object.getSubcolumn(entry->path), start, length);
|
||||
else
|
||||
entry->data.insertManyDefaults(length);
|
||||
}
|
||||
|
||||
num_rows += length;
|
||||
finalize();
|
||||
}
|
||||
|
||||
/// Returns a new ColumnObject in which every subcolumn is the replicate(offsets)
/// of the corresponding subcolumn. Throws LOGICAL_ERROR if the column is not finalized.
ColumnPtr ColumnObject::replicate(const Offsets & offsets) const
{
    if (!isFinalized())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot replicate non-finalized ColumnObject");

    auto result = ColumnObject::create(is_nullable);

    for (const auto & leaf : subcolumns)
    {
        const auto & column = leaf->data.data.back();
        result->addSubcolumn(leaf->path, column->replicate(offsets)->assumeMutable());
    }

    return result;
}
|
||||
|
||||
void ColumnObject::popBack(size_t length)
|
||||
{
|
||||
for (auto & entry : subcolumns)
|
||||
entry->data.popBack(length);
|
||||
|
||||
num_rows -= length;
|
||||
}
|
||||
|
||||
/// Returns the subcolumn stored at leaf @key. Throws ILLEGAL_COLUMN if there is no such leaf.
const ColumnObject::Subcolumn & ColumnObject::getSubcolumn(const PathInData & key) const
{
    const auto * node = subcolumns.findLeaf(key);
    if (!node)
        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in ColumnObject", key.getPath());

    return node->data;
}
|
||||
|
||||
/// Mutable access to the subcolumn stored at leaf @key.
/// Throws ILLEGAL_COLUMN if there is no such leaf.
ColumnObject::Subcolumn & ColumnObject::getSubcolumn(const PathInData & key)
{
    const auto * node = subcolumns.findLeaf(key);
    if (!node)
        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in ColumnObject", key.getPath());

    /// findLeaf yields a pointer to const node; constness is cast away for the mutable accessor.
    return const_cast<SubcolumnsTree::Node *>(node)->data;
}
|
||||
|
||||
bool ColumnObject::hasSubcolumn(const PathInData & key) const
|
||||
{
|
||||
return subcolumns.findLeaf(key) != nullptr;
|
||||
}
|
||||
|
||||
void ColumnObject::addSubcolumn(const PathInData & key, MutableColumnPtr && subcolumn)
|
||||
{
|
||||
size_t new_size = subcolumn->size();
|
||||
bool inserted = subcolumns.add(key, Subcolumn(std::move(subcolumn), is_nullable));
|
||||
|
||||
if (!inserted)
|
||||
throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Subcolumn '{}' already exists", key.getPath());
|
||||
|
||||
if (num_rows == 0)
|
||||
num_rows = new_size;
|
||||
else if (new_size != num_rows)
|
||||
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH,
|
||||
"Size of subcolumn {} ({}) is inconsistent with column size ({})",
|
||||
key.getPath(), new_size, num_rows);
|
||||
}
|
||||
|
||||
void ColumnObject::addSubcolumn(const PathInData & key, size_t new_size)
|
||||
{
|
||||
bool inserted = subcolumns.add(key, Subcolumn(new_size, is_nullable));
|
||||
if (!inserted)
|
||||
throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Subcolumn '{}' already exists", key.getPath());
|
||||
|
||||
if (num_rows == 0)
|
||||
num_rows = new_size;
|
||||
else if (new_size != num_rows)
|
||||
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH,
|
||||
"Required size of subcolumn {} ({}) is inconsistent with column size ({})",
|
||||
key.getPath(), new_size, num_rows);
|
||||
}
|
||||
|
||||
void ColumnObject::addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size)
|
||||
{
|
||||
if (!key.hasNested())
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR,
|
||||
"Cannot add Nested subcolumn, because path doesn't contain Nested");
|
||||
|
||||
bool inserted = false;
|
||||
/// We find node that represents the same Nested type as @key.
|
||||
const auto * nested_node = subcolumns.findBestMatch(key);
|
||||
|
||||
if (nested_node)
|
||||
{
|
||||
/// Find any leaf of Nested subcolumn.
|
||||
const auto * leaf = subcolumns.findLeaf(nested_node, [&](const auto &) { return true; });
|
||||
assert(leaf);
|
||||
|
||||
/// Recreate subcolumn with default values and the same sizes of arrays.
|
||||
auto new_subcolumn = leaf->data.recreateWithDefaultValues(field_info);
|
||||
|
||||
/// It's possible that we have already inserted value from current row
|
||||
/// to this subcolumn. So, adjust size to expected.
|
||||
if (new_subcolumn.size() > new_size)
|
||||
new_subcolumn.popBack(new_subcolumn.size() - new_size);
|
||||
|
||||
assert(new_subcolumn.size() == new_size);
|
||||
inserted = subcolumns.add(key, new_subcolumn);
|
||||
}
|
||||
else
|
||||
{
|
||||
/// If node was not found just add subcolumn with empty arrays.
|
||||
inserted = subcolumns.add(key, Subcolumn(new_size, is_nullable));
|
||||
}
|
||||
|
||||
if (!inserted)
|
||||
throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Subcolumn '{}' already exists", key.getPath());
|
||||
|
||||
if (num_rows == 0)
|
||||
num_rows = new_size;
|
||||
}
|
||||
|
||||
/// Returns the paths of all subcolumns in iteration order of the tree.
PathsInData ColumnObject::getKeys() const
{
    PathsInData paths;
    paths.reserve(subcolumns.size());

    for (const auto & leaf : subcolumns)
    {
        const auto & path = leaf->path;
        paths.emplace_back(path);
    }

    return paths;
}
|
||||
|
||||
bool ColumnObject::isFinalized() const
|
||||
{
|
||||
return std::all_of(subcolumns.begin(), subcolumns.end(),
|
||||
[](const auto & entry) { return entry->data.isFinalized(); });
|
||||
}
|
||||
|
||||
void ColumnObject::finalize()
|
||||
{
|
||||
size_t old_size = size();
|
||||
SubcolumnsTree new_subcolumns;
|
||||
for (auto && entry : subcolumns)
|
||||
{
|
||||
const auto & least_common_type = entry->data.getLeastCommonType();
|
||||
|
||||
/// Do not add subcolumns, which consists only from NULLs.
|
||||
if (isNothing(getBaseTypeOfArray(least_common_type)))
|
||||
continue;
|
||||
|
||||
entry->data.finalize();
|
||||
new_subcolumns.add(entry->path, entry->data);
|
||||
}
|
||||
|
||||
/// If all subcolumns were skipped add a dummy subcolumn,
|
||||
/// because Tuple type must have at least one element.
|
||||
if (new_subcolumns.empty())
|
||||
new_subcolumns.add(PathInData{COLUMN_NAME_DUMMY}, Subcolumn{ColumnUInt8::create(old_size, 0), is_nullable});
|
||||
|
||||
std::swap(subcolumns, new_subcolumns);
|
||||
checkObjectHasNoAmbiguosPaths(getKeys());
|
||||
}
|
||||
|
||||
}
|
219
src/Columns/ColumnObject.h
Normal file
219
src/Columns/ColumnObject.h
Normal file
@ -0,0 +1,219 @@
|
||||
#pragma once
|
||||
|
||||
#include <Core/Field.h>
|
||||
#include <Core/Names.h>
|
||||
#include <Columns/IColumn.h>
|
||||
#include <Common/PODArray.h>
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <DataTypes/Serializations/JSONDataParser.h>
|
||||
#include <DataTypes/Serializations/SubcolumnsTree.h>
|
||||
|
||||
#include <DataTypes/IDataType.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
/// Info that represents a scalar or array field in a decomposed view.
|
||||
/// It allows to recreate field with different number
|
||||
/// of dimensions or nullability.
|
||||
struct FieldInfo
|
||||
{
|
||||
/// The common type of all scalars in the field.
|
||||
DataTypePtr scalar_type;
|
||||
|
||||
/// True if the field contains at least one NULL scalar.
|
||||
bool have_nulls;
|
||||
|
||||
/// If true then we have scalars with different types in array and
|
||||
/// we need to convert scalars to the common type.
|
||||
bool need_convert;
|
||||
|
||||
/// Number of dimensions in array. 0 if field is scalar.
|
||||
size_t num_dimensions;
|
||||
};
|
||||
|
||||
FieldInfo getFieldInfo(const Field & field);
|
||||
|
||||
/** A column that represents object with dynamic set of subcolumns.
|
||||
* Subcolumns are identified by paths in document and are stored in
|
||||
* a trie-like structure. ColumnObject is not suitable for writing into tables
|
||||
* and it should be converted to Tuple with fixed set of subcolumns before that.
|
||||
*/
|
||||
class ColumnObject final : public COWHelper<IColumn, ColumnObject>
|
||||
{
|
||||
public:
|
||||
/** Class that represents one subcolumn.
|
||||
* It stores values in several parts of column
|
||||
* and keeps current common type of all parts.
|
||||
* We add a new column part with a new type, when we insert a field,
|
||||
* which can't be converted to the current common type.
|
||||
* After insertion of all values subcolumn should be finalized
|
||||
* for writing and other operations.
|
||||
*/
|
||||
class Subcolumn
|
||||
{
|
||||
public:
|
||||
Subcolumn() = default;
|
||||
Subcolumn(size_t size_, bool is_nullable_);
|
||||
Subcolumn(MutableColumnPtr && data_, bool is_nullable_);
|
||||
|
||||
size_t size() const;
|
||||
size_t byteSize() const;
|
||||
size_t allocatedBytes() const;
|
||||
|
||||
/// A subcolumn is finalized when it consists of exactly one part
/// and there are no pending defaults counted in the prefix.
bool isFinalized() const { return data.size() == 1 && num_of_defaults_in_prefix == 0; }
|
||||
const DataTypePtr & getLeastCommonType() const { return least_common_type; }
|
||||
|
||||
/// Checks the consistency of column's parts stored in @data.
|
||||
void checkTypes() const;
|
||||
|
||||
/// Inserts a field whose scalars can be arbitrary, but whose number of
|
||||
/// dimensions should be consistent with current common type.
|
||||
void insert(Field field);
|
||||
void insert(Field field, FieldInfo info);
|
||||
|
||||
void insertDefault();
|
||||
void insertManyDefaults(size_t length);
|
||||
void insertRangeFrom(const Subcolumn & src, size_t start, size_t length);
|
||||
void popBack(size_t n);
|
||||
|
||||
/// Converts all column's parts to the common type and
|
||||
/// creates a single column that stores all values.
|
||||
void finalize();
|
||||
|
||||
/// Returns last inserted field.
|
||||
Field getLastField() const;
|
||||
|
||||
/// Recreates subcolumn with default scalar values and keeps sizes of arrays.
|
||||
/// Used to create columns of type Nested with consistent array sizes.
|
||||
Subcolumn recreateWithDefaultValues(const FieldInfo & field_info) const;
|
||||
|
||||
/// Returns single column if subcolumn is finalized.
|
||||
/// Otherwise -- undefined behaviour.
|
||||
IColumn & getFinalizedColumn();
|
||||
const IColumn & getFinalizedColumn() const;
|
||||
const ColumnPtr & getFinalizedColumnPtr() const;
|
||||
|
||||
friend class ColumnObject;
|
||||
|
||||
private:
|
||||
/// Current least common type of all values inserted to this subcolumn.
|
||||
DataTypePtr least_common_type;
|
||||
|
||||
/// If true then common type of subcolumn is Nullable
|
||||
/// and default values are NULLs.
|
||||
bool is_nullable = false;
|
||||
|
||||
/// Parts of column. Parts should be in increasing order in terms of subtypes/supertypes.
|
||||
/// That means that the least common type for i-th prefix is the type of i-th part
|
||||
/// and it's the supertype for all types of parts from 0 to i-1.
|
||||
std::vector<WrappedPtr> data;
|
||||
|
||||
/// Until we insert any non-default field we don't know further
|
||||
/// least common type and we count number of defaults in prefix,
|
||||
/// which will be converted to the default type of final common type.
|
||||
size_t num_of_defaults_in_prefix = 0;
|
||||
};
|
||||
|
||||
using SubcolumnsTree = SubcolumnsTree<Subcolumn>;
|
||||
|
||||
private:
|
||||
/// If true then all subcolumns are nullable.
|
||||
const bool is_nullable;
|
||||
|
||||
SubcolumnsTree subcolumns;
|
||||
size_t num_rows;
|
||||
|
||||
public:
|
||||
static constexpr auto COLUMN_NAME_DUMMY = "_dummy";
|
||||
|
||||
explicit ColumnObject(bool is_nullable_);
|
||||
ColumnObject(SubcolumnsTree && subcolumns_, bool is_nullable_);
|
||||
|
||||
/// Checks that all subcolumns have consistent sizes.
|
||||
void checkConsistency() const;
|
||||
|
||||
bool hasSubcolumn(const PathInData & key) const;
|
||||
|
||||
const Subcolumn & getSubcolumn(const PathInData & key) const;
|
||||
Subcolumn & getSubcolumn(const PathInData & key);
|
||||
|
||||
/// Increments the stored number of rows; subcolumns are not touched here.
void incrementNumRows() { ++num_rows; }
|
||||
|
||||
/// Adds a subcolumn from existing IColumn.
|
||||
void addSubcolumn(const PathInData & key, MutableColumnPtr && subcolumn);
|
||||
|
||||
/// Adds a subcolumn of specific size with default values.
|
||||
void addSubcolumn(const PathInData & key, size_t new_size);
|
||||
|
||||
/// Adds a subcolumn of type Nested of specific size with default values.
|
||||
/// It cares about consistency of sizes of Nested arrays.
|
||||
void addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size);
|
||||
|
||||
const SubcolumnsTree & getSubcolumns() const { return subcolumns; }
|
||||
SubcolumnsTree & getSubcolumns() { return subcolumns; }
|
||||
PathsInData getKeys() const;
|
||||
|
||||
/// Finalizes all subcolumns.
|
||||
void finalize();
|
||||
bool isFinalized() const;
|
||||
|
||||
/// Part of interface
|
||||
|
||||
const char * getFamilyName() const override { return "Object"; }
|
||||
TypeIndex getDataType() const override { return TypeIndex::Object; }
|
||||
|
||||
size_t size() const override;
|
||||
MutableColumnPtr cloneResized(size_t new_size) const override;
|
||||
size_t byteSize() const override;
|
||||
size_t allocatedBytes() const override;
|
||||
void forEachSubcolumn(ColumnCallback callback) override;
|
||||
void insert(const Field & field) override;
|
||||
void insertDefault() override;
|
||||
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
|
||||
ColumnPtr replicate(const Offsets & offsets) const override;
|
||||
void popBack(size_t length) override;
|
||||
Field operator[](size_t n) const override;
|
||||
void get(size_t n, Field & res) const override;
|
||||
|
||||
/// All other methods throw exception.
|
||||
|
||||
ColumnPtr decompress() const override { throwMustBeConcrete(); }
|
||||
StringRef getDataAt(size_t) const override { throwMustBeConcrete(); }
|
||||
bool isDefaultAt(size_t) const override { throwMustBeConcrete(); }
|
||||
void insertData(const char *, size_t) override { throwMustBeConcrete(); }
|
||||
StringRef serializeValueIntoArena(size_t, Arena &, char const *&) const override { throwMustBeConcrete(); }
|
||||
const char * deserializeAndInsertFromArena(const char *) override { throwMustBeConcrete(); }
|
||||
const char * skipSerializedInArena(const char *) const override { throwMustBeConcrete(); }
|
||||
void updateHashWithValue(size_t, SipHash &) const override { throwMustBeConcrete(); }
|
||||
void updateWeakHash32(WeakHash32 &) const override { throwMustBeConcrete(); }
|
||||
void updateHashFast(SipHash &) const override { throwMustBeConcrete(); }
|
||||
ColumnPtr filter(const Filter &, ssize_t) const override { throwMustBeConcrete(); }
|
||||
void expand(const Filter &, bool) override { throwMustBeConcrete(); }
|
||||
ColumnPtr permute(const Permutation &, size_t) const override { throwMustBeConcrete(); }
|
||||
ColumnPtr index(const IColumn &, size_t) const override { throwMustBeConcrete(); }
|
||||
int compareAt(size_t, size_t, const IColumn &, int) const override { throwMustBeConcrete(); }
|
||||
void compareColumn(const IColumn &, size_t, PaddedPODArray<UInt64> *, PaddedPODArray<Int8> &, int, int) const override { throwMustBeConcrete(); }
|
||||
bool hasEqualValues() const override { throwMustBeConcrete(); }
|
||||
void getPermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &) const override { throwMustBeConcrete(); }
|
||||
void updatePermutation(PermutationSortDirection, PermutationSortStability, size_t, int, Permutation &, EqualRanges &) const override { throwMustBeConcrete(); }
|
||||
MutableColumns scatter(ColumnIndex, const Selector &) const override { throwMustBeConcrete(); }
|
||||
void gather(ColumnGathererStream &) override { throwMustBeConcrete(); }
|
||||
void getExtremes(Field &, Field &) const override { throwMustBeConcrete(); }
|
||||
size_t byteSizeAt(size_t) const override { throwMustBeConcrete(); }
|
||||
double getRatioOfDefaultRows(double) const override { throwMustBeConcrete(); }
|
||||
void getIndicesOfNonDefaultRows(Offsets &, size_t, size_t) const override { throwMustBeConcrete(); }
|
||||
|
||||
private:
|
||||
/// Shared body of every unsupported IColumn method above: always throws LOGICAL_ERROR and never returns.
[[noreturn]] static void throwMustBeConcrete()
|
||||
{
|
||||
throw Exception("ColumnObject must be converted to ColumnTuple before use", ErrorCodes::LOGICAL_ERROR);
|
||||
}
|
||||
};
|
||||
|
||||
}
|
@ -288,7 +288,7 @@ void ColumnSparse::popBack(size_t n)
|
||||
ColumnPtr ColumnSparse::filter(const Filter & filt, ssize_t) const
|
||||
{
|
||||
if (_size != filt.size())
|
||||
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
|
||||
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), _size);
|
||||
|
||||
if (offsets->empty())
|
||||
{
|
||||
|
@ -381,7 +381,7 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
|
||||
{
|
||||
size_t size = data.size();
|
||||
if (size != filt.size())
|
||||
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
|
||||
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size);
|
||||
|
||||
auto res = this->create();
|
||||
Container & res_data = res->getData();
|
||||
@ -450,7 +450,7 @@ void ColumnVector<T>::applyZeroMap(const IColumn::Filter & filt, bool inverted)
|
||||
{
|
||||
size_t size = data.size();
|
||||
if (size != filt.size())
|
||||
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
|
||||
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size);
|
||||
|
||||
const UInt8 * filt_pos = filt.data();
|
||||
const UInt8 * filt_end = filt_pos + size;
|
||||
|
@ -192,7 +192,7 @@ namespace
|
||||
{
|
||||
const size_t size = src_offsets.size();
|
||||
if (size != filt.size())
|
||||
throw Exception("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
|
||||
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size);
|
||||
|
||||
ResultOffsetsBuilder result_offsets_builder(res_offsets);
|
||||
|
||||
|
@ -883,8 +883,8 @@ public:
|
||||
return toDayNum(years_lut[year - DATE_LUT_MIN_YEAR]);
|
||||
}
|
||||
|
||||
template <typename Date,
|
||||
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
|
||||
template <typename Date>
|
||||
requires std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>
|
||||
inline auto toStartOfQuarterInterval(Date d, UInt64 quarters) const
|
||||
{
|
||||
if (quarters == 1)
|
||||
@ -892,8 +892,8 @@ public:
|
||||
return toStartOfMonthInterval(d, quarters * 3);
|
||||
}
|
||||
|
||||
template <typename Date,
|
||||
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
|
||||
template <typename Date>
|
||||
requires std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>
|
||||
inline auto toStartOfMonthInterval(Date d, UInt64 months) const
|
||||
{
|
||||
if (months == 1)
|
||||
@ -906,8 +906,8 @@ public:
|
||||
return toDayNum(years_months_lut[month_total_index / months * months]);
|
||||
}
|
||||
|
||||
template <typename Date,
|
||||
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
|
||||
template <typename Date>
|
||||
requires std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>
|
||||
inline auto toStartOfWeekInterval(Date d, UInt64 weeks) const
|
||||
{
|
||||
if (weeks == 1)
|
||||
@ -920,8 +920,8 @@ public:
|
||||
return ExtendedDayNum(4 + (d - 4) / days * days);
|
||||
}
|
||||
|
||||
template <typename Date,
|
||||
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
|
||||
template <typename Date>
|
||||
requires std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>
|
||||
inline Time toStartOfDayInterval(Date d, UInt64 days) const
|
||||
{
|
||||
if (days == 1)
|
||||
@ -1219,10 +1219,8 @@ public:
|
||||
|
||||
/// If resulting month has fewer days than source month, then saturation can happen.
|
||||
/// Example: 31 Aug + 1 month = 30 Sep.
|
||||
template <
|
||||
typename DateTime,
|
||||
typename
|
||||
= std::enable_if_t<std::is_same_v<DateTime, UInt32> || std::is_same_v<DateTime, Int64> || std::is_same_v<DateTime, time_t>>>
|
||||
template <typename DateTime>
|
||||
requires std::is_same_v<DateTime, UInt32> || std::is_same_v<DateTime, Int64> || std::is_same_v<DateTime, time_t>
|
||||
inline Time NO_SANITIZE_UNDEFINED addMonths(DateTime t, Int64 delta) const
|
||||
{
|
||||
const auto result_day = addMonthsIndex(t, delta);
|
||||
@ -1247,8 +1245,8 @@ public:
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename Date,
|
||||
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
|
||||
template <typename Date>
|
||||
requires std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>
|
||||
inline auto NO_SANITIZE_UNDEFINED addMonths(Date d, Int64 delta) const
|
||||
{
|
||||
if constexpr (std::is_same_v<Date, DayNum>)
|
||||
@ -1280,10 +1278,8 @@ public:
|
||||
}
|
||||
|
||||
/// Saturation can occur if 29 Feb is mapped to non-leap year.
|
||||
template <
|
||||
typename DateTime,
|
||||
typename
|
||||
= std::enable_if_t<std::is_same_v<DateTime, UInt32> || std::is_same_v<DateTime, Int64> || std::is_same_v<DateTime, time_t>>>
|
||||
template <typename DateTime>
|
||||
requires std::is_same_v<DateTime, UInt32> || std::is_same_v<DateTime, Int64> || std::is_same_v<DateTime, time_t>
|
||||
inline Time addYears(DateTime t, Int64 delta) const
|
||||
{
|
||||
auto result_day = addYearsIndex(t, delta);
|
||||
@ -1308,8 +1304,8 @@ public:
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename Date,
|
||||
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
|
||||
template <typename Date>
|
||||
requires std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>
|
||||
inline auto addYears(Date d, Int64 delta) const
|
||||
{
|
||||
if constexpr (std::is_same_v<Date, DayNum>)
|
||||
|
@ -613,6 +613,7 @@
|
||||
M(642, CANNOT_PACK_ARCHIVE) \
|
||||
M(643, CANNOT_UNPACK_ARCHIVE) \
|
||||
M(644, REMOTE_FS_OBJECT_CACHE_ERROR) \
|
||||
M(645, NUMBER_OF_DIMENSIONS_MISMATHED) \
|
||||
\
|
||||
M(999, KEEPER_EXCEPTION) \
|
||||
M(1000, POCO_EXCEPTION) \
|
||||
|
@ -205,7 +205,8 @@ void rethrowFirstException(const Exceptions & exceptions);
|
||||
|
||||
|
||||
template <typename T>
|
||||
std::enable_if_t<std::is_pointer_v<T>, T> exception_cast(std::exception_ptr e)
|
||||
requires std::is_pointer_v<T>
|
||||
T exception_cast(std::exception_ptr e)
|
||||
{
|
||||
try
|
||||
{
|
||||
|
@ -46,6 +46,11 @@ public:
|
||||
throw Exception("Cannot convert Map to " + demangle(typeid(T).name()), ErrorCodes::CANNOT_CONVERT_TYPE);
|
||||
}
|
||||
|
||||
T operator() (const Object &) const
|
||||
{
|
||||
throw Exception("Cannot convert Object to " + demangle(typeid(T).name()), ErrorCodes::CANNOT_CONVERT_TYPE);
|
||||
}
|
||||
|
||||
T operator() (const UInt64 & x) const { return T(x); }
|
||||
T operator() (const Int64 & x) const { return T(x); }
|
||||
T operator() (const Int128 & x) const { return T(x); }
|
||||
@ -113,7 +118,8 @@ public:
|
||||
throw Exception("Cannot convert AggregateFunctionStateData to " + demangle(typeid(T).name()), ErrorCodes::CANNOT_CONVERT_TYPE);
|
||||
}
|
||||
|
||||
template <typename U, typename = std::enable_if_t<is_big_int_v<U>> >
|
||||
template <typename U>
|
||||
requires is_big_int_v<U>
|
||||
T operator() (const U & x) const
|
||||
{
|
||||
if constexpr (is_decimal<T>)
|
||||
|
@ -95,6 +95,23 @@ String FieldVisitorDump::operator() (const Map & x) const
|
||||
return wb.str();
|
||||
}
|
||||
|
||||
/// Dumps an Object as `Object_((key, value), (key, value), ...)`.
/// Keys are written as is; every value is dumped recursively with this same visitor.
String FieldVisitorDump::operator() (const Object & x) const
{
    WriteBufferFromOwnString out;
    out << "Object_(";

    /// A ", " separator goes before every element except the first one.
    bool is_first = true;
    for (const auto & [key, value] : x)
    {
        if (!is_first)
            out << ", ";
        is_first = false;

        out << "(" << key << ", " << applyVisitor(*this, value) << ")";
    }

    out << ')';
    return out.str();
}
|
||||
|
||||
String FieldVisitorDump::operator() (const AggregateFunctionStateData & x) const
|
||||
{
|
||||
WriteBufferFromOwnString wb;
|
||||
|
@ -22,6 +22,7 @@ public:
|
||||
String operator() (const Array & x) const;
|
||||
String operator() (const Tuple & x) const;
|
||||
String operator() (const Map & x) const;
|
||||
String operator() (const Object & x) const;
|
||||
String operator() (const DecimalField<Decimal32> & x) const;
|
||||
String operator() (const DecimalField<Decimal64> & x) const;
|
||||
String operator() (const DecimalField<Decimal128> & x) const;
|
||||
|
@ -94,6 +94,19 @@ void FieldVisitorHash::operator() (const Array & x) const
|
||||
applyVisitor(*this, elem);
|
||||
}
|
||||
|
||||
void FieldVisitorHash::operator() (const Object & x) const
|
||||
{
|
||||
UInt8 type = Field::Types::Object;
|
||||
hash.update(type);
|
||||
hash.update(x.size());
|
||||
|
||||
for (const auto & [key, value]: x)
|
||||
{
|
||||
hash.update(key);
|
||||
applyVisitor(*this, value);
|
||||
}
|
||||
}
|
||||
|
||||
void FieldVisitorHash::operator() (const DecimalField<Decimal32> & x) const
|
||||
{
|
||||
UInt8 type = Field::Types::Decimal32;
|
||||
|
@ -28,6 +28,7 @@ public:
|
||||
void operator() (const Array & x) const;
|
||||
void operator() (const Tuple & x) const;
|
||||
void operator() (const Map & x) const;
|
||||
void operator() (const Object & x) const;
|
||||
void operator() (const DecimalField<Decimal32> & x) const;
|
||||
void operator() (const DecimalField<Decimal64> & x) const;
|
||||
void operator() (const DecimalField<Decimal128> & x) const;
|
||||
|
@ -26,6 +26,7 @@ bool FieldVisitorSum::operator() (String &) const { throw Exception("Cannot sum
|
||||
bool FieldVisitorSum::operator() (Array &) const { throw Exception("Cannot sum Arrays", ErrorCodes::LOGICAL_ERROR); }
|
||||
bool FieldVisitorSum::operator() (Tuple &) const { throw Exception("Cannot sum Tuples", ErrorCodes::LOGICAL_ERROR); }
|
||||
bool FieldVisitorSum::operator() (Map &) const { throw Exception("Cannot sum Maps", ErrorCodes::LOGICAL_ERROR); }
|
||||
bool FieldVisitorSum::operator() (Object &) const { throw Exception("Cannot sum Objects", ErrorCodes::LOGICAL_ERROR); }
|
||||
bool FieldVisitorSum::operator() (UUID &) const { throw Exception("Cannot sum UUIDs", ErrorCodes::LOGICAL_ERROR); }
|
||||
|
||||
bool FieldVisitorSum::operator() (AggregateFunctionStateData &) const
|
||||
|
@ -25,6 +25,7 @@ public:
|
||||
bool operator() (Array &) const;
|
||||
bool operator() (Tuple &) const;
|
||||
bool operator() (Map &) const;
|
||||
bool operator() (Object &) const;
|
||||
bool operator() (UUID &) const;
|
||||
bool operator() (AggregateFunctionStateData &) const;
|
||||
bool operator() (bool &) const;
|
||||
@ -36,7 +37,8 @@ public:
|
||||
return x.getValue() != T(0);
|
||||
}
|
||||
|
||||
template <typename T, typename = std::enable_if_t<is_big_int_v<T>> >
|
||||
template <typename T>
|
||||
requires is_big_int_v<T>
|
||||
bool operator() (T & x) const
|
||||
{
|
||||
x += rhs.reinterpret<T>();
|
||||
|
@ -126,5 +126,24 @@ String FieldVisitorToString::operator() (const Map & x) const
|
||||
return wb.str();
|
||||
}
|
||||
|
||||
/// Renders an Object as `{"key": value, "key": value, ...}`.
/// Keys are written double-quoted; values are rendered recursively with this same visitor.
String FieldVisitorToString::operator() (const Object & x) const
{
    WriteBufferFromOwnString out;
    out << '{';

    /// A ", " separator goes before every element except the first one.
    bool need_separator = false;
    for (const auto & [key, value] : x)
    {
        if (need_separator)
            out << ", ";
        need_separator = true;

        writeDoubleQuoted(key, out);
        out << ": " << applyVisitor(*this, value);
    }

    out << '}';
    return out.str();
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -22,6 +22,7 @@ public:
|
||||
String operator() (const Array & x) const;
|
||||
String operator() (const Tuple & x) const;
|
||||
String operator() (const Map & x) const;
|
||||
String operator() (const Object & x) const;
|
||||
String operator() (const DecimalField<Decimal32> & x) const;
|
||||
String operator() (const DecimalField<Decimal64> & x) const;
|
||||
String operator() (const DecimalField<Decimal128> & x) const;
|
||||
|
@ -66,6 +66,20 @@ void FieldVisitorWriteBinary::operator() (const Map & x, WriteBuffer & buf) cons
|
||||
}
|
||||
}
|
||||
|
||||
void FieldVisitorWriteBinary::operator() (const Object & x, WriteBuffer & buf) const
|
||||
{
|
||||
const size_t size = x.size();
|
||||
writeBinary(size, buf);
|
||||
|
||||
for (const auto & [key, value] : x)
|
||||
{
|
||||
const UInt8 type = value.getType();
|
||||
writeBinary(type, buf);
|
||||
writeBinary(key, buf);
|
||||
Field::dispatch([&buf] (const auto & val) { FieldVisitorWriteBinary()(val, buf); }, value);
|
||||
}
|
||||
}
|
||||
|
||||
void FieldVisitorWriteBinary::operator()(const bool & x, WriteBuffer & buf) const
|
||||
{
|
||||
writeBinary(UInt8(x), buf);
|
||||
|
@ -21,6 +21,7 @@ public:
|
||||
void operator() (const Array & x, WriteBuffer & buf) const;
|
||||
void operator() (const Tuple & x, WriteBuffer & buf) const;
|
||||
void operator() (const Map & x, WriteBuffer & buf) const;
|
||||
void operator() (const Object & x, WriteBuffer & buf) const;
|
||||
void operator() (const DecimalField<Decimal32> & x, WriteBuffer & buf) const;
|
||||
void operator() (const DecimalField<Decimal64> & x, WriteBuffer & buf) const;
|
||||
void operator() (const DecimalField<Decimal128> & x, WriteBuffer & buf) const;
|
||||
|
@ -46,7 +46,16 @@ FileSegment::State FileSegment::state() const
|
||||
size_t FileSegment::getDownloadOffset() const
|
||||
{
|
||||
std::lock_guard segment_lock(mutex);
|
||||
return range().left + downloaded_size;
|
||||
return range().left + getDownloadedSize(segment_lock);
|
||||
}
|
||||
|
||||
size_t FileSegment::getDownloadedSize(std::lock_guard<std::mutex> & /* segment_lock */) const
|
||||
{
|
||||
if (download_state == State::DOWNLOADED)
|
||||
return downloaded_size;
|
||||
|
||||
std::lock_guard download_lock(download_mutex);
|
||||
return downloaded_size;
|
||||
}
|
||||
|
||||
String FileSegment::getCallerId()
|
||||
@ -174,7 +183,12 @@ void FileSegment::write(const char * from, size_t size)
|
||||
try
|
||||
{
|
||||
cache_writer->write(from, size);
|
||||
|
||||
std::lock_guard download_lock(download_mutex);
|
||||
|
||||
cache_writer->next();
|
||||
|
||||
downloaded_size += size;
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
@ -189,9 +203,6 @@ void FileSegment::write(const char * from, size_t size)
|
||||
|
||||
throw;
|
||||
}
|
||||
|
||||
std::lock_guard segment_lock(mutex);
|
||||
downloaded_size += size;
|
||||
}
|
||||
|
||||
FileSegment::State FileSegment::wait()
|
||||
@ -225,15 +236,15 @@ bool FileSegment::reserve(size_t size)
|
||||
{
|
||||
std::lock_guard segment_lock(mutex);
|
||||
|
||||
auto caller_id = getCallerId();
|
||||
if (downloader_id != caller_id)
|
||||
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Space can be reserved only by downloader (current: {}, expected: {})", caller_id, downloader_id);
|
||||
|
||||
if (downloaded_size + size > range().size())
|
||||
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR,
|
||||
"Attempt to reserve space too much space ({}) for file segment with range: {} (downloaded size: {})",
|
||||
size, range().toString(), downloaded_size);
|
||||
|
||||
auto caller_id = getCallerId();
|
||||
if (downloader_id != caller_id)
|
||||
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Space can be reserved only by downloader (current: {}, expected: {})", caller_id, downloader_id);
|
||||
|
||||
assert(reserved_size >= downloaded_size);
|
||||
}
|
||||
|
||||
@ -323,7 +334,7 @@ void FileSegment::complete()
|
||||
if (download_state == State::SKIP_CACHE || detached)
|
||||
return;
|
||||
|
||||
if (downloaded_size == range().size() && download_state != State::DOWNLOADED)
|
||||
if (download_state != State::DOWNLOADED && getDownloadedSize(segment_lock) == range().size())
|
||||
setDownloaded(segment_lock);
|
||||
|
||||
if (download_state == State::DOWNLOADING || download_state == State::EMPTY)
|
||||
@ -350,10 +361,11 @@ void FileSegment::completeImpl(bool allow_non_strict_checking)
|
||||
|
||||
if (!download_can_continue)
|
||||
{
|
||||
if (!downloaded_size)
|
||||
size_t current_downloaded_size = getDownloadedSize(segment_lock);
|
||||
if (current_downloaded_size == 0)
|
||||
{
|
||||
download_state = State::SKIP_CACHE;
|
||||
LOG_TEST(log, "Remove cell {} (downloaded: {})", range().toString(), downloaded_size);
|
||||
LOG_TEST(log, "Remove cell {} (nothing downloaded)", range().toString());
|
||||
cache->remove(key(), offset(), cache_lock, segment_lock);
|
||||
|
||||
detached = true;
|
||||
@ -366,7 +378,7 @@ void FileSegment::completeImpl(bool allow_non_strict_checking)
|
||||
* in FileSegmentsHolder represent a contiguous range, so we can resize
|
||||
* it only when nobody needs it.
|
||||
*/
|
||||
LOG_TEST(log, "Resize cell {} to downloaded: {}", range().toString(), downloaded_size);
|
||||
LOG_TEST(log, "Resize cell {} to downloaded: {}", range().toString(), current_downloaded_size);
|
||||
cache->reduceSizeToDownloaded(key(), offset(), cache_lock, segment_lock);
|
||||
|
||||
detached = true;
|
||||
@ -397,7 +409,7 @@ String FileSegment::getInfoForLog() const
|
||||
WriteBufferFromOwnString info;
|
||||
info << "File segment: " << range().toString() << ", ";
|
||||
info << "state: " << download_state << ", ";
|
||||
info << "downloaded size: " << downloaded_size << ", ";
|
||||
info << "downloaded size: " << getDownloadedSize(segment_lock) << ", ";
|
||||
info << "downloader id: " << downloader_id << ", ";
|
||||
info << "caller id: " << getCallerId();
|
||||
|
||||
|
@ -129,6 +129,7 @@ private:
|
||||
void setDownloaded(std::lock_guard<std::mutex> & segment_lock);
|
||||
static String getCallerIdImpl(bool allow_non_strict_checking = false);
|
||||
void resetDownloaderImpl(std::lock_guard<std::mutex> & segment_lock);
|
||||
size_t getDownloadedSize(std::lock_guard<std::mutex> & segment_lock) const;
|
||||
|
||||
const Range segment_range;
|
||||
|
||||
@ -144,6 +145,14 @@ private:
|
||||
mutable std::mutex mutex;
|
||||
std::condition_variable cv;
|
||||
|
||||
/// Protects downloaded_size access with actual write into fs.
|
||||
/// downloaded_size is not protected by download_mutex in methods which
|
||||
/// can never be run in parallel to FileSegment::write() method
|
||||
/// as downloaded_size is updated only in FileSegment::write() method.
|
||||
/// Such methods are identified by isDownloader() check at their start,
|
||||
/// e.g. they are executed strictly by the same thread, sequentially.
|
||||
mutable std::mutex download_mutex;
|
||||
|
||||
Key file_key;
|
||||
IFileCache * cache;
|
||||
|
||||
|
@ -1,5 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
#include <base/StringRef.h>
|
||||
#include <base/logger_useful.h>
|
||||
|
||||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
|
||||
#include <Common/Arena.h>
|
||||
#include <Common/getResource.h>
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
@ -10,11 +16,6 @@
|
||||
#include <IO/readFloatText.h>
|
||||
#include <IO/ZstdInflatingReadBuffer.h>
|
||||
|
||||
#include <base/StringRef.h>
|
||||
#include <base/logger_useful.h>
|
||||
|
||||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -34,7 +35,6 @@ namespace ErrorCodes
|
||||
|
||||
class FrequencyHolder
|
||||
{
|
||||
|
||||
public:
|
||||
struct Language
|
||||
{
|
||||
@ -52,6 +52,7 @@ public:
|
||||
public:
|
||||
using Map = HashMap<StringRef, Float64>;
|
||||
using Container = std::vector<Language>;
|
||||
|
||||
using EncodingMap = HashMap<UInt16, Float64>;
|
||||
using EncodingContainer = std::vector<Encoding>;
|
||||
|
||||
@ -61,6 +62,30 @@ public:
|
||||
return instance;
|
||||
}
|
||||
|
||||
/// Read-only access to the emotional dictionary stored in this holder.
/// Only returns the member: no locking and no loading happens here.
const Map & getEmotionalDict() const
|
||||
{
|
||||
return emotional_dict;
|
||||
}
|
||||
|
||||
/// Read-only access to the encodings frequency container stored in this holder.
/// Only returns the member: no locking and no loading happens here.
const EncodingContainer & getEncodingsFrequency() const
|
||||
{
|
||||
return encodings_freq;
|
||||
}
|
||||
|
||||
/// Read-only access to the programming languages frequency container stored in this holder.
/// Only returns the member: no locking and no loading happens here.
const Container & getProgrammingFrequency() const
|
||||
{
|
||||
return programming_freq;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
/// Eagerly loads all three datasets (emotional dictionary, encodings frequencies,
/// programming languages frequencies) at construction time.
FrequencyHolder()
|
||||
{
|
||||
loadEmotionalDict();
|
||||
loadEncodingsFrequency();
|
||||
loadProgrammingFrequency();
|
||||
}
|
||||
|
||||
void loadEncodingsFrequency()
|
||||
{
|
||||
Poco::Logger * log = &Poco::Logger::get("EncodingsFrequency");
|
||||
@ -119,7 +144,6 @@ public:
|
||||
LOG_TRACE(log, "Charset frequencies was added, charsets count: {}", encodings_freq.size());
|
||||
}
|
||||
|
||||
|
||||
void loadEmotionalDict()
|
||||
{
|
||||
Poco::Logger * log = &Poco::Logger::get("EmotionalDict");
|
||||
@ -158,7 +182,6 @@ public:
|
||||
LOG_TRACE(log, "Emotional dictionary was added. Word count: {}", std::to_string(count));
|
||||
}
|
||||
|
||||
|
||||
void loadProgrammingFrequency()
|
||||
{
|
||||
Poco::Logger * log = &Poco::Logger::get("ProgrammingFrequency");
|
||||
@ -211,42 +234,10 @@ public:
|
||||
LOG_TRACE(log, "Programming languages frequencies was added");
|
||||
}
|
||||
|
||||
const Map & getEmotionalDict()
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
if (emotional_dict.empty())
|
||||
loadEmotionalDict();
|
||||
|
||||
return emotional_dict;
|
||||
}
|
||||
|
||||
|
||||
const EncodingContainer & getEncodingsFrequency()
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
if (encodings_freq.empty())
|
||||
loadEncodingsFrequency();
|
||||
|
||||
return encodings_freq;
|
||||
}
|
||||
|
||||
const Container & getProgrammingFrequency()
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
if (programming_freq.empty())
|
||||
loadProgrammingFrequency();
|
||||
|
||||
return programming_freq;
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
Arena string_pool;
|
||||
|
||||
Map emotional_dict;
|
||||
Container programming_freq;
|
||||
EncodingContainer encodings_freq;
|
||||
|
||||
std::mutex mutex;
|
||||
};
|
||||
}
|
||||
|
@ -130,6 +130,7 @@ public:
|
||||
IntervalTree() { nodes.resize(1); }
|
||||
|
||||
template <typename TValue = Value, std::enable_if_t<std::is_same_v<TValue, IntervalTreeVoidValue>, bool> = true>
|
||||
requires std::is_same_v<Value, IntervalTreeVoidValue>
|
||||
ALWAYS_INLINE bool emplace(Interval interval)
|
||||
{
|
||||
assert(!tree_is_built);
|
||||
|
@ -76,7 +76,8 @@ public:
|
||||
void add(const char * value) { add(std::make_unique<JSONString>(value)); }
|
||||
void add(bool value) { add(std::make_unique<JSONBool>(std::move(value))); }
|
||||
|
||||
template <typename T, std::enable_if_t<std::is_arithmetic_v<T>, bool> = true>
|
||||
template <typename T>
|
||||
requires std::is_arithmetic_v<T>
|
||||
void add(T value) { add(std::make_unique<JSONNumber<T>>(value)); }
|
||||
|
||||
void format(const FormatSettings & settings, FormatContext & context) override;
|
||||
@ -100,7 +101,8 @@ public:
|
||||
void add(std::string key, std::string_view value) { add(std::move(key), std::make_unique<JSONString>(value)); }
|
||||
void add(std::string key, bool value) { add(std::move(key), std::make_unique<JSONBool>(std::move(value))); }
|
||||
|
||||
template <typename T, std::enable_if_t<std::is_arithmetic_v<T>, bool> = true>
|
||||
template <typename T>
|
||||
requires std::is_arithmetic_v<T>
|
||||
void add(std::string key, T value) { add(std::move(key), std::make_unique<JSONNumber<T>>(value)); }
|
||||
|
||||
void format(const FormatSettings & settings, FormatContext & context) override;
|
||||
|
@ -82,7 +82,8 @@ private:
|
||||
#endif
|
||||
|
||||
public:
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
StringSearcher(const CharT * needle_, const size_t needle_size_)
|
||||
: needle{reinterpret_cast<const uint8_t *>(needle_)}, needle_size{needle_size_}
|
||||
{
|
||||
@ -191,7 +192,8 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
ALWAYS_INLINE bool compareTrivial(const CharT * haystack_pos, const CharT * const haystack_end, const uint8_t * needle_pos) const
|
||||
{
|
||||
while (haystack_pos < haystack_end && needle_pos < needle_end)
|
||||
@ -217,7 +219,8 @@ public:
|
||||
return needle_pos == needle_end;
|
||||
}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * haystack_end, const CharT * pos) const
|
||||
{
|
||||
|
||||
@ -262,7 +265,8 @@ public:
|
||||
|
||||
/** Returns haystack_end if not found.
|
||||
*/
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
|
||||
{
|
||||
if (0 == needle_size)
|
||||
@ -338,7 +342,8 @@ public:
|
||||
return haystack_end;
|
||||
}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
const CharT * search(const CharT * haystack, const size_t haystack_size) const
|
||||
{
|
||||
return search(haystack, haystack + haystack_size);
|
||||
@ -367,7 +372,8 @@ private:
|
||||
#endif
|
||||
|
||||
public:
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
StringSearcher(const CharT * needle_, const size_t needle_size)
|
||||
: needle{reinterpret_cast<const uint8_t *>(needle_)}, needle_end{needle + needle_size}
|
||||
{
|
||||
@ -399,7 +405,8 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * /*haystack_end*/, const CharT * pos) const
|
||||
{
|
||||
#ifdef __SSE4_1__
|
||||
@ -453,7 +460,8 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
|
||||
{
|
||||
if (needle == needle_end)
|
||||
@ -540,7 +548,8 @@ public:
|
||||
return haystack_end;
|
||||
}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
const CharT * search(const CharT * haystack, const size_t haystack_size) const
|
||||
{
|
||||
return search(haystack, haystack + haystack_size);
|
||||
@ -568,7 +577,8 @@ private:
|
||||
#endif
|
||||
|
||||
public:
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
StringSearcher(const CharT * needle_, const size_t needle_size)
|
||||
: needle{reinterpret_cast<const uint8_t *>(needle_)}, needle_end{needle + needle_size}
|
||||
{
|
||||
@ -596,7 +606,8 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * /*haystack_end*/, const CharT * pos) const
|
||||
{
|
||||
#ifdef __SSE4_1__
|
||||
@ -642,7 +653,8 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
|
||||
{
|
||||
if (needle == needle_end)
|
||||
@ -722,7 +734,8 @@ public:
|
||||
return haystack_end;
|
||||
}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
const CharT * search(const CharT * haystack, const size_t haystack_size) const
|
||||
{
|
||||
return search(haystack, haystack + haystack_size);
|
||||
@ -740,7 +753,8 @@ class TokenSearcher : public StringSearcherBase
|
||||
size_t needle_size;
|
||||
|
||||
public:
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
TokenSearcher(const CharT * needle_, const size_t needle_size_)
|
||||
: searcher{needle_, needle_size_},
|
||||
needle_size(needle_size_)
|
||||
@ -752,7 +766,8 @@ public:
|
||||
|
||||
}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
ALWAYS_INLINE bool compare(const CharT * haystack, const CharT * haystack_end, const CharT * pos) const
|
||||
{
|
||||
// use searcher only if pos is at the beginning of a token and pos + searcher.needle_size is the end of that token.
|
||||
@ -762,7 +777,8 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
|
||||
{
|
||||
// use searcher.search(), then verify that returned value is a token
|
||||
@ -781,13 +797,15 @@ public:
|
||||
return haystack_end;
|
||||
}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
const CharT * search(const CharT * haystack, const size_t haystack_size) const
|
||||
{
|
||||
return search(haystack, haystack + haystack_size);
|
||||
}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
ALWAYS_INLINE bool isToken(const CharT * haystack, const CharT * const haystack_end, const CharT* p) const
|
||||
{
|
||||
return (p == haystack || isTokenSeparator(*(p - 1)))
|
||||
@ -819,11 +837,13 @@ struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase
|
||||
{
|
||||
const char * const needle;
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
LibCASCIICaseSensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */)
|
||||
: needle(reinterpret_cast<const char *>(needle_)) {}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
|
||||
{
|
||||
const auto * res = strstr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle));
|
||||
@ -832,7 +852,8 @@ struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase
|
||||
return reinterpret_cast<const CharT *>(res);
|
||||
}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
const CharT * search(const CharT * haystack, const size_t haystack_size) const
|
||||
{
|
||||
return search(haystack, haystack + haystack_size);
|
||||
@ -843,11 +864,13 @@ struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase
|
||||
{
|
||||
const char * const needle;
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
LibCASCIICaseInsensitiveStringSearcher(const CharT * const needle_, const size_t /* needle_size */)
|
||||
: needle(reinterpret_cast<const char *>(needle_)) {}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
const CharT * search(const CharT * haystack, const CharT * const haystack_end) const
|
||||
{
|
||||
const auto * res = strcasestr(reinterpret_cast<const char *>(haystack), reinterpret_cast<const char *>(needle));
|
||||
@ -856,7 +879,8 @@ struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase
|
||||
return reinterpret_cast<const CharT *>(res);
|
||||
}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
const CharT * search(const CharT * haystack, const size_t haystack_size) const
|
||||
{
|
||||
return search(haystack, haystack + haystack_size);
|
||||
|
@ -9,7 +9,6 @@
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
#include <unordered_set>
|
||||
|
||||
#include <fcntl.h>
|
||||
@ -21,6 +20,8 @@
|
||||
#include <sys/types.h>
|
||||
#include <dirent.h>
|
||||
|
||||
#include <boost/algorithm/string/split.hpp>
|
||||
|
||||
#include <base/errnoToString.h>
|
||||
|
||||
|
||||
@ -247,9 +248,9 @@ static_assert(sizeof(raw_events_info) / sizeof(raw_events_info[0]) == NUMBER_OF_
|
||||
#undef CACHE_EVENT
|
||||
|
||||
// A map of event name -> event index, to parse event list in settings.
|
||||
static std::unordered_map<std::string, size_t> populateEventMap()
|
||||
static std::unordered_map<std::string_view, size_t> populateEventMap()
|
||||
{
|
||||
std::unordered_map<std::string, size_t> name_to_index;
|
||||
std::unordered_map<std::string_view, size_t> name_to_index;
|
||||
name_to_index.reserve(NUMBER_OF_RAW_EVENTS);
|
||||
|
||||
for (size_t i = 0; i < NUMBER_OF_RAW_EVENTS; ++i)
|
||||
@ -455,10 +456,10 @@ std::vector<size_t> PerfEventsCounters::eventIndicesFromString(const std::string
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<std::string> event_names;
|
||||
boost::split(event_names, events_list, [](char c) { return c == ','; });
|
||||
|
||||
std::istringstream iss(events_list); // STYLE_CHECK_ALLOW_STD_STRING_STREAM
|
||||
std::string event_name;
|
||||
while (std::getline(iss, event_name, ','))
|
||||
for (auto & event_name : event_names)
|
||||
{
|
||||
// Allow spaces at the beginning of the token, so that you can write 'a, b'.
|
||||
event_name.erase(0, event_name.find_first_not_of(' '));
|
||||
|
@ -75,7 +75,8 @@ inline size_t countCodePoints(const UInt8 * data, size_t size)
|
||||
}
|
||||
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
size_t convertCodePointToUTF8(int code_point, CharT * out_bytes, size_t out_length)
|
||||
{
|
||||
static const Poco::UTF8Encoding utf8;
|
||||
@ -84,7 +85,8 @@ size_t convertCodePointToUTF8(int code_point, CharT * out_bytes, size_t out_leng
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
|
||||
template <typename CharT>
|
||||
requires (sizeof(CharT) == 1)
|
||||
std::optional<uint32_t> convertUTF8ToCodePoint(const CharT * in_bytes, size_t in_length)
|
||||
{
|
||||
static const Poco::UTF8Encoding utf8;
|
||||
|
@ -13,6 +13,9 @@
|
||||
#cmakedefine01 USE_CASSANDRA
|
||||
#cmakedefine01 USE_SENTRY
|
||||
#cmakedefine01 USE_GRPC
|
||||
#cmakedefine01 USE_SIMDJSON
|
||||
#cmakedefine01 USE_RAPIDJSON
|
||||
|
||||
#cmakedefine01 USE_DATASKETCHES
|
||||
#cmakedefine01 USE_YAML_CPP
|
||||
#cmakedefine01 CLICKHOUSE_SPLIT_BINARY
|
||||
|
@ -25,7 +25,8 @@ namespace DB
|
||||
* In the rest, behaves like a dynamic_cast.
|
||||
*/
|
||||
template <typename To, typename From>
|
||||
std::enable_if_t<std::is_reference_v<To>, To> typeid_cast(From & from)
|
||||
requires std::is_reference_v<To>
|
||||
To typeid_cast(From & from)
|
||||
{
|
||||
try
|
||||
{
|
||||
@ -43,7 +44,8 @@ std::enable_if_t<std::is_reference_v<To>, To> typeid_cast(From & from)
|
||||
|
||||
|
||||
template <typename To, typename From>
|
||||
std::enable_if_t<std::is_pointer_v<To>, To> typeid_cast(From * from)
|
||||
requires std::is_pointer_v<To>
|
||||
To typeid_cast(From * from)
|
||||
{
|
||||
try
|
||||
{
|
||||
@ -60,7 +62,8 @@ std::enable_if_t<std::is_pointer_v<To>, To> typeid_cast(From * from)
|
||||
|
||||
|
||||
template <typename To, typename From>
|
||||
std::enable_if_t<is_shared_ptr_v<To>, To> typeid_cast(const std::shared_ptr<From> & from)
|
||||
requires is_shared_ptr_v<To>
|
||||
To typeid_cast(const std::shared_ptr<From> & from)
|
||||
{
|
||||
try
|
||||
{
|
||||
|
@ -726,18 +726,6 @@ void convertToFullIfSparse(Block & block)
|
||||
column.column = recursiveRemoveSparse(column.column);
|
||||
}
|
||||
|
||||
/// Extracts the column described by `column` from `block`.
/// The column is looked up by its name in storage and decompressed; when `column`
/// is a subcolumn, the subcolumn is then obtained through the storage type.
ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & column)
{
    auto storage_column = block.getByName(column.getNameInStorage()).column;
    storage_column = storage_column->decompress();

    /// Not a subcolumn: the decompressed storage column is the answer.
    if (!column.isSubcolumn())
        return storage_column;

    return column.getTypeInStorage()->getSubcolumn(column.getSubcolumnName(), storage_column);
}
|
||||
|
||||
|
||||
Block materializeBlock(const Block & block)
|
||||
{
|
||||
if (!block)
|
||||
|
@ -196,10 +196,6 @@ void getBlocksDifference(const Block & lhs, const Block & rhs, std::string & out
|
||||
|
||||
void convertToFullIfSparse(Block & block);
|
||||
|
||||
/// Helps in-memory storages to extract columns from block.
|
||||
/// Properly handles cases, when column is a subcolumn and when it is compressed.
|
||||
ColumnPtr getColumnFromBlock(const Block & block, const NameAndTypePair & column);
|
||||
|
||||
/// Converts columns-constants to full columns ("materializes" them).
|
||||
Block materializeBlock(const Block & block);
|
||||
void materializeBlockInplace(Block & block);
|
||||
|
@ -115,8 +115,8 @@ private:
|
||||
}
|
||||
|
||||
template <typename T, typename U>
|
||||
static std::enable_if_t<is_decimal<T> && is_decimal<U>, Shift>
|
||||
getScales(const DataTypePtr & left_type, const DataTypePtr & right_type)
|
||||
requires is_decimal<T> && is_decimal<U>
|
||||
static Shift getScales(const DataTypePtr & left_type, const DataTypePtr & right_type)
|
||||
{
|
||||
const DataTypeDecimalBase<T> * decimal0 = checkDecimalBase<T>(*left_type);
|
||||
const DataTypeDecimalBase<U> * decimal1 = checkDecimalBase<U>(*right_type);
|
||||
@ -137,8 +137,8 @@ private:
|
||||
}
|
||||
|
||||
template <typename T, typename U>
|
||||
static std::enable_if_t<is_decimal<T> && !is_decimal<U>, Shift>
|
||||
getScales(const DataTypePtr & left_type, const DataTypePtr &)
|
||||
requires is_decimal<T> && (!is_decimal<U>)
|
||||
static Shift getScales(const DataTypePtr & left_type, const DataTypePtr &)
|
||||
{
|
||||
Shift shift;
|
||||
const DataTypeDecimalBase<T> * decimal0 = checkDecimalBase<T>(*left_type);
|
||||
@ -148,8 +148,8 @@ private:
|
||||
}
|
||||
|
||||
template <typename T, typename U>
|
||||
static std::enable_if_t<!is_decimal<T> && is_decimal<U>, Shift>
|
||||
getScales(const DataTypePtr &, const DataTypePtr & right_type)
|
||||
requires (!is_decimal<T>) && is_decimal<U>
|
||||
static Shift getScales(const DataTypePtr &, const DataTypePtr & right_type)
|
||||
{
|
||||
Shift shift;
|
||||
const DataTypeDecimalBase<U> * decimal1 = checkDecimalBase<U>(*right_type);
|
||||
|
@ -99,6 +99,12 @@ inline Field getBinaryValue(UInt8 type, ReadBuffer & buf)
|
||||
readBinary(value, buf);
|
||||
return value;
|
||||
}
|
||||
case Field::Types::Object:
|
||||
{
|
||||
Object value;
|
||||
readBinary(value, buf);
|
||||
return value;
|
||||
}
|
||||
case Field::Types::AggregateFunctionState:
|
||||
{
|
||||
AggregateFunctionStateData value;
|
||||
@ -208,6 +214,40 @@ void writeText(const Map & x, WriteBuffer & buf)
|
||||
writeFieldText(Field(x), buf);
|
||||
}
|
||||
|
||||
/// Reads an Object from `buf`: first the number of entries, then for every entry
/// its type tag, its key and the value decoded according to that tag.
/// Each decoded value is stored into `x` under its key.
void readBinary(Object & x, ReadBuffer & buf)
{
    size_t entries_count;
    readBinary(entries_count, buf);

    for (size_t remaining = entries_count; remaining != 0; --remaining)
    {
        UInt8 value_type;
        String name;

        /// The type tag precedes the key in the stream.
        readBinary(value_type, buf);
        readBinary(name, buf);

        x[name] = getBinaryValue(value_type, buf);
    }
}
|
||||
|
||||
void writeBinary(const Object & x, WriteBuffer & buf)
|
||||
{
|
||||
const size_t size = x.size();
|
||||
writeBinary(size, buf);
|
||||
|
||||
for (const auto & [key, value] : x)
|
||||
{
|
||||
const UInt8 type = value.getType();
|
||||
writeBinary(type, buf);
|
||||
writeBinary(key, buf);
|
||||
Field::dispatch([&buf] (const auto & val) { FieldVisitorWriteBinary()(val, buf); }, value);
|
||||
}
|
||||
}
|
||||
|
||||
void writeText(const Object & x, WriteBuffer & buf)
|
||||
{
|
||||
writeFieldText(Field(x), buf);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void readQuoted(DecimalField<T> & x, ReadBuffer & buf)
|
||||
{
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <type_traits>
|
||||
#include <functional>
|
||||
|
||||
@ -49,10 +50,22 @@ DEFINE_FIELD_VECTOR(Array);
|
||||
DEFINE_FIELD_VECTOR(Tuple);
|
||||
|
||||
/// An array with the following structure: [(key1, value1), (key2, value2), ...]
|
||||
DEFINE_FIELD_VECTOR(Map);
|
||||
DEFINE_FIELD_VECTOR(Map); /// TODO: use map instead of vector.
|
||||
|
||||
#undef DEFINE_FIELD_VECTOR
|
||||
|
||||
/// Map from String keys to Field values, ordered by std::less<String>,
/// with nodes allocated through AllocatorWithMemoryTracking.
using FieldMap = std::map<String, Field, std::less<String>, AllocatorWithMemoryTracking<std::pair<const String, Field>>>;
|
||||
|
||||
/// Declares a struct X that derives from FieldMap and inherits all of its constructors.
#define DEFINE_FIELD_MAP(X) \
|
||||
struct X : public FieldMap \
|
||||
{ \
|
||||
using FieldMap::FieldMap; \
|
||||
}
|
||||
|
||||
/// Object is a distinct type built on FieldMap.
DEFINE_FIELD_MAP(Object);
|
||||
|
||||
/// The helper macro is only needed for the declaration above.
#undef DEFINE_FIELD_MAP
|
||||
|
||||
struct AggregateFunctionStateData
|
||||
{
|
||||
String name; /// Name with arguments.
|
||||
@ -219,6 +232,7 @@ template <> struct NearestFieldTypeImpl<String> { using Type = String; };
|
||||
template <> struct NearestFieldTypeImpl<Array> { using Type = Array; };
|
||||
template <> struct NearestFieldTypeImpl<Tuple> { using Type = Tuple; };
|
||||
template <> struct NearestFieldTypeImpl<Map> { using Type = Map; };
|
||||
template <> struct NearestFieldTypeImpl<Object> { using Type = Object; };
|
||||
template <> struct NearestFieldTypeImpl<bool> { using Type = UInt64; };
|
||||
template <> struct NearestFieldTypeImpl<Null> { using Type = Null; };
|
||||
|
||||
@ -283,6 +297,7 @@ public:
|
||||
Map = 26,
|
||||
UUID = 27,
|
||||
Bool = 28,
|
||||
Object = 29,
|
||||
};
|
||||
};
|
||||
|
||||
@ -472,6 +487,7 @@ public:
|
||||
case Types::Array: return get<Array>() < rhs.get<Array>();
|
||||
case Types::Tuple: return get<Tuple>() < rhs.get<Tuple>();
|
||||
case Types::Map: return get<Map>() < rhs.get<Map>();
|
||||
case Types::Object: return get<Object>() < rhs.get<Object>();
|
||||
case Types::Decimal32: return get<DecimalField<Decimal32>>() < rhs.get<DecimalField<Decimal32>>();
|
||||
case Types::Decimal64: return get<DecimalField<Decimal64>>() < rhs.get<DecimalField<Decimal64>>();
|
||||
case Types::Decimal128: return get<DecimalField<Decimal128>>() < rhs.get<DecimalField<Decimal128>>();
|
||||
@ -510,6 +526,7 @@ public:
|
||||
case Types::Array: return get<Array>() <= rhs.get<Array>();
|
||||
case Types::Tuple: return get<Tuple>() <= rhs.get<Tuple>();
|
||||
case Types::Map: return get<Map>() <= rhs.get<Map>();
|
||||
case Types::Object: return get<Object>() <= rhs.get<Object>();
|
||||
case Types::Decimal32: return get<DecimalField<Decimal32>>() <= rhs.get<DecimalField<Decimal32>>();
|
||||
case Types::Decimal64: return get<DecimalField<Decimal64>>() <= rhs.get<DecimalField<Decimal64>>();
|
||||
case Types::Decimal128: return get<DecimalField<Decimal128>>() <= rhs.get<DecimalField<Decimal128>>();
|
||||
@ -548,6 +565,7 @@ public:
|
||||
case Types::Array: return get<Array>() == rhs.get<Array>();
|
||||
case Types::Tuple: return get<Tuple>() == rhs.get<Tuple>();
|
||||
case Types::Map: return get<Map>() == rhs.get<Map>();
|
||||
case Types::Object: return get<Object>() == rhs.get<Object>();
|
||||
case Types::UInt128: return get<UInt128>() == rhs.get<UInt128>();
|
||||
case Types::UInt256: return get<UInt256>() == rhs.get<UInt256>();
|
||||
case Types::Int128: return get<Int128>() == rhs.get<Int128>();
|
||||
@ -597,6 +615,7 @@ public:
|
||||
bool value = bool(field.template get<UInt64>());
|
||||
return f(value);
|
||||
}
|
||||
case Types::Object: return f(field.template get<Object>());
|
||||
case Types::Decimal32: return f(field.template get<DecimalField<Decimal32>>());
|
||||
case Types::Decimal64: return f(field.template get<DecimalField<Decimal64>>());
|
||||
case Types::Decimal128: return f(field.template get<DecimalField<Decimal128>>());
|
||||
@ -713,6 +732,9 @@ private:
|
||||
case Types::Map:
|
||||
destroy<Map>();
|
||||
break;
|
||||
case Types::Object:
|
||||
destroy<Object>();
|
||||
break;
|
||||
case Types::AggregateFunctionState:
|
||||
destroy<AggregateFunctionStateData>();
|
||||
break;
|
||||
@ -737,26 +759,27 @@ private:
|
||||
using Row = std::vector<Field>;
|
||||
|
||||
|
||||
template <> struct Field::TypeToEnum<Null> { static const Types::Which value = Types::Null; };
|
||||
template <> struct Field::TypeToEnum<UInt64> { static const Types::Which value = Types::UInt64; };
|
||||
template <> struct Field::TypeToEnum<UInt128> { static const Types::Which value = Types::UInt128; };
|
||||
template <> struct Field::TypeToEnum<UInt256> { static const Types::Which value = Types::UInt256; };
|
||||
template <> struct Field::TypeToEnum<Int64> { static const Types::Which value = Types::Int64; };
|
||||
template <> struct Field::TypeToEnum<Int128> { static const Types::Which value = Types::Int128; };
|
||||
template <> struct Field::TypeToEnum<Int256> { static const Types::Which value = Types::Int256; };
|
||||
template <> struct Field::TypeToEnum<UUID> { static const Types::Which value = Types::UUID; };
|
||||
template <> struct Field::TypeToEnum<Float64> { static const Types::Which value = Types::Float64; };
|
||||
template <> struct Field::TypeToEnum<String> { static const Types::Which value = Types::String; };
|
||||
template <> struct Field::TypeToEnum<Array> { static const Types::Which value = Types::Array; };
|
||||
template <> struct Field::TypeToEnum<Tuple> { static const Types::Which value = Types::Tuple; };
|
||||
template <> struct Field::TypeToEnum<Map> { static const Types::Which value = Types::Map; };
|
||||
template <> struct Field::TypeToEnum<DecimalField<Decimal32>>{ static const Types::Which value = Types::Decimal32; };
|
||||
template <> struct Field::TypeToEnum<DecimalField<Decimal64>>{ static const Types::Which value = Types::Decimal64; };
|
||||
template <> struct Field::TypeToEnum<DecimalField<Decimal128>>{ static const Types::Which value = Types::Decimal128; };
|
||||
template <> struct Field::TypeToEnum<DecimalField<Decimal256>>{ static const Types::Which value = Types::Decimal256; };
|
||||
template <> struct Field::TypeToEnum<DecimalField<DateTime64>>{ static const Types::Which value = Types::Decimal64; };
|
||||
template <> struct Field::TypeToEnum<AggregateFunctionStateData>{ static const Types::Which value = Types::AggregateFunctionState; };
|
||||
template <> struct Field::TypeToEnum<bool>{ static const Types::Which value = Types::Bool; };
|
||||
template <> struct Field::TypeToEnum<Null> { static constexpr Types::Which value = Types::Null; };
|
||||
template <> struct Field::TypeToEnum<UInt64> { static constexpr Types::Which value = Types::UInt64; };
|
||||
template <> struct Field::TypeToEnum<UInt128> { static constexpr Types::Which value = Types::UInt128; };
|
||||
template <> struct Field::TypeToEnum<UInt256> { static constexpr Types::Which value = Types::UInt256; };
|
||||
template <> struct Field::TypeToEnum<Int64> { static constexpr Types::Which value = Types::Int64; };
|
||||
template <> struct Field::TypeToEnum<Int128> { static constexpr Types::Which value = Types::Int128; };
|
||||
template <> struct Field::TypeToEnum<Int256> { static constexpr Types::Which value = Types::Int256; };
|
||||
template <> struct Field::TypeToEnum<UUID> { static constexpr Types::Which value = Types::UUID; };
|
||||
template <> struct Field::TypeToEnum<Float64> { static constexpr Types::Which value = Types::Float64; };
|
||||
template <> struct Field::TypeToEnum<String> { static constexpr Types::Which value = Types::String; };
|
||||
template <> struct Field::TypeToEnum<Array> { static constexpr Types::Which value = Types::Array; };
|
||||
template <> struct Field::TypeToEnum<Tuple> { static constexpr Types::Which value = Types::Tuple; };
|
||||
template <> struct Field::TypeToEnum<Map> { static constexpr Types::Which value = Types::Map; };
|
||||
template <> struct Field::TypeToEnum<Object> { static constexpr Types::Which value = Types::Object; };
|
||||
template <> struct Field::TypeToEnum<DecimalField<Decimal32>>{ static constexpr Types::Which value = Types::Decimal32; };
|
||||
template <> struct Field::TypeToEnum<DecimalField<Decimal64>>{ static constexpr Types::Which value = Types::Decimal64; };
|
||||
template <> struct Field::TypeToEnum<DecimalField<Decimal128>>{ static constexpr Types::Which value = Types::Decimal128; };
|
||||
template <> struct Field::TypeToEnum<DecimalField<Decimal256>>{ static constexpr Types::Which value = Types::Decimal256; };
|
||||
template <> struct Field::TypeToEnum<DecimalField<DateTime64>>{ static constexpr Types::Which value = Types::Decimal64; };
|
||||
template <> struct Field::TypeToEnum<AggregateFunctionStateData>{ static constexpr Types::Which value = Types::AggregateFunctionState; };
|
||||
template <> struct Field::TypeToEnum<bool>{ static constexpr Types::Which value = Types::Bool; };
|
||||
|
||||
template <> struct Field::EnumToType<Field::Types::Null> { using Type = Null; };
|
||||
template <> struct Field::EnumToType<Field::Types::UInt64> { using Type = UInt64; };
|
||||
@ -771,6 +794,7 @@ template <> struct Field::EnumToType<Field::Types::String> { using Type = Strin
|
||||
template <> struct Field::EnumToType<Field::Types::Array> { using Type = Array; };
|
||||
template <> struct Field::EnumToType<Field::Types::Tuple> { using Type = Tuple; };
|
||||
template <> struct Field::EnumToType<Field::Types::Map> { using Type = Map; };
|
||||
template <> struct Field::EnumToType<Field::Types::Object> { using Type = Object; };
|
||||
template <> struct Field::EnumToType<Field::Types::Decimal32> { using Type = DecimalField<Decimal32>; };
|
||||
template <> struct Field::EnumToType<Field::Types::Decimal64> { using Type = DecimalField<Decimal64>; };
|
||||
template <> struct Field::EnumToType<Field::Types::Decimal128> { using Type = DecimalField<Decimal128>; };
|
||||
@ -931,34 +955,39 @@ class WriteBuffer;
|
||||
|
||||
/// It is assumed that all elements of the array have the same type.
|
||||
void readBinary(Array & x, ReadBuffer & buf);
|
||||
|
||||
[[noreturn]] inline void readText(Array &, ReadBuffer &) { throw Exception("Cannot read Array.", ErrorCodes::NOT_IMPLEMENTED); }
|
||||
[[noreturn]] inline void readQuoted(Array &, ReadBuffer &) { throw Exception("Cannot read Array.", ErrorCodes::NOT_IMPLEMENTED); }
|
||||
|
||||
/// It is assumed that all elements of the array have the same type.
|
||||
/// Also writes the size and type into buf. UInt64 and Int64 are written in variable-size form.
|
||||
void writeBinary(const Array & x, WriteBuffer & buf);
|
||||
|
||||
void writeText(const Array & x, WriteBuffer & buf);
|
||||
|
||||
[[noreturn]] inline void writeQuoted(const Array &, WriteBuffer &) { throw Exception("Cannot write Array quoted.", ErrorCodes::NOT_IMPLEMENTED); }
|
||||
|
||||
void readBinary(Tuple & x, ReadBuffer & buf);
|
||||
|
||||
[[noreturn]] inline void readText(Tuple &, ReadBuffer &) { throw Exception("Cannot read Tuple.", ErrorCodes::NOT_IMPLEMENTED); }
|
||||
[[noreturn]] inline void readQuoted(Tuple &, ReadBuffer &) { throw Exception("Cannot read Tuple.", ErrorCodes::NOT_IMPLEMENTED); }
|
||||
|
||||
void writeBinary(const Tuple & x, WriteBuffer & buf);
|
||||
|
||||
void writeText(const Tuple & x, WriteBuffer & buf);
|
||||
[[noreturn]] inline void writeQuoted(const Tuple &, WriteBuffer &) { throw Exception("Cannot write Tuple quoted.", ErrorCodes::NOT_IMPLEMENTED); }
|
||||
|
||||
void readBinary(Map & x, ReadBuffer & buf);
|
||||
[[noreturn]] inline void readText(Map &, ReadBuffer &) { throw Exception("Cannot read Map.", ErrorCodes::NOT_IMPLEMENTED); }
|
||||
[[noreturn]] inline void readQuoted(Map &, ReadBuffer &) { throw Exception("Cannot read Map.", ErrorCodes::NOT_IMPLEMENTED); }
|
||||
|
||||
void writeBinary(const Map & x, WriteBuffer & buf);
|
||||
void writeText(const Map & x, WriteBuffer & buf);
|
||||
[[noreturn]] inline void writeQuoted(const Map &, WriteBuffer &) { throw Exception("Cannot write Map quoted.", ErrorCodes::NOT_IMPLEMENTED); }
|
||||
|
||||
void readBinary(Object & x, ReadBuffer & buf);
|
||||
[[noreturn]] inline void readText(Object &, ReadBuffer &) { throw Exception("Cannot read Object.", ErrorCodes::NOT_IMPLEMENTED); }
|
||||
[[noreturn]] inline void readQuoted(Object &, ReadBuffer &) { throw Exception("Cannot read Object.", ErrorCodes::NOT_IMPLEMENTED); }
|
||||
|
||||
void writeBinary(const Object & x, WriteBuffer & buf);
|
||||
void writeText(const Object & x, WriteBuffer & buf);
|
||||
[[noreturn]] inline void writeQuoted(const Object &, WriteBuffer &) { throw Exception("Cannot write Object quoted.", ErrorCodes::NOT_IMPLEMENTED); }
|
||||
|
||||
__attribute__ ((noreturn)) inline void writeText(const AggregateFunctionStateData &, WriteBuffer &)
|
||||
{
|
||||
// This probably doesn't make any sense, but we have to have it for
|
||||
@ -977,8 +1006,6 @@ void readQuoted(DecimalField<T> & x, ReadBuffer & buf);
|
||||
|
||||
void writeFieldText(const Field & x, WriteBuffer & buf);
|
||||
|
||||
[[noreturn]] inline void writeQuoted(const Tuple &, WriteBuffer &) { throw Exception("Cannot write Tuple quoted.", ErrorCodes::NOT_IMPLEMENTED); }
|
||||
|
||||
String toString(const Field & x);
|
||||
|
||||
}
|
||||
|
@ -53,7 +53,8 @@ struct MultiEnum
|
||||
return bitset;
|
||||
}
|
||||
|
||||
template <typename ValueType, typename = std::enable_if_t<std::is_convertible_v<ValueType, StorageType>>>
|
||||
template <typename ValueType>
|
||||
requires std::is_convertible_v<ValueType, StorageType>
|
||||
void setValue(ValueType new_value)
|
||||
{
|
||||
// Can't set value from any enum, to avoid confusion
|
||||
@ -66,7 +67,8 @@ struct MultiEnum
|
||||
return bitset == other.bitset;
|
||||
}
|
||||
|
||||
template <typename ValueType, typename = std::enable_if_t<std::is_convertible_v<ValueType, StorageType>>>
|
||||
template <typename ValueType>
|
||||
requires std::is_convertible_v<ValueType, StorageType>
|
||||
bool operator==(ValueType other) const
|
||||
{
|
||||
// Shouldn't be comparable with any enum to avoid confusion
|
||||
@ -80,13 +82,15 @@ struct MultiEnum
|
||||
return !(*this == other);
|
||||
}
|
||||
|
||||
template <typename ValueType, typename = std::enable_if_t<std::is_convertible_v<ValueType, StorageType>>>
|
||||
template <typename ValueType>
|
||||
requires std::is_convertible_v<ValueType, StorageType>
|
||||
friend bool operator==(ValueType left, MultiEnum right)
|
||||
{
|
||||
return right.operator==(left);
|
||||
}
|
||||
|
||||
template <typename L, typename = typename std::enable_if<!std::is_same_v<L, MultiEnum>>::type>
|
||||
template <typename L>
|
||||
requires (!std::is_same_v<L, MultiEnum>)
|
||||
friend bool operator!=(L left, MultiEnum right)
|
||||
{
|
||||
return !(right.operator==(left));
|
||||
|
@ -473,6 +473,7 @@ class IColumn;
|
||||
M(Bool, allow_experimental_geo_types, false, "Allow geo data types such as Point, Ring, Polygon, MultiPolygon", 0) \
|
||||
M(Bool, data_type_default_nullable, false, "Data types without NULL or NOT NULL will make Nullable", 0) \
|
||||
M(Bool, cast_keep_nullable, false, "CAST operator keep Nullable for result data type", 0) \
|
||||
M(Bool, cast_ipv4_ipv6_default_on_conversion_error, false, "CAST operator into IPv4, CAST operator into IPV6 type, toIPv4, toIPv6 functions will return default value instead of throwing exception on conversion error.", 0) \
|
||||
M(Bool, alter_partition_verbose_result, false, "Output information about affected parts. Currently works only for FREEZE and ATTACH commands.", 0) \
|
||||
M(Bool, allow_experimental_database_materialized_mysql, false, "Allow to create database with Engine=MaterializedMySQL(...).", 0) \
|
||||
M(Bool, allow_experimental_database_materialized_postgresql, false, "Allow to create database with Engine=MaterializedPostgreSQL(...).", 0) \
|
||||
@ -492,6 +493,7 @@ class IColumn;
|
||||
M(Bool, force_optimize_projection, false, "If projection optimization is enabled, SELECT queries need to use projection", 0) \
|
||||
M(Bool, async_socket_for_remote, true, "Asynchronously read from socket executing remote query", 0) \
|
||||
M(Bool, insert_null_as_default, true, "Insert DEFAULT values instead of NULL in INSERT SELECT (UNION ALL)", 0) \
|
||||
M(Bool, describe_extend_object_types, false, "Deduce concrete type of columns of type Object in DESCRIBE query", 0) \
|
||||
M(Bool, describe_include_subcolumns, false, "If true, subcolumns of all table columns will be included into result of DESCRIBE query", 0) \
|
||||
\
|
||||
M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \
|
||||
@ -508,6 +510,7 @@ class IColumn;
|
||||
M(UInt64, database_replicated_initial_query_timeout_sec, 300, "How long initial DDL query should wait for Replicated database to precess previous DDL queue entries", 0) \
|
||||
M(UInt64, max_distributed_depth, 5, "Maximum distributed query depth", 0) \
|
||||
M(Bool, database_replicated_always_detach_permanently, false, "Execute DETACH TABLE as DETACH TABLE PERMANENTLY if database engine is Replicated", 0) \
|
||||
M(Bool, database_replicated_allow_only_replicated_engine, false, "Allow to create only Replicated tables in database with engine Replicated", 0) \
|
||||
M(DistributedDDLOutputMode, distributed_ddl_output_mode, DistributedDDLOutputMode::THROW, "Format of distributed DDL query result", 0) \
|
||||
M(UInt64, distributed_ddl_entry_format_version, 1, "Version of DDL entry to write into ZooKeeper", 0) \
|
||||
\
|
||||
@ -567,6 +570,7 @@ class IColumn;
|
||||
/** Experimental functions */ \
|
||||
M(Bool, allow_experimental_funnel_functions, false, "Enable experimental functions for funnel analysis.", 0) \
|
||||
M(Bool, allow_experimental_nlp_functions, false, "Enable experimental functions for natural language processing.", 0) \
|
||||
M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \
|
||||
M(String, insert_deduplication_token, "", "If not empty, used for duplicate detection instead of data digest", 0) \
|
||||
// End of COMMON_SETTINGS
|
||||
// Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS.
|
||||
|
@ -87,6 +87,7 @@ enum class TypeIndex
|
||||
AggregateFunction,
|
||||
LowCardinality,
|
||||
Map,
|
||||
Object,
|
||||
};
|
||||
#if !defined(__clang__)
|
||||
#pragma GCC diagnostic pop
|
||||
|
@ -15,6 +15,8 @@
|
||||
#cmakedefine01 USE_NURAFT
|
||||
#cmakedefine01 USE_NLP
|
||||
#cmakedefine01 USE_KRB5
|
||||
#cmakedefine01 USE_SIMDJSON
|
||||
#cmakedefine01 USE_RAPIDJSON
|
||||
#cmakedefine01 USE_FILELOG
|
||||
#cmakedefine01 USE_ODBC
|
||||
#cmakedefine01 USE_REPLXX
|
||||
|
@ -7,7 +7,8 @@ namespace DB
|
||||
// Use template to disable implicit casting for certain overloaded types such as Field, which leads
|
||||
// to overload resolution ambiguity.
|
||||
class Field;
|
||||
template <typename T, typename U = std::enable_if_t<std::is_same_v<T, Field>>>
|
||||
template <typename T>
|
||||
requires std::is_same_v<T, Field>
|
||||
std::ostream & operator<<(std::ostream & stream, const T & what);
|
||||
|
||||
struct NameAndTypePair;
|
||||
|
@ -1,3 +1,5 @@
|
||||
add_subdirectory (Serializations)
|
||||
|
||||
if (ENABLE_EXAMPLES)
|
||||
add_subdirectory (examples)
|
||||
endif ()
|
||||
|
@ -213,6 +213,7 @@ DataTypeFactory::DataTypeFactory()
|
||||
registerDataTypeDomainSimpleAggregateFunction(*this);
|
||||
registerDataTypeDomainGeo(*this);
|
||||
registerDataTypeMap(*this);
|
||||
registerDataTypeObject(*this);
|
||||
}
|
||||
|
||||
DataTypeFactory & DataTypeFactory::instance()
|
||||
|
@ -87,5 +87,6 @@ void registerDataTypeDomainIPv4AndIPv6(DataTypeFactory & factory);
|
||||
void registerDataTypeDomainBool(DataTypeFactory & factory);
|
||||
void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory);
|
||||
void registerDataTypeDomainGeo(DataTypeFactory & factory);
|
||||
void registerDataTypeObject(DataTypeFactory & factory);
|
||||
|
||||
}
|
||||
|
83
src/DataTypes/DataTypeObject.cpp
Normal file
83
src/DataTypes/DataTypeObject.cpp
Normal file
@ -0,0 +1,83 @@
|
||||
#include <DataTypes/DataTypeObject.h>
|
||||
#include <DataTypes/DataTypeFactory.h>
|
||||
#include <DataTypes/Serializations/SerializationObject.h>
|
||||
|
||||
#include <Parsers/IAST.h>
|
||||
#include <Parsers/ASTLiteral.h>
|
||||
#include <Parsers/ASTFunction.h>
|
||||
#include <IO/Operators.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||
extern const int UNEXPECTED_AST_STRUCTURE;
|
||||
}
|
||||
|
||||
/// Builds an Object type for the given schema format.
/// The format name is stored lower-cased, and the default serialization
/// is looked up by that stored (lower-cased) name.
DataTypeObject::DataTypeObject(const String & schema_format_, bool is_nullable_)
    : schema_format(Poco::toLower(schema_format_))
    , is_nullable(is_nullable_)
    /// Uses the member 'schema_format' (already initialized above), not the raw argument.
    , default_serialization(getObjectSerialization(schema_format))
{
}
|
||||
|
||||
bool DataTypeObject::equals(const IDataType & rhs) const
|
||||
{
|
||||
if (const auto * object = typeid_cast<const DataTypeObject *>(&rhs))
|
||||
return schema_format == object->schema_format && is_nullable == object->is_nullable;
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Returns the serialization object that was created once in the constructor.
SerializationPtr DataTypeObject::doGetDefaultSerialization() const
{
    const SerializationPtr & cached = default_serialization;
    return cached;
}
|
||||
|
||||
/// Renders the full type name: Object('<format>') or Object(Nullable('<format>')).
/// The schema format is written through the 'quote' manipulator.
String DataTypeObject::doGetName() const
{
    const char * prefix = is_nullable ? "Object(Nullable(" : "Object(";
    const char * suffix = is_nullable ? "))" : ")";

    WriteBufferFromOwnString out;
    out << prefix << quote << schema_format << suffix;
    return out.str();
}
|
||||
|
||||
/// Factory callback for the parametric type Object(...).
///
/// Accepted argument forms:
///   Object('<schema_format>')
///   Object(Nullable('<schema_format>'))
///
/// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH if there is not exactly one argument,
/// and UNEXPECTED_AST_STRUCTURE if the argument is neither a string literal
/// nor Nullable(<string literal>).
static DataTypePtr create(const ASTPtr & arguments)
{
    if (!arguments || arguments->children.size() != 1)
        throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
            "Object data type family must have one argument - name of schema format");

    ASTPtr schema_argument = arguments->children[0];
    bool is_nullable = false;

    if (const auto * func = schema_argument->as<ASTFunction>())
    {
        /// 'arguments' is checked for null before it is dereferenced.
        if (func->name != "Nullable" || !func->arguments || func->arguments->children.size() != 1)
            /// The placeholder is required: without it the passed function name
            /// is never substituted into the message.
            throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE,
                "Expected 'Nullable(<schema_name>)' as parameter for type Object (function: {})", func->name);

        /// Unwrap Nullable(...) and remember that subcolumns should be nullable.
        schema_argument = func->arguments->children[0];
        is_nullable = true;
    }

    const auto * literal = schema_argument->as<ASTLiteral>();
    if (!literal || literal->value.getType() != Field::Types::String)
        throw Exception(ErrorCodes::UNEXPECTED_AST_STRUCTURE,
            "Object data type family must have a const string as its schema name parameter");

    return std::make_shared<DataTypeObject>(literal->value.get<const String &>(), is_nullable);
}
|
||||
|
||||
/// Registers the parametric type "Object" and the simple type "JSON"
/// (case-insensitive name), the latter being a non-nullable Object with format "JSON".
void registerDataTypeObject(DataTypeFactory & factory)
{
    factory.registerDataType("Object", create);

    auto make_json_type = [] { return std::make_shared<DataTypeObject>("JSON", false); };
    factory.registerSimpleDataType("JSON", make_json_type, DataTypeFactory::CaseInsensitive);
}
|
||||
|
||||
}
|
46
src/DataTypes/DataTypeObject.h
Normal file
46
src/DataTypes/DataTypeObject.h
Normal file
@ -0,0 +1,46 @@
|
||||
#pragma once
|
||||
|
||||
#include <DataTypes/IDataType.h>
|
||||
#include <Core/Field.h>
|
||||
#include <Columns/ColumnObject.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
class DataTypeObject : public IDataType
|
||||
{
|
||||
private:
|
||||
String schema_format;
|
||||
bool is_nullable;
|
||||
SerializationPtr default_serialization;
|
||||
|
||||
public:
|
||||
DataTypeObject(const String & schema_format_, bool is_nullable_);
|
||||
|
||||
const char * getFamilyName() const override { return "Object"; }
|
||||
String doGetName() const override;
|
||||
TypeIndex getTypeId() const override { return TypeIndex::Object; }
|
||||
|
||||
MutableColumnPtr createColumn() const override { return ColumnObject::create(is_nullable); }
|
||||
|
||||
Field getDefault() const override
|
||||
{
|
||||
throw Exception("Method getDefault() is not implemented for data type " + getName(), ErrorCodes::NOT_IMPLEMENTED);
|
||||
}
|
||||
|
||||
bool haveSubtypes() const override { return false; }
|
||||
bool equals(const IDataType & rhs) const override;
|
||||
bool isParametric() const override { return true; }
|
||||
|
||||
SerializationPtr doGetDefaultSerialization() const override;
|
||||
|
||||
bool hasNullableSubcolumns() const { return is_nullable; }
|
||||
};
|
||||
|
||||
}
|
@ -1,6 +1,7 @@
|
||||
#include <DataTypes/FieldToDataType.h>
|
||||
#include <DataTypes/DataTypeTuple.h>
|
||||
#include <DataTypes/DataTypeMap.h>
|
||||
#include <DataTypes/DataTypeObject.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <DataTypes/DataTypesDecimal.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
@ -108,12 +109,11 @@ DataTypePtr FieldToDataType::operator() (const Array & x) const
|
||||
element_types.reserve(x.size());
|
||||
|
||||
for (const Field & elem : x)
|
||||
element_types.emplace_back(applyVisitor(FieldToDataType(), elem));
|
||||
element_types.emplace_back(applyVisitor(FieldToDataType(allow_convertion_to_string), elem));
|
||||
|
||||
return std::make_shared<DataTypeArray>(getLeastSupertype(element_types));
|
||||
return std::make_shared<DataTypeArray>(getLeastSupertype(element_types, allow_convertion_to_string));
|
||||
}
|
||||
|
||||
|
||||
DataTypePtr FieldToDataType::operator() (const Tuple & tuple) const
|
||||
{
|
||||
if (tuple.empty())
|
||||
@ -123,7 +123,7 @@ DataTypePtr FieldToDataType::operator() (const Tuple & tuple) const
|
||||
element_types.reserve(tuple.size());
|
||||
|
||||
for (const auto & element : tuple)
|
||||
element_types.push_back(applyVisitor(FieldToDataType(), element));
|
||||
element_types.push_back(applyVisitor(FieldToDataType(allow_convertion_to_string), element));
|
||||
|
||||
return std::make_shared<DataTypeTuple>(element_types);
|
||||
}
|
||||
@ -139,11 +139,19 @@ DataTypePtr FieldToDataType::operator() (const Map & map) const
|
||||
{
|
||||
const auto & tuple = elem.safeGet<const Tuple &>();
|
||||
assert(tuple.size() == 2);
|
||||
key_types.push_back(applyVisitor(FieldToDataType(), tuple[0]));
|
||||
value_types.push_back(applyVisitor(FieldToDataType(), tuple[1]));
|
||||
key_types.push_back(applyVisitor(FieldToDataType(allow_convertion_to_string), tuple[0]));
|
||||
value_types.push_back(applyVisitor(FieldToDataType(allow_convertion_to_string), tuple[1]));
|
||||
}
|
||||
|
||||
return std::make_shared<DataTypeMap>(getLeastSupertype(key_types), getLeastSupertype(value_types));
|
||||
return std::make_shared<DataTypeMap>(
|
||||
getLeastSupertype(key_types, allow_convertion_to_string),
|
||||
getLeastSupertype(value_types, allow_convertion_to_string));
|
||||
}
|
||||
|
||||
/// An Object field is always mapped to a non-nullable Object type with the "json" format.
DataTypePtr FieldToDataType::operator() (const Object &) const
{
    /// TODO: Do we need different parameters for type Object?
    const bool is_nullable = false;
    return std::make_shared<DataTypeObject>("json", is_nullable);
}
|
||||
|
||||
DataTypePtr FieldToDataType::operator() (const AggregateFunctionStateData & x) const
|
||||
|
@ -20,26 +20,34 @@ using DataTypePtr = std::shared_ptr<const IDataType>;
|
||||
class FieldToDataType : public StaticVisitor<DataTypePtr>
|
||||
{
|
||||
public:
|
||||
FieldToDataType(bool allow_convertion_to_string_ = false)
|
||||
: allow_convertion_to_string(allow_convertion_to_string_)
|
||||
{
|
||||
}
|
||||
|
||||
DataTypePtr operator() (const Null & x) const;
|
||||
DataTypePtr operator() (const UInt64 & x) const;
|
||||
DataTypePtr operator() (const UInt128 & x) const;
|
||||
DataTypePtr operator() (const UInt256 & x) const;
|
||||
DataTypePtr operator() (const Int64 & x) const;
|
||||
DataTypePtr operator() (const Int128 & x) const;
|
||||
DataTypePtr operator() (const Int256 & x) const;
|
||||
DataTypePtr operator() (const UUID & x) const;
|
||||
DataTypePtr operator() (const Float64 & x) const;
|
||||
DataTypePtr operator() (const String & x) const;
|
||||
DataTypePtr operator() (const Array & x) const;
|
||||
DataTypePtr operator() (const Tuple & tuple) const;
|
||||
DataTypePtr operator() (const Map & map) const;
|
||||
DataTypePtr operator() (const Object & map) const;
|
||||
DataTypePtr operator() (const DecimalField<Decimal32> & x) const;
|
||||
DataTypePtr operator() (const DecimalField<Decimal64> & x) const;
|
||||
DataTypePtr operator() (const DecimalField<Decimal128> & x) const;
|
||||
DataTypePtr operator() (const DecimalField<Decimal256> & x) const;
|
||||
DataTypePtr operator() (const AggregateFunctionStateData & x) const;
|
||||
DataTypePtr operator() (const UInt256 & x) const;
|
||||
DataTypePtr operator() (const Int256 & x) const;
|
||||
DataTypePtr operator() (const bool & x) const;
|
||||
|
||||
private:
|
||||
bool allow_convertion_to_string;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
|
@ -126,19 +126,25 @@ DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const
|
||||
DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const
|
||||
{
|
||||
SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr };
|
||||
return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type);
|
||||
return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type, true);
|
||||
}
|
||||
|
||||
SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const
|
||||
ColumnPtr IDataType::tryGetSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const
|
||||
{
|
||||
SubstreamData data = { serialization, nullptr, nullptr, nullptr };
|
||||
return getForSubcolumn<SerializationPtr>(subcolumn_name, data, &SubstreamData::serialization);
|
||||
SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr };
|
||||
return getForSubcolumn<ColumnPtr>(subcolumn_name, data, &SubstreamData::column, false);
|
||||
}
|
||||
|
||||
ColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const
|
||||
{
|
||||
SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr };
|
||||
return getForSubcolumn<ColumnPtr>(subcolumn_name, data, &SubstreamData::column);
|
||||
return getForSubcolumn<ColumnPtr>(subcolumn_name, data, &SubstreamData::column, true);
|
||||
}
|
||||
|
||||
SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const
|
||||
{
|
||||
SubstreamData data = { serialization, nullptr, nullptr, nullptr };
|
||||
return getForSubcolumn<SerializationPtr>(subcolumn_name, data, &SubstreamData::serialization, true);
|
||||
}
|
||||
|
||||
Names IDataType::getSubcolumnNames() const
|
||||
|
@ -82,9 +82,11 @@ public:
|
||||
DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const;
|
||||
DataTypePtr getSubcolumnType(const String & subcolumn_name) const;
|
||||
|
||||
SerializationPtr getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const;
|
||||
ColumnPtr tryGetSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const;
|
||||
ColumnPtr getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const;
|
||||
|
||||
SerializationPtr getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const;
|
||||
|
||||
using SubstreamData = ISerialization::SubstreamData;
|
||||
using SubstreamPath = ISerialization::SubstreamPath;
|
||||
|
||||
@ -309,7 +311,7 @@ private:
|
||||
const String & subcolumn_name,
|
||||
const SubstreamData & data,
|
||||
Ptr SubstreamData::*member,
|
||||
bool throw_if_null = true) const;
|
||||
bool throw_if_null) const;
|
||||
};
|
||||
|
||||
|
||||
@ -373,11 +375,13 @@ struct WhichDataType
|
||||
constexpr bool isMap() const {return idx == TypeIndex::Map; }
|
||||
constexpr bool isSet() const { return idx == TypeIndex::Set; }
|
||||
constexpr bool isInterval() const { return idx == TypeIndex::Interval; }
|
||||
constexpr bool isObject() const { return idx == TypeIndex::Object; }
|
||||
|
||||
constexpr bool isNothing() const { return idx == TypeIndex::Nothing; }
|
||||
constexpr bool isNullable() const { return idx == TypeIndex::Nullable; }
|
||||
constexpr bool isFunction() const { return idx == TypeIndex::Function; }
|
||||
constexpr bool isAggregateFunction() const { return idx == TypeIndex::AggregateFunction; }
|
||||
constexpr bool isSimple() const { return isInt() || isUInt() || isFloat() || isString(); }
|
||||
|
||||
constexpr bool isLowCarnality() const { return idx == TypeIndex::LowCardinality; }
|
||||
};
|
||||
@ -403,6 +407,12 @@ inline bool isMap(const DataTypePtr & data_type) { return WhichDataType(data_typ
|
||||
inline bool isNothing(const DataTypePtr & data_type) { return WhichDataType(data_type).isNothing(); }
|
||||
inline bool isUUID(const DataTypePtr & data_type) { return WhichDataType(data_type).isUUID(); }
|
||||
|
||||
template <typename T>
|
||||
inline bool isObject(const T & data_type)
|
||||
{
|
||||
return WhichDataType(data_type).isObject();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline bool isUInt8(const T & data_type)
|
||||
{
|
||||
|
@ -30,6 +30,12 @@ namespace Nested
|
||||
|
||||
/// Joins a nested table name and a nested field name with a dot.
/// If either part is empty, the other part is returned as is and no dot is added.
std::string concatenateName(const std::string & nested_table_name, const std::string & nested_field_name)
{
    const bool has_table = !nested_table_name.empty();
    const bool has_field = !nested_field_name.empty();

    if (has_table && has_field)
        return nested_table_name + "." + nested_field_name;

    return has_table ? nested_table_name : nested_field_name;
}
|
||||
|
||||
|
703
src/DataTypes/ObjectUtils.cpp
Normal file
703
src/DataTypes/ObjectUtils.cpp
Normal file
@ -0,0 +1,703 @@
|
||||
#include <DataTypes/ObjectUtils.h>
|
||||
#include <DataTypes/DataTypeObject.h>
|
||||
#include <DataTypes/DataTypeNothing.h>
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <DataTypes/DataTypeNested.h>
|
||||
#include <DataTypes/DataTypeFactory.h>
|
||||
#include <DataTypes/getLeastSupertype.h>
|
||||
#include <DataTypes/NestedUtils.h>
|
||||
#include <Columns/ColumnObject.h>
|
||||
#include <Columns/ColumnTuple.h>
|
||||
#include <Columns/ColumnArray.h>
|
||||
#include <Columns/ColumnNullable.h>
|
||||
#include <Parsers/ASTSelectQuery.h>
|
||||
#include <Parsers/ASTExpressionList.h>
|
||||
#include <Parsers/ASTLiteral.h>
|
||||
#include <Parsers/ASTFunction.h>
|
||||
#include <IO/Operators.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int TYPE_MISMATCH;
|
||||
extern const int LOGICAL_ERROR;
|
||||
extern const int DUPLICATE_COLUMN;
|
||||
}
|
||||
|
||||
size_t getNumberOfDimensions(const IDataType & type)
|
||||
{
|
||||
if (const auto * type_array = typeid_cast<const DataTypeArray *>(&type))
|
||||
return type_array->getNumberOfDimensions();
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t getNumberOfDimensions(const IColumn & column)
|
||||
{
|
||||
if (const auto * column_array = checkAndGetColumn<ColumnArray>(column))
|
||||
return column_array->getNumberOfDimensions();
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Strips all Array wrappers and returns the innermost nested type.
/// If the given type is not an Array, it is returned unchanged.
DataTypePtr getBaseTypeOfArray(const DataTypePtr & type)
{
    /// Walk with raw pointers so that no shared pointer is copied per nesting level.
    const DataTypeArray * innermost_array = nullptr;
    const IDataType * cursor = type.get();

    while (true)
    {
        const auto * as_array = typeid_cast<const DataTypeArray *>(cursor);
        if (!as_array)
            break;

        cursor = as_array->getNestedType().get();
        innermost_array = as_array;
    }

    if (!innermost_array)
        return type;

    return innermost_array->getNestedType();
}
|
||||
|
||||
/// Strips all ColumnArray wrappers and returns the innermost data column.
/// If the given column is not a ColumnArray, it is returned unchanged.
ColumnPtr getBaseColumnOfArray(const ColumnPtr & column)
{
    /// Walk with raw pointers so that no column pointer is copied per nesting level.
    const ColumnArray * innermost_array = nullptr;
    const IColumn * cursor = column.get();

    while (true)
    {
        const auto * as_array = checkAndGetColumn<ColumnArray>(cursor);
        if (!as_array)
            break;

        cursor = &as_array->getData();
        innermost_array = as_array;
    }

    if (!innermost_array)
        return column;

    return innermost_array->getDataPtr();
}
|
||||
|
||||
/// Wraps the type into Array 'num_dimensions' times (0 returns the type unchanged).
DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions)
{
    size_t remaining = num_dimensions;
    while (remaining > 0)
    {
        type = std::make_shared<DataTypeArray>(std::move(type));
        --remaining;
    }

    return type;
}
|
||||
|
||||
/// Wraps the column into ColumnArray 'num_dimensions' times (0 returns the column unchanged).
ColumnPtr createArrayOfColumn(ColumnPtr column, size_t num_dimensions)
{
    size_t remaining = num_dimensions;
    while (remaining > 0)
    {
        column = ColumnArray::create(column);
        --remaining;
    }

    return column;
}
|
||||
|
||||
/// Creates a nested Array field with the given number of dimensions.
/// For one dimension the result is an empty Array; each extra dimension adds one
/// level of nesting, i.e. the previous level holds a single empty Array element.
/// Throws LOGICAL_ERROR if 'num_dimensions' is 0.
Array createEmptyArrayField(size_t num_dimensions)
{
    if (num_dimensions == 0)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create array field with 0 dimensions");

    Array array;
    Array * current_array = &array;
    for (size_t i = 1; i < num_dimensions; ++i)
    {
        current_array->push_back(Array());
        /// Descend into the element that was just appended.
        current_array = &current_array->back().get<Array &>();
    }

    return array;
}
|
||||
|
||||
/// Deduces a data type from the column itself.
/// Handled cases: columns whose type index is "simple" according to WhichDataType::isSimple()
/// (resolved through the factory by the enum name), arrays and nullable columns (both recursively).
/// Any other column results in LOGICAL_ERROR.
DataTypePtr getDataTypeByColumn(const IColumn & column)
{
    const auto type_index = column.getDataType();
    if (WhichDataType(type_index).isSimple())
    {
        const String type_name(magic_enum::enum_name(type_index));
        return DataTypeFactory::instance().get(type_name);
    }

    if (const auto * as_array = checkAndGetColumn<ColumnArray>(&column))
    {
        auto nested_type = getDataTypeByColumn(as_array->getData());
        return std::make_shared<DataTypeArray>(nested_type);
    }

    if (const auto * as_nullable = checkAndGetColumn<ColumnNullable>(&column))
    {
        auto nested_type = getDataTypeByColumn(as_nullable->getNestedColumn());
        return makeNullable(nested_type);
    }

    /// TODO: add more types.
    throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get data type of column {}", column.getFamilyName());
}
|
||||
|
||||
/// Projects a vector of tuples onto its I-th component:
/// returns a vector that holds a copy of std::get<I>() of every element, in the same order.
template <size_t I, typename Tuple>
static auto extractVector(const std::vector<Tuple> & vec)
{
    static_assert(I < std::tuple_size_v<Tuple>);
    using Element = std::tuple_element_t<I, Tuple>;

    std::vector<Element> res;
    res.reserve(vec.size());

    for (size_t pos = 0; pos < vec.size(); ++pos)
        res.emplace_back(std::get<I>(vec[pos]));

    return res;
}
|
||||
|
||||
/// Replaces every Object column from 'columns_list' with a Tuple column, both in the list and in the block.
/// Non-Object entries of the list are left untouched.
///
/// Throws TYPE_MISMATCH if the block holds a non-Object column under the same name,
/// and LOGICAL_ERROR if the object column is not finalized or the column is missing
/// in 'extended_storage_columns'.
void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, const NamesAndTypesList & extended_storage_columns)
{
    std::unordered_map<String, DataTypePtr> storage_columns_map;
    for (const auto & storage_column : extended_storage_columns)
        storage_columns_map[storage_column.name] = storage_column.type;

    for (auto & name_type : columns_list)
    {
        if (!isObject(name_type.type))
            continue;

        auto & column = block.getByName(name_type.name);
        if (!isObject(column.type))
            throw Exception(ErrorCodes::TYPE_MISMATCH,
                "Type for column '{}' mismatch in columns list and in block. In list: {}, in block: {}",
                name_type.name, name_type.type->getName(), column.type->getName());

        const auto & column_object = assert_cast<const ColumnObject &>(*column.column);
        const auto & subcolumns = column_object.getSubcolumns();

        if (!column_object.isFinalized())
            throw Exception(ErrorCodes::LOGICAL_ERROR,
                "Cannot convert to tuple column '{}' from type {}. Column should be finalized first",
                name_type.name, name_type.type->getName());

        /// Collect path, type and data of every subcolumn: these become the tuple elements.
        PathsInData tuple_paths;
        DataTypes tuple_types;
        Columns tuple_columns;

        for (const auto & entry : subcolumns)
        {
            const auto & subcolumn_data = entry->data;
            tuple_paths.emplace_back(entry->path);
            tuple_types.emplace_back(subcolumn_data.getLeastCommonType());
            tuple_columns.emplace_back(subcolumn_data.getFinalizedColumnPtr());
        }

        const auto storage_it = storage_columns_map.find(name_type.name);
        if (storage_it == storage_columns_map.end())
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' not found in storage", name_type.name);

        std::tie(column.column, column.type) = unflattenTuple(tuple_paths, tuple_types, tuple_columns);
        name_type.type = column.type;

        /// Check that constructed Tuple type and type in storage are compatible.
        /// The result is not needed, the call is made only for its checks.
        getLeastCommonTypeForObject({column.type, storage_it->second}, true);
    }
}
|
||||
|
||||
/// True if the keys of 'prefix' are equal to the keys of the first prefix.size() parts of 'parts'.
/// An empty prefix matches anything; equal sequences are prefixes of each other.
static bool isPrefix(const PathInData::Parts & prefix, const PathInData::Parts & parts)
{
    const size_t prefix_size = prefix.size();
    if (prefix_size > parts.size())
        return false;

    size_t matched = 0;
    while (matched < prefix_size && !(prefix[matched].key != parts[matched].key))
        ++matched;

    return matched == prefix_size;
}
|
||||
|
||||
void checkObjectHasNoAmbiguosPaths(const PathsInData & paths)
|
||||
{
|
||||
size_t size = paths.size();
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
for (size_t j = 0; j < i; ++j)
|
||||
{
|
||||
if (isPrefix(paths[i].getParts(), paths[j].getParts())
|
||||
|| isPrefix(paths[j].getParts(), paths[i].getParts()))
|
||||
throw Exception(ErrorCodes::DUPLICATE_COLUMN,
|
||||
"Data in Object has ambiguous paths: '{}' and '{}'",
|
||||
paths[i].getPath(), paths[j].getPath());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Merges several Tuple types into one Tuple type.
///
/// Returns nullptr for an empty list and the first type if all types are equal.
/// Otherwise every type must be a Tuple (LOGICAL_ERROR if not): tuples are flattened to paths,
/// a least supertype (conversion to String allowed) is taken for every path,
/// and the result is unflattened back into a Tuple.
/// Throws TYPE_MISMATCH if the types of one path differ in the number of array dimensions.
/// If 'check_ambiguos_paths' is set, the resulting paths are also checked for ambiguity.
DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool check_ambiguos_paths)
{
    if (types.empty())
        return nullptr;

    /// Fast path: all types are the same, nothing to merge.
    bool all_equal = true;
    for (size_t i = 1; all_equal && i < types.size(); ++i)
        all_equal = types[i]->equals(*types[0]);

    if (all_equal)
        return types[0];

    /// Types of subcolumns by path from all tuples.
    std::unordered_map<PathInData, DataTypes, PathInData::Hash> subcolumns_types;

    /// First we flatten tuples, then get common type for paths
    /// and finally unflatten paths and create new tuple type.
    for (const auto & type : types)
    {
        if (!typeid_cast<const DataTypeTuple *>(type.get()))
            throw Exception(ErrorCodes::LOGICAL_ERROR,
                "Least common type for object can be deduced only from tuples, but {} given", type->getName());

        auto [flat_paths, flat_types] = flattenTuple(type);
        assert(flat_paths.size() == flat_types.size());

        for (size_t i = 0; i < flat_paths.size(); ++i)
            subcolumns_types[flat_paths[i]].push_back(flat_types[i]);
    }

    PathsInData tuple_paths;
    DataTypes tuple_types;

    /// Get the least common type for all paths.
    for (const auto & [key, subtypes] : subcolumns_types)
    {
        assert(!subtypes.empty());

        /// The dummy column is skipped here; it is re-added below only if nothing else remains.
        if (key.getPath() == ColumnObject::COLUMN_NAME_DUMMY)
            continue;

        const size_t expected_dims = getNumberOfDimensions(*subtypes[0]);
        for (size_t i = 1; i < subtypes.size(); ++i)
        {
            if (expected_dims != getNumberOfDimensions(*subtypes[i]))
                throw Exception(ErrorCodes::TYPE_MISMATCH,
                    "Uncompatible types of subcolumn '{}': {} and {}",
                    key.getPath(), subtypes[0]->getName(), subtypes[i]->getName());
        }

        tuple_paths.emplace_back(key);
        tuple_types.emplace_back(getLeastSupertype(subtypes, /*allow_conversion_to_string=*/ true));
    }

    if (tuple_paths.empty())
    {
        tuple_paths.emplace_back(ColumnObject::COLUMN_NAME_DUMMY);
        tuple_types.emplace_back(std::make_shared<DataTypeUInt8>());
    }

    if (check_ambiguos_paths)
        checkObjectHasNoAmbiguosPaths(tuple_paths);

    return unflattenTuple(tuple_paths, tuple_types);
}
|
||||
|
||||
/// Returns the names of all columns in the list whose type is Object.
NameSet getNamesOfObjectColumns(const NamesAndTypesList & columns_list)
{
    NameSet object_names;
    for (const auto & [name, type] : columns_list)
    {
        if (!isObject(type))
            continue;

        object_names.insert(name);
    }

    return object_names;
}
|
||||
|
||||
bool hasObjectColumns(const ColumnsDescription & columns)
|
||||
{
|
||||
return std::any_of(columns.begin(), columns.end(), [](const auto & column) { return isObject(column.type); });
|
||||
}
|
||||
|
||||
/// For every column of the list that is found in 'object_columns', replaces its type with the type
/// stored there. With 'with_subcolumns' set, the subcolumns of such columns (as reported by
/// 'object_columns') are appended to the end of the list.
void extendObjectColumns(NamesAndTypesList & columns_list, const ColumnsDescription & object_columns, bool with_subcolumns)
{
    /// Collected separately and appended once after the loop, so the list is not grown while iterating.
    NamesAndTypesList subcolumns_list;

    for (auto & column : columns_list)
    {
        auto object_column = object_columns.tryGetColumn(GetColumnsOptions::All, column.name);
        if (!object_column)
            continue;

        column.type = object_column->type;

        if (with_subcolumns)
            subcolumns_list.splice(subcolumns_list.end(), object_columns.getSubcolumns(column.name));
    }

    columns_list.splice(columns_list.end(), std::move(subcolumns_list));
}
|
||||
|
||||
/// For every new column that already exists in 'object_columns' with a different type,
/// replaces the stored type with the least common type of the stored and the new one.
/// Columns that are absent from 'object_columns' or have an equal type are left as is.
void updateObjectColumns(ColumnsDescription & object_columns, const NamesAndTypesList & new_columns)
{
    for (const auto & new_column : new_columns)
    {
        auto object_column = object_columns.tryGetColumn(GetColumnsOptions::All, new_column.name);
        if (!object_column)
            continue;

        if (object_column->type->equals(*new_column.type))
            continue;

        object_columns.modify(new_column.name, [&](auto & column)
        {
            column.type = getLeastCommonTypeForObject({object_column->type, new_column.type});
        });
    }
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
/// Recursively flattens a (possibly nested) Tuple type into a list of paths and a parallel list of types.
///  - Tuple: every element name is appended to the current path and the element is processed recursively.
///  - Array: the nested type is flattened on its own (with a fresh builder); every resulting path is
///    appended to the current path and its type is wrapped into Array.
///  - Anything else is a leaf: the current path and the type are added to the output.
void flattenTupleImpl(
    PathInDataBuilder & builder,
    DataTypePtr type,
    std::vector<PathInData::Parts> & new_paths,
    DataTypes & new_types)
{
    if (const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type.get()))
    {
        const auto & element_names = type_tuple->getElementNames();
        const auto & element_types = type_tuple->getElements();

        for (size_t pos = 0; pos < element_names.size(); ++pos)
        {
            builder.append(element_names[pos], false);
            flattenTupleImpl(builder, element_types[pos], new_paths, new_types);
            builder.popBack();
        }

        return;
    }

    if (const auto * type_array = typeid_cast<const DataTypeArray *>(type.get()))
    {
        PathInDataBuilder nested_builder;
        std::vector<PathInData::Parts> nested_paths;
        DataTypes nested_types;

        flattenTupleImpl(nested_builder, type_array->getNestedType(), nested_paths, nested_types);
        assert(nested_paths.size() == nested_types.size());

        for (size_t pos = 0; pos < nested_paths.size(); ++pos)
        {
            const auto & nested_path = nested_paths[pos];

            builder.append(nested_path, true);
            new_paths.emplace_back(builder.getParts());
            new_types.emplace_back(std::make_shared<DataTypeArray>(nested_types[pos]));
            /// Restore the builder: remove exactly the parts that were appended above.
            builder.popBack(nested_path.size());
        }

        return;
    }

    new_paths.emplace_back(builder.getParts());
    new_types.emplace_back(type);
}
|
||||
|
||||
/// @offsets_columns are used as stack of array offsets and allows to recreate Array columns.
|
||||
void flattenTupleImpl(const ColumnPtr & column, Columns & new_columns, Columns & offsets_columns)
|
||||
{
|
||||
if (const auto * column_tuple = checkAndGetColumn<ColumnTuple>(column.get()))
|
||||
{
|
||||
const auto & subcolumns = column_tuple->getColumns();
|
||||
for (const auto & subcolumn : subcolumns)
|
||||
flattenTupleImpl(subcolumn, new_columns, offsets_columns);
|
||||
}
|
||||
else if (const auto * column_array = checkAndGetColumn<ColumnArray>(column.get()))
|
||||
{
|
||||
offsets_columns.push_back(column_array->getOffsetsPtr());
|
||||
flattenTupleImpl(column_array->getDataPtr(), new_columns, offsets_columns);
|
||||
offsets_columns.pop_back();
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!offsets_columns.empty())
|
||||
{
|
||||
auto new_column = ColumnArray::create(column, offsets_columns.back());
|
||||
for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it)
|
||||
new_column = ColumnArray::create(new_column, *it);
|
||||
|
||||
new_columns.push_back(std::move(new_column));
|
||||
}
|
||||
else
|
||||
{
|
||||
new_columns.push_back(column);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Strips @dimensions_to_reduce outer Array levels from @type and returns
/// the remaining nested type.
/// Throws LOGICAL_ERROR if the type has fewer Array levels than requested.
DataTypePtr reduceNumberOfDimensions(DataTypePtr type, size_t dimensions_to_reduce)
{
    for (size_t level = 0; level < dimensions_to_reduce; ++level)
    {
        const auto * as_array = typeid_cast<const DataTypeArray *>(type.get());
        if (as_array == nullptr)
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Not enough dimensions to reduce");

        type = as_array->getNestedType();
    }

    return type;
}
|
||||
|
||||
/// Strips @dimensions_to_reduce outer Array levels from @column and returns
/// the remaining data column.
/// Throws LOGICAL_ERROR if the column has fewer Array levels than requested.
ColumnPtr reduceNumberOfDimensions(ColumnPtr column, size_t dimensions_to_reduce)
{
    for (size_t level = 0; level < dimensions_to_reduce; ++level)
    {
        const auto * as_array = typeid_cast<const ColumnArray *>(column.get());
        if (as_array == nullptr)
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Not enough dimensions to reduce");

        column = as_array->getDataPtr();
    }

    return column;
}
|
||||
|
||||
/// We save intermediate column, type and number of array
|
||||
/// dimensions for each intermediate node in path in subcolumns tree.
|
||||
struct ColumnWithTypeAndDimensions
|
||||
{
|
||||
ColumnPtr column;
|
||||
DataTypePtr type;
|
||||
size_t array_dimensions;
|
||||
};
|
||||
|
||||
using SubcolumnsTreeWithColumns = SubcolumnsTree<ColumnWithTypeAndDimensions>;
|
||||
using Node = SubcolumnsTreeWithColumns::Node;
|
||||
|
||||
/// Creates data type and column from tree of subcolumns.
|
||||
ColumnWithTypeAndDimensions createTypeFromNode(const Node * node)
|
||||
{
|
||||
auto collect_tuple_elemets = [](const auto & children)
|
||||
{
|
||||
std::vector<std::tuple<String, ColumnWithTypeAndDimensions>> tuple_elements;
|
||||
tuple_elements.reserve(children.size());
|
||||
for (const auto & [name, child] : children)
|
||||
{
|
||||
auto column = createTypeFromNode(child.get());
|
||||
tuple_elements.emplace_back(name, std::move(column));
|
||||
}
|
||||
|
||||
/// Sort to always create the same type for the same set of subcolumns.
|
||||
std::sort(tuple_elements.begin(), tuple_elements.end(),
|
||||
[](const auto & lhs, const auto & rhs) { return std::get<0>(lhs) < std::get<0>(rhs); });
|
||||
|
||||
auto tuple_names = extractVector<0>(tuple_elements);
|
||||
auto tuple_columns = extractVector<1>(tuple_elements);
|
||||
|
||||
return std::make_tuple(std::move(tuple_names), std::move(tuple_columns));
|
||||
};
|
||||
|
||||
if (node->kind == Node::SCALAR)
|
||||
{
|
||||
return node->data;
|
||||
}
|
||||
else if (node->kind == Node::NESTED)
|
||||
{
|
||||
auto [tuple_names, tuple_columns] = collect_tuple_elemets(node->children);
|
||||
|
||||
Columns offsets_columns;
|
||||
offsets_columns.reserve(tuple_columns[0].array_dimensions + 1);
|
||||
|
||||
/// If we have a Nested node and child node with anonymous array levels
|
||||
/// we need to push a Nested type through all array levels.
|
||||
/// Example: { "k1": [[{"k2": 1, "k3": 2}] } should be parsed as
|
||||
/// `k1 Array(Nested(k2 Int, k3 Int))` and k1 is marked as Nested
|
||||
/// and `k2` and `k3` has anonymous_array_level = 1 in that case.
|
||||
|
||||
const auto & current_array = assert_cast<const ColumnArray &>(*node->data.column);
|
||||
offsets_columns.push_back(current_array.getOffsetsPtr());
|
||||
|
||||
auto first_column = tuple_columns[0].column;
|
||||
for (size_t i = 0; i < tuple_columns[0].array_dimensions; ++i)
|
||||
{
|
||||
const auto & column_array = assert_cast<const ColumnArray &>(*first_column);
|
||||
offsets_columns.push_back(column_array.getOffsetsPtr());
|
||||
first_column = column_array.getDataPtr();
|
||||
}
|
||||
|
||||
size_t num_elements = tuple_columns.size();
|
||||
Columns tuple_elements_columns(num_elements);
|
||||
DataTypes tuple_elements_types(num_elements);
|
||||
|
||||
/// Reduce extra array dimensions to get columns and types of Nested elements.
|
||||
for (size_t i = 0; i < num_elements; ++i)
|
||||
{
|
||||
assert(tuple_columns[i].array_dimensions == tuple_columns[0].array_dimensions);
|
||||
tuple_elements_columns[i] = reduceNumberOfDimensions(tuple_columns[i].column, tuple_columns[i].array_dimensions);
|
||||
tuple_elements_types[i] = reduceNumberOfDimensions(tuple_columns[i].type, tuple_columns[i].array_dimensions);
|
||||
}
|
||||
|
||||
auto result_column = ColumnArray::create(ColumnTuple::create(tuple_elements_columns), offsets_columns.back());
|
||||
auto result_type = createNested(tuple_elements_types, tuple_names);
|
||||
|
||||
/// Recreate result Array type and Array column.
|
||||
for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it)
|
||||
{
|
||||
result_column = ColumnArray::create(result_column, *it);
|
||||
result_type = std::make_shared<DataTypeArray>(result_type);
|
||||
}
|
||||
|
||||
return {result_column, result_type, tuple_columns[0].array_dimensions};
|
||||
}
|
||||
else
|
||||
{
|
||||
auto [tuple_names, tuple_columns] = collect_tuple_elemets(node->children);
|
||||
|
||||
size_t num_elements = tuple_columns.size();
|
||||
Columns tuple_elements_columns(num_elements);
|
||||
DataTypes tuple_elements_types(num_elements);
|
||||
|
||||
for (size_t i = 0; i < tuple_columns.size(); ++i)
|
||||
{
|
||||
assert(tuple_columns[i].array_dimensions == tuple_columns[0].array_dimensions);
|
||||
tuple_elements_columns[i] = tuple_columns[i].column;
|
||||
tuple_elements_types[i] = tuple_columns[i].type;
|
||||
}
|
||||
|
||||
auto result_column = ColumnTuple::create(tuple_elements_columns);
|
||||
auto result_type = std::make_shared<DataTypeTuple>(tuple_elements_types, tuple_names);
|
||||
|
||||
return {result_column, result_type, tuple_columns[0].array_dimensions};
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
std::pair<PathsInData, DataTypes> flattenTuple(const DataTypePtr & type)
|
||||
{
|
||||
std::vector<PathInData::Parts> new_path_parts;
|
||||
DataTypes new_types;
|
||||
PathInDataBuilder builder;
|
||||
|
||||
flattenTupleImpl(builder, type, new_path_parts, new_types);
|
||||
|
||||
PathsInData new_paths(new_path_parts.begin(), new_path_parts.end());
|
||||
return {new_paths, new_types};
|
||||
}
|
||||
|
||||
/// Flattens a (possibly nested) Tuple column into a plain Tuple column
/// built from all leaf columns found inside @column.
ColumnPtr flattenTuple(const ColumnPtr & column)
{
    Columns flattened;

    /// Stack of array offsets, used only while descending into @column.
    Columns offsets_stack;

    flattenTupleImpl(column, flattened, offsets_stack);

    return ColumnTuple::create(flattened);
}
|
||||
|
||||
/// Builds a nested Tuple type from flattened @paths and @tuple_types.
/// Creates a column for every type via createColumn() and delegates to the
/// overload that takes columns, keeping only the resulting type.
DataTypePtr unflattenTuple(const PathsInData & paths, const DataTypes & tuple_types)
{
    assert(paths.size() == tuple_types.size());

    const size_t num_types = tuple_types.size();

    Columns tuple_columns;
    tuple_columns.reserve(num_types);
    for (size_t idx = 0; idx < num_types; ++idx)
        tuple_columns.emplace_back(tuple_types[idx]->createColumn());

    auto column_and_type = unflattenTuple(paths, tuple_types, tuple_columns);
    return column_and_type.second;
}
|
||||
|
||||
std::pair<ColumnPtr, DataTypePtr> unflattenTuple(
|
||||
const PathsInData & paths,
|
||||
const DataTypes & tuple_types,
|
||||
const Columns & tuple_columns)
|
||||
{
|
||||
assert(paths.size() == tuple_types.size());
|
||||
assert(paths.size() == tuple_columns.size());
|
||||
|
||||
/// We add all paths to the subcolumn tree and then create a type from it.
|
||||
/// The tree stores column, type and number of array dimensions
|
||||
/// for each intermediate node.
|
||||
SubcolumnsTreeWithColumns tree;
|
||||
|
||||
for (size_t i = 0; i < paths.size(); ++i)
|
||||
{
|
||||
auto column = tuple_columns[i];
|
||||
auto type = tuple_types[i];
|
||||
|
||||
const auto & parts = paths[i].getParts();
|
||||
size_t num_parts = parts.size();
|
||||
|
||||
size_t pos = 0;
|
||||
tree.add(paths[i], [&](Node::Kind kind, bool exists) -> std::shared_ptr<Node>
|
||||
{
|
||||
if (pos >= num_parts)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR,
|
||||
"Not enough name parts for path {}. Expected at least {}, got {}",
|
||||
paths[i].getPath(), pos + 1, num_parts);
|
||||
|
||||
size_t array_dimensions = kind == Node::NESTED ? 1 : parts[pos].anonymous_array_level;
|
||||
ColumnWithTypeAndDimensions current_column{column, type, array_dimensions};
|
||||
|
||||
/// Get type and column for next node.
|
||||
if (array_dimensions)
|
||||
{
|
||||
type = reduceNumberOfDimensions(type, array_dimensions);
|
||||
column = reduceNumberOfDimensions(column, array_dimensions);
|
||||
}
|
||||
|
||||
++pos;
|
||||
if (exists)
|
||||
return nullptr;
|
||||
|
||||
return kind == Node::SCALAR
|
||||
? std::make_shared<Node>(kind, current_column, paths[i])
|
||||
: std::make_shared<Node>(kind, current_column);
|
||||
});
|
||||
}
|
||||
|
||||
auto [column, type, _] = createTypeFromNode(tree.getRoot());
|
||||
return std::make_pair(std::move(column), std::move(type));
|
||||
}
|
||||
|
||||
static void addConstantToWithClause(const ASTPtr & query, const String & column_name, const DataTypePtr & data_type)
|
||||
{
|
||||
auto & select = query->as<ASTSelectQuery &>();
|
||||
if (!select.with())
|
||||
select.setExpression(ASTSelectQuery::Expression::WITH, std::make_shared<ASTExpressionList>());
|
||||
|
||||
/// TODO: avoid materialize
|
||||
auto node = makeASTFunction("materialize",
|
||||
makeASTFunction("CAST",
|
||||
std::make_shared<ASTLiteral>(data_type->getDefault()),
|
||||
std::make_shared<ASTLiteral>(data_type->getName())));
|
||||
|
||||
node->alias = column_name;
|
||||
node->prefer_alias_to_column_name = true;
|
||||
select.with()->children.push_back(std::move(node));
|
||||
}
|
||||
|
||||
/// @expected_columns and @available_columns contain descriptions
|
||||
/// of extended Object columns.
|
||||
void replaceMissedSubcolumnsByConstants(
|
||||
const ColumnsDescription & expected_columns,
|
||||
const ColumnsDescription & available_columns,
|
||||
ASTPtr query)
|
||||
{
|
||||
NamesAndTypes missed_names_types;
|
||||
|
||||
/// Find all subcolumns that are in @expected_columns, but not in @available_columns.
|
||||
for (const auto & column : available_columns)
|
||||
{
|
||||
auto expected_column = expected_columns.getColumn(GetColumnsOptions::All, column.name);
|
||||
|
||||
/// Extract all paths from both descriptions to easily check existence of subcolumns.
|
||||
auto [available_paths, available_types] = flattenTuple(column.type);
|
||||
auto [expected_paths, expected_types] = flattenTuple(expected_column.type);
|
||||
|
||||
auto extract_names_and_types = [&column](const auto & paths, const auto & types)
|
||||
{
|
||||
NamesAndTypes res;
|
||||
res.reserve(paths.size());
|
||||
for (size_t i = 0; i < paths.size(); ++i)
|
||||
{
|
||||
auto full_name = Nested::concatenateName(column.name, paths[i].getPath());
|
||||
res.emplace_back(full_name, types[i]);
|
||||
}
|
||||
|
||||
std::sort(res.begin(), res.end());
|
||||
return res;
|
||||
};
|
||||
|
||||
auto available_names_types = extract_names_and_types(available_paths, available_types);
|
||||
auto expected_names_types = extract_names_and_types(expected_paths, expected_types);
|
||||
|
||||
std::set_difference(
|
||||
expected_names_types.begin(), expected_names_types.end(),
|
||||
available_names_types.begin(), available_names_types.end(),
|
||||
std::back_inserter(missed_names_types),
|
||||
[](const auto & lhs, const auto & rhs) { return lhs.name < rhs.name; });
|
||||
}
|
||||
|
||||
if (missed_names_types.empty())
|
||||
return;
|
||||
|
||||
IdentifierNameSet identifiers;
|
||||
query->collectIdentifierNames(identifiers);
|
||||
|
||||
/// Replace missed subcolumns to default literals of theirs type.
|
||||
for (const auto & [name, type] : missed_names_types)
|
||||
if (identifiers.count(name))
|
||||
addConstantToWithClause(query, name, type);
|
||||
}
|
||||
|
||||
/// Calls finalize() on every column of @columns that is a ColumnObject;
/// columns of other kinds are left untouched.
void finalizeObjectColumns(MutableColumns & columns)
{
    for (auto & column : columns)
    {
        auto * as_object = typeid_cast<ColumnObject *>(column.get());
        if (as_object == nullptr)
            continue;

        as_object->finalize();
    }
}
|
||||
|
||||
}
|
140
src/DataTypes/ObjectUtils.h
Normal file
140
src/DataTypes/ObjectUtils.h
Normal file
@ -0,0 +1,140 @@
|
||||
#pragma once
|
||||
|
||||
#include <Core/Block.h>
|
||||
#include <Core/NamesAndTypes.h>
|
||||
#include <Common/FieldVisitors.h>
|
||||
#include <Storages/ColumnsDescription.h>
|
||||
#include <DataTypes/DataTypeTuple.h>
|
||||
#include <DataTypes/Serializations/JSONDataParser.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <Columns/ColumnObject.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/// Returns number of dimensions in Array type. 0 if type is not array.
|
||||
size_t getNumberOfDimensions(const IDataType & type);
|
||||
|
||||
/// Returns number of dimensions in Array column. 0 if column is not array.
|
||||
size_t getNumberOfDimensions(const IColumn & column);
|
||||
|
||||
/// Returns type of scalars of Array of arbitrary dimensions.
|
||||
DataTypePtr getBaseTypeOfArray(const DataTypePtr & type);
|
||||
|
||||
/// Returns Array type with requested scalar type and number of dimensions.
|
||||
DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions);
|
||||
|
||||
/// Returns column of scalars of Array of arbitrary dimensions.
|
||||
ColumnPtr getBaseColumnOfArray(const ColumnPtr & column);
|
||||
|
||||
/// Returns empty Array column with requested scalar column and number of dimensions.
|
||||
ColumnPtr createArrayOfColumn(const ColumnPtr & column, size_t num_dimensions);
|
||||
|
||||
/// Returns Array with requested number of dimensions and no scalars.
|
||||
Array createEmptyArrayField(size_t num_dimensions);
|
||||
|
||||
/// Tries to get data type by column. Only limited subset of types is supported
|
||||
DataTypePtr getDataTypeByColumn(const IColumn & column);
|
||||
|
||||
/// Converts Object types and columns to Tuples in @columns_list and @block
|
||||
/// and checks that types are consistent with types in @extended_storage_columns.
|
||||
void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, const NamesAndTypesList & extended_storage_columns);
|
||||
|
||||
/// Checks that each path is not the prefix of any other path.
|
||||
void checkObjectHasNoAmbiguosPaths(const PathsInData & paths);
|
||||
|
||||
/// Receives several Tuple types and deduces the least common type among them.
|
||||
DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool check_ambiguos_paths = false);
|
||||
|
||||
/// Converts types of object columns to tuples in @columns_list
|
||||
/// according to @object_columns and adds all tuple's subcolumns if needed.
|
||||
void extendObjectColumns(NamesAndTypesList & columns_list, const ColumnsDescription & object_columns, bool with_subcolumns);
|
||||
|
||||
NameSet getNamesOfObjectColumns(const NamesAndTypesList & columns_list);
|
||||
bool hasObjectColumns(const ColumnsDescription & columns);
|
||||
void finalizeObjectColumns(MutableColumns & columns);
|
||||
|
||||
/// Updates types of objects in @object_columns inplace
|
||||
/// according to types in new_columns.
|
||||
void updateObjectColumns(ColumnsDescription & object_columns, const NamesAndTypesList & new_columns);
|
||||
|
||||
using DataTypeTuplePtr = std::shared_ptr<DataTypeTuple>;
|
||||
|
||||
/// Flattens nested Tuple to plain Tuple. I.e extracts all paths and types from tuple.
|
||||
/// E.g. Tuple(t Tuple(c1 UInt32, c2 String), c3 UInt64) -> Tuple(t.c1 UInt32, t.c2 String, c3 UInt64)
|
||||
std::pair<PathsInData, DataTypes> flattenTuple(const DataTypePtr & type);
|
||||
|
||||
/// Flattens nested Tuple column to plain Tuple column.
|
||||
ColumnPtr flattenTuple(const ColumnPtr & column);
|
||||
|
||||
/// The reverse operation to 'flattenTuple'.
|
||||
/// Creates nested Tuple from all paths and types.
|
||||
/// E.g. Tuple(t.c1 UInt32, t.c2 String, c3 UInt64) -> Tuple(t Tuple(c1 UInt32, c2 String), c3 UInt64)
|
||||
DataTypePtr unflattenTuple(
|
||||
const PathsInData & paths,
|
||||
const DataTypes & tuple_types);
|
||||
|
||||
std::pair<ColumnPtr, DataTypePtr> unflattenTuple(
|
||||
const PathsInData & paths,
|
||||
const DataTypes & tuple_types,
|
||||
const Columns & tuple_columns);
|
||||
|
||||
/// For all columns which exist in @expected_columns and
|
||||
/// don't exist in @available_columns adds to WITH clause
|
||||
/// an alias with column name to literal of default value of column type.
|
||||
void replaceMissedSubcolumnsByConstants(
|
||||
const ColumnsDescription & expected_columns,
|
||||
const ColumnsDescription & available_columns,
|
||||
ASTPtr query);
|
||||
|
||||
/// Receives range of objects, which contains collections
|
||||
/// of columns-like objects (e.g. ColumnsDescription or NamesAndTypesList)
|
||||
/// and deduces the common types of object columns for all entries.
|
||||
/// @entry_columns_getter should extract reference to collection of
|
||||
/// columns-like objects from entry to which Iterator points.
|
||||
/// columns-like object should have fields "name" and "type".
|
||||
template <typename Iterator, typename EntryColumnsGetter>
|
||||
ColumnsDescription getObjectColumns(
|
||||
Iterator begin, Iterator end,
|
||||
const ColumnsDescription & storage_columns,
|
||||
EntryColumnsGetter && entry_columns_getter)
|
||||
{
|
||||
ColumnsDescription res;
|
||||
|
||||
if (begin == end)
|
||||
{
|
||||
for (const auto & column : storage_columns)
|
||||
{
|
||||
if (isObject(column.type))
|
||||
{
|
||||
auto tuple_type = std::make_shared<DataTypeTuple>(
|
||||
DataTypes{std::make_shared<DataTypeUInt8>()},
|
||||
Names{ColumnObject::COLUMN_NAME_DUMMY});
|
||||
|
||||
res.add({column.name, std::move(tuple_type)});
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
std::unordered_map<String, DataTypes> types_in_entries;
|
||||
|
||||
for (auto it = begin; it != end; ++it)
|
||||
{
|
||||
const auto & entry_columns = entry_columns_getter(*it);
|
||||
for (const auto & column : entry_columns)
|
||||
{
|
||||
auto storage_column = storage_columns.tryGetPhysical(column.name);
|
||||
if (storage_column && isObject(storage_column->type))
|
||||
types_in_entries[column.name].push_back(column.type);
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto & [name, types] : types_in_entries)
|
||||
res.add({name, getLeastCommonTypeForObject(types)});
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
3
src/DataTypes/Serializations/CMakeLists.txt
Normal file
3
src/DataTypes/Serializations/CMakeLists.txt
Normal file
@ -0,0 +1,3 @@
|
||||
# Descend into the tests subdirectory only when ENABLE_TESTS is set.
if (ENABLE_TESTS)
    add_subdirectory (tests)
endif ()
|
@ -172,6 +172,10 @@ String getNameForSubstreamPath(
|
||||
else
|
||||
stream_name += "." + it->tuple_element_name;
|
||||
}
|
||||
else if (it->type == Substream::ObjectElement)
|
||||
{
|
||||
stream_name += escapeForFileName(".") + escapeForFileName(it->object_key_name);
|
||||
}
|
||||
}
|
||||
|
||||
return stream_name;
|
||||
|
@ -125,6 +125,9 @@ public:
|
||||
SparseElements,
|
||||
SparseOffsets,
|
||||
|
||||
ObjectStructure,
|
||||
ObjectElement,
|
||||
|
||||
Regular,
|
||||
};
|
||||
|
||||
@ -133,6 +136,9 @@ public:
|
||||
/// Index of tuple element, starting at 1 or name.
|
||||
String tuple_element_name;
|
||||
|
||||
/// Name of subcolumn of object column.
|
||||
String object_key_name;
|
||||
|
||||
/// Do we need to escape a dot in filenames for tuple elements.
|
||||
bool escape_tuple_delimiter = true;
|
||||
|
||||
|
183
src/DataTypes/Serializations/JSONDataParser.h
Normal file
183
src/DataTypes/Serializations/JSONDataParser.h
Normal file
@ -0,0 +1,183 @@
|
||||
#pragma once
|
||||
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <Common/checkStackSize.h>
|
||||
#include <DataTypes/Serializations/PathInData.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
class ReadBuffer;
|
||||
class WriteBuffer;
|
||||
|
||||
template <typename Element>
|
||||
static Field getValueAsField(const Element & element)
|
||||
{
|
||||
if (element.isBool()) return element.getBool();
|
||||
if (element.isInt64()) return element.getInt64();
|
||||
if (element.isUInt64()) return element.getUInt64();
|
||||
if (element.isDouble()) return element.getDouble();
|
||||
if (element.isString()) return element.getString();
|
||||
if (element.isNull()) return Field();
|
||||
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unsupported type of JSON field");
|
||||
}
|
||||
|
||||
template <typename ParserImpl>
|
||||
class JSONDataParser
|
||||
{
|
||||
public:
|
||||
using Element = typename ParserImpl::Element;
|
||||
|
||||
void readJSON(String & s, ReadBuffer & buf)
|
||||
{
|
||||
readJSONObjectPossiblyInvalid(s, buf);
|
||||
}
|
||||
|
||||
std::optional<ParseResult> parse(const char * begin, size_t length)
|
||||
{
|
||||
std::string_view json{begin, length};
|
||||
Element document;
|
||||
if (!parser.parse(json, document))
|
||||
return {};
|
||||
|
||||
ParseResult result;
|
||||
PathInDataBuilder builder;
|
||||
std::vector<PathInData::Parts> paths;
|
||||
|
||||
traverse(document, builder, paths, result.values);
|
||||
|
||||
result.paths.reserve(paths.size());
|
||||
for (auto && path : paths)
|
||||
result.paths.emplace_back(std::move(path));
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
void traverse(
|
||||
const Element & element,
|
||||
PathInDataBuilder & builder,
|
||||
std::vector<PathInData::Parts> & paths,
|
||||
std::vector<Field> & values)
|
||||
{
|
||||
checkStackSize();
|
||||
|
||||
if (element.isObject())
|
||||
{
|
||||
auto object = element.getObject();
|
||||
|
||||
paths.reserve(paths.size() + object.size());
|
||||
values.reserve(values.size() + object.size());
|
||||
|
||||
for (auto it = object.begin(); it != object.end(); ++it)
|
||||
{
|
||||
const auto & [key, value] = *it;
|
||||
traverse(value, builder.append(key, false), paths, values);
|
||||
builder.popBack();
|
||||
}
|
||||
}
|
||||
else if (element.isArray())
|
||||
{
|
||||
auto array = element.getArray();
|
||||
|
||||
using PathPartsWithArray = std::pair<PathInData::Parts, Array>;
|
||||
using PathToArray = HashMapWithStackMemory<UInt128, PathPartsWithArray, UInt128TrivialHash, 5>;
|
||||
|
||||
/// Traverse elements of array and collect an array
|
||||
/// of fields by each path.
|
||||
|
||||
PathToArray arrays_by_path;
|
||||
Arena strings_pool;
|
||||
|
||||
size_t current_size = 0;
|
||||
for (auto it = array.begin(); it != array.end(); ++it)
|
||||
{
|
||||
std::vector<PathInData::Parts> element_paths;
|
||||
std::vector<Field> element_values;
|
||||
PathInDataBuilder element_builder;
|
||||
|
||||
traverse(*it, element_builder, element_paths, element_values);
|
||||
size_t size = element_paths.size();
|
||||
size_t keys_to_update = arrays_by_path.size();
|
||||
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
UInt128 hash = PathInData::getPartsHash(element_paths[i]);
|
||||
if (auto * found = arrays_by_path.find(hash))
|
||||
{
|
||||
auto & path_array = found->getMapped().second;
|
||||
|
||||
assert(path_array.size() == current_size);
|
||||
path_array.push_back(std::move(element_values[i]));
|
||||
--keys_to_update;
|
||||
}
|
||||
else
|
||||
{
|
||||
/// We found a new key. Add and empty array with current size.
|
||||
Array path_array;
|
||||
path_array.reserve(array.size());
|
||||
path_array.resize(current_size);
|
||||
path_array.push_back(std::move(element_values[i]));
|
||||
|
||||
auto & elem = arrays_by_path[hash];
|
||||
elem.first = std::move(element_paths[i]);
|
||||
elem.second = std::move(path_array);
|
||||
}
|
||||
}
|
||||
|
||||
/// If some of the keys are missed in current element,
|
||||
/// add default values for them.
|
||||
if (keys_to_update)
|
||||
{
|
||||
for (auto & [_, value] : arrays_by_path)
|
||||
{
|
||||
auto & path_array = value.second;
|
||||
assert(path_array.size() == current_size || path_array.size() == current_size + 1);
|
||||
if (path_array.size() == current_size)
|
||||
path_array.push_back(Field());
|
||||
}
|
||||
}
|
||||
|
||||
++current_size;
|
||||
}
|
||||
|
||||
if (arrays_by_path.empty())
|
||||
{
|
||||
paths.push_back(builder.getParts());
|
||||
values.push_back(Array());
|
||||
}
|
||||
else
|
||||
{
|
||||
paths.reserve(paths.size() + arrays_by_path.size());
|
||||
values.reserve(values.size() + arrays_by_path.size());
|
||||
|
||||
for (auto && [_, value] : arrays_by_path)
|
||||
{
|
||||
auto && [path, path_array] = value;
|
||||
|
||||
/// Merge prefix path and path of array element.
|
||||
paths.push_back(builder.append(path, true).getParts());
|
||||
values.push_back(std::move(path_array));
|
||||
|
||||
builder.popBack(path.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
paths.push_back(builder.getParts());
|
||||
values.push_back(getValueAsField(element));
|
||||
}
|
||||
}
|
||||
|
||||
ParserImpl parser;
|
||||
};
|
||||
|
||||
}
|
199
src/DataTypes/Serializations/PathInData.cpp
Normal file
199
src/DataTypes/Serializations/PathInData.cpp
Normal file
@ -0,0 +1,199 @@
|
||||
#include <DataTypes/Serializations/PathInData.h>
|
||||
#include <DataTypes/NestedUtils.h>
|
||||
#include <DataTypes/DataTypeTuple.h>
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <Columns/ColumnTuple.h>
|
||||
#include <Columns/ColumnArray.h>
|
||||
#include <Common/SipHash.h>
|
||||
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
|
||||
#include <boost/algorithm/string/split.hpp>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/// Creates a path from its string representation: the string is copied to
/// @path and split by '.' into parts, whose keys point into the owned copy.
/// All parts are created as not nested with zero anonymous array level.
/// A string without dots (including an empty one) produces exactly one part.
PathInData::PathInData(std::string_view path_)
    : path(path_)
{
    const char * begin = path.data();
    const char * end = path.data() + path.size();

    for (const char * it = path.data(); it != end; ++it)
    {
        if (*it == '.')
        {
            size_t size = static_cast<size_t>(it - begin);
            parts.emplace_back(std::string_view{begin, size}, false, 0);
            begin = it + 1;
        }
    }

    /// The last part: everything after the last dot.
    /// The array level is an integer (UInt8), so pass an integer literal.
    size_t size = static_cast<size_t>(end - begin);
    parts.emplace_back(std::string_view{begin, size}, false, 0);
}
|
||||
|
||||
/// Creates a path from parts: builds the dot-separated string from the keys
/// and then recreates the parts so that their keys point into that string.
PathInData::PathInData(const Parts & parts_)
    : path(buildPath(parts_)), parts(buildParts(path, parts_))
{
}
|
||||
|
||||
/// Copy constructor. Parts are rebuilt instead of being copied,
/// so that their keys point into the string owned by this object.
PathInData::PathInData(const PathInData & other)
    : path(other.path), parts(buildParts(path, other.getParts()))
{
}
|
||||
|
||||
/// Copy assignment. Parts are rebuilt so that their keys point
/// into the string owned by this object. Self-assignment is a no-op.
PathInData & PathInData::operator=(const PathInData & other)
{
    if (this == &other)
        return *this;

    path = other.path;
    parts = buildParts(path, other.parts);

    return *this;
}
|
||||
|
||||
/// Calculates 128-bit SipHash of parts: the number of parts followed by
/// key bytes, nested flag and anonymous array level of every part.
UInt128 PathInData::getPartsHash(const Parts & parts_)
{
    SipHash hasher;
    hasher.update(parts_.size());

    for (const auto & part : parts_)
    {
        const auto & key = part.key;
        hasher.update(key.data(), key.length());
        hasher.update(part.is_nested);
        hasher.update(part.anonymous_array_level);
    }

    UInt128 result;
    hasher.get128(result);
    return result;
}
|
||||
|
||||
/// Serializes the path: number of parts as varint, then for every part
/// its key (binary string), nested flag and anonymous array level (both as varints).
void PathInData::writeBinary(WriteBuffer & out) const
{
    writeVarUInt(parts.size(), out);

    for (const auto & [key, is_nested, anonymous_array_level] : parts)
    {
        writeStringBinary(key, out);
        writeVarUInt(is_nested, out);
        writeVarUInt(anonymous_array_level, out);
    }
}
|
||||
|
||||
/// Deserializes the path written by writeBinary and replaces
/// the current content of the object with it.
void PathInData::readBinary(ReadBuffer & in)
{
    size_t num_parts;
    readVarUInt(num_parts, in);

    /// Keys are read into a temporary arena: the final parts are rebuilt
    /// below over the string owned by this object.
    Arena arena;
    Parts temp_parts;
    temp_parts.reserve(num_parts);

    for (size_t remaining = num_parts; remaining > 0; --remaining)
    {
        bool is_nested;
        UInt8 anonymous_array_level;

        auto ref = readStringBinaryInto(arena, in);
        readVarUInt(is_nested, in);
        readVarUInt(anonymous_array_level, in);

        temp_parts.emplace_back(static_cast<std::string_view>(ref), is_nested, anonymous_array_level);
    }

    /// Recreate path and parts.
    path = buildPath(temp_parts);
    parts = buildParts(path, temp_parts);
}
|
||||
|
||||
/// Creates full path from parts: keys of all parts joined by '.'.
/// Returns an empty string for an empty list of parts.
String PathInData::buildPath(const Parts & other_parts)
{
    String joined;

    bool is_first = true;
    for (const auto & part : other_parts)
    {
        /// Separator goes before every key except the first one.
        if (!is_first)
            joined += ".";

        joined += part.key;
        is_first = false;
    }

    return joined;
}
|
||||
|
||||
/// Creates a copy of @other_parts whose keys point into @other_path.
/// Keys are laid out one after another, skipping one separator character
/// after each key; flags and array levels are taken from @other_parts.
PathInData::Parts PathInData::buildParts(const String & other_path, const Parts & other_parts)
{
    Parts rebuilt;

    if (other_parts.empty())
        return rebuilt;

    const char * key_begin = other_path.data();
    for (const auto & part : other_parts)
    {
        const size_t key_length = part.key.length();

        rebuilt.emplace_back(std::string_view{key_begin, key_length}, part.is_nested, part.anonymous_array_level);

        /// Skip the key and the separator that follows it.
        key_begin += key_length + 1;
    }

    return rebuilt;
}
|
||||
|
||||
size_t PathInData::Hash::operator()(const PathInData & value) const
|
||||
{
|
||||
auto hash = getPartsHash(value.parts);
|
||||
return hash.items[0] ^ hash.items[1];
|
||||
}
|
||||
|
||||
/// Appends one key to the path. An empty key adds no part.
/// While the builder has no parts, array levels are only accumulated;
/// the accumulated level is assigned to the next added part and reset.
/// If the builder already has parts, the last of them is marked
/// as nested according to @is_array.
PathInDataBuilder & PathInDataBuilder::append(std::string_view key, bool is_array)
{
    const bool has_parts = !parts.empty();

    if (!has_parts)
        current_anonymous_array_level += is_array;

    if (key.empty())
        return *this;

    if (has_parts)
        parts.back().is_nested = is_array;

    parts.emplace_back(key, false, current_anonymous_array_level);
    current_anonymous_array_level = 0;

    return *this;
}
|
||||
|
||||
/// Appends all parts of @path to the builder. An empty @path adds no parts.
/// While the builder has no parts, array levels are only accumulated;
/// the accumulated level is added to every appended part and reset.
/// If the builder already has parts, the last of them is marked
/// as nested according to @is_array.
PathInDataBuilder & PathInDataBuilder::append(const PathInData::Parts & path, bool is_array)
{
    const bool has_parts = !parts.empty();

    if (!has_parts)
        current_anonymous_array_level += is_array;

    if (path.empty())
        return *this;

    if (has_parts)
        parts.back().is_nested = is_array;

    /// Index of the first appended part.
    const size_t first_appended = parts.size();
    parts.insert(parts.end(), path.begin(), path.end());

    for (size_t idx = first_appended; idx < parts.size(); ++idx)
        parts[idx].anonymous_array_level += current_anonymous_array_level;

    current_anonymous_array_level = 0;

    return *this;
}
|
||||
|
||||
void PathInDataBuilder::popBack()
|
||||
{
|
||||
parts.pop_back();
|
||||
}
|
||||
|
||||
void PathInDataBuilder::popBack(size_t n)
|
||||
{
|
||||
assert(n <= parts.size());
|
||||
parts.resize(parts.size() - n);
|
||||
}
|
||||
|
||||
}
|
112
src/DataTypes/Serializations/PathInData.h
Normal file
112
src/DataTypes/Serializations/PathInData.h
Normal file
@ -0,0 +1,112 @@
|
||||
#pragma once
|
||||
|
||||
#include <Core/Types.h>
|
||||
#include <Core/Field.h>
|
||||
#include <bitset>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class ReadBuffer;
|
||||
class WriteBuffer;
|
||||
|
||||
/// Class that represents path in document, e.g. JSON.
|
||||
class PathInData
|
||||
{
|
||||
public:
|
||||
struct Part
|
||||
{
|
||||
Part() = default;
|
||||
Part(std::string_view key_, bool is_nested_, UInt8 anonymous_array_level_)
|
||||
: key(key_), is_nested(is_nested_), anonymous_array_level(anonymous_array_level_)
|
||||
{
|
||||
}
|
||||
|
||||
/// Name of part of path.
|
||||
std::string_view key;
|
||||
|
||||
/// If this part is Nested, i.e. element
|
||||
/// related to this key is the array of objects.
|
||||
bool is_nested = false;
|
||||
|
||||
/// Number of array levels between current key and previous key.
|
||||
/// E.g. in JSON {"k1": [[[{"k2": 1, "k3": 2}]]]}
|
||||
/// "k1" is nested and has anonymous_array_level = 0.
|
||||
/// "k2" and "k3" are not nested and have anonymous_array_level = 2.
|
||||
UInt8 anonymous_array_level = 0;
|
||||
|
||||
bool operator==(const Part & other) const = default;
|
||||
};
|
||||
|
||||
using Parts = std::vector<Part>;
|
||||
|
||||
PathInData() = default;
|
||||
explicit PathInData(std::string_view path_);
|
||||
explicit PathInData(const Parts & parts_);
|
||||
|
||||
PathInData(const PathInData & other);
|
||||
PathInData & operator=(const PathInData & other);
|
||||
|
||||
static UInt128 getPartsHash(const Parts & parts_);
|
||||
|
||||
bool empty() const { return parts.empty(); }
|
||||
|
||||
const String & getPath() const { return path; }
|
||||
const Parts & getParts() const { return parts; }
|
||||
|
||||
bool isNested(size_t i) const { return parts[i].is_nested; }
|
||||
bool hasNested() const { return std::any_of(parts.begin(), parts.end(), [](const auto & part) { return part.is_nested; }); }
|
||||
|
||||
void writeBinary(WriteBuffer & out) const;
|
||||
void readBinary(ReadBuffer & in);
|
||||
|
||||
bool operator==(const PathInData & other) const { return parts == other.parts; }
|
||||
struct Hash { size_t operator()(const PathInData & value) const; };
|
||||
|
||||
private:
|
||||
/// Creates full path from parts.
|
||||
static String buildPath(const Parts & other_parts);
|
||||
|
||||
/// Creates new parts full from full path with correct string pointers.
|
||||
static Parts buildParts(const String & other_path, const Parts & other_parts);
|
||||
|
||||
/// The full path. Parts are separated by dots.
|
||||
String path;
|
||||
|
||||
/// Parts of the path. All string_view-s in parts must point to the @path.
|
||||
Parts parts;
|
||||
};
|
||||
|
||||
class PathInDataBuilder
|
||||
{
|
||||
public:
|
||||
const PathInData::Parts & getParts() const { return parts; }
|
||||
|
||||
PathInDataBuilder & append(std::string_view key, bool is_array);
|
||||
PathInDataBuilder & append(const PathInData::Parts & path, bool is_array);
|
||||
|
||||
void popBack();
|
||||
void popBack(size_t n);
|
||||
|
||||
private:
|
||||
PathInData::Parts parts;
|
||||
|
||||
/// Number of array levels without key to which
|
||||
/// next non-empty key will be nested.
|
||||
/// Example: for JSON { "k1": [[{"k2": 1, "k3": 2}] }
|
||||
// `k2` and `k3` has anonymous_array_level = 1 in that case.
|
||||
size_t current_anonymous_array_level = 0;
|
||||
};
|
||||
|
||||
using PathsInData = std::vector<PathInData>;
|
||||
|
||||
/// Result of parsing of a document.
|
||||
/// Contains all paths extracted from document
|
||||
/// and values which are related to them.
|
||||
struct ParseResult
|
||||
{
|
||||
std::vector<PathInData> paths;
|
||||
std::vector<Field> values;
|
||||
};
|
||||
|
||||
}
|
460
src/DataTypes/Serializations/SerializationObject.cpp
Normal file
460
src/DataTypes/Serializations/SerializationObject.cpp
Normal file
@ -0,0 +1,460 @@
|
||||
#include <DataTypes/Serializations/SerializationObject.h>
|
||||
#include <DataTypes/Serializations/JSONDataParser.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <DataTypes/ObjectUtils.h>
|
||||
#include <DataTypes/DataTypeFactory.h>
|
||||
#include <DataTypes/NestedUtils.h>
|
||||
#include <Common/JSONParsers/SimdJSONParser.h>
|
||||
#include <Common/JSONParsers/RapidJSONParser.h>
|
||||
#include <Common/HashTable/HashSet.h>
|
||||
#include <Columns/ColumnObject.h>
|
||||
|
||||
#include <Common/FieldVisitorToString.h>
|
||||
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <IO/VarInt.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int NOT_IMPLEMENTED;
|
||||
extern const int INCORRECT_DATA;
|
||||
extern const int CANNOT_READ_ALL_DATA;
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
/// Visitor that keeps @num_dimensions_to_keep dimensions in arrays
|
||||
/// and replaces all scalars or nested arrays to @replacement at that level.
|
||||
class FieldVisitorReplaceScalars : public StaticVisitor<Field>
|
||||
{
|
||||
public:
|
||||
FieldVisitorReplaceScalars(const Field & replacement_, size_t num_dimensions_to_keep_)
|
||||
: replacement(replacement_), num_dimensions_to_keep(num_dimensions_to_keep_)
|
||||
{
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Field operator()(const T & x) const
|
||||
{
|
||||
if constexpr (std::is_same_v<T, Array>)
|
||||
{
|
||||
if (num_dimensions_to_keep == 0)
|
||||
return replacement;
|
||||
|
||||
const size_t size = x.size();
|
||||
Array res(size);
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
res[i] = applyVisitor(FieldVisitorReplaceScalars(replacement, num_dimensions_to_keep - 1), x[i]);
|
||||
return res;
|
||||
}
|
||||
else
|
||||
return replacement;
|
||||
}
|
||||
|
||||
private:
|
||||
const Field & replacement;
|
||||
size_t num_dimensions_to_keep;
|
||||
};
|
||||
|
||||
using Node = typename ColumnObject::SubcolumnsTree::Node;
|
||||
|
||||
/// Finds a subcolumn from the same Nested type as @entry and inserts
|
||||
/// an array with default values with consistent sizes as in Nested type.
|
||||
bool tryInsertDefaultFromNested(
|
||||
std::shared_ptr<Node> entry, const ColumnObject::SubcolumnsTree & subcolumns)
|
||||
{
|
||||
if (!entry->path.hasNested())
|
||||
return false;
|
||||
|
||||
const Node * current_node = subcolumns.findLeaf(entry->path);
|
||||
const Node * leaf = nullptr;
|
||||
size_t num_skipped_nested = 0;
|
||||
|
||||
while (current_node)
|
||||
{
|
||||
/// Try to find the first Nested up to the current node.
|
||||
const auto * node_nested = subcolumns.findParent(current_node,
|
||||
[](const auto & candidate) { return candidate.isNested(); });
|
||||
|
||||
if (!node_nested)
|
||||
break;
|
||||
|
||||
/// If there are no leaves, skip current node and find
|
||||
/// the next node up to the current.
|
||||
leaf = subcolumns.findLeaf(node_nested,
|
||||
[&](const auto & candidate)
|
||||
{
|
||||
return candidate.data.size() == entry->data.size() + 1;
|
||||
});
|
||||
|
||||
if (leaf)
|
||||
break;
|
||||
|
||||
current_node = node_nested->parent;
|
||||
++num_skipped_nested;
|
||||
}
|
||||
|
||||
if (!leaf)
|
||||
return false;
|
||||
|
||||
auto last_field = leaf->data.getLastField();
|
||||
if (last_field.isNull())
|
||||
return false;
|
||||
|
||||
const auto & least_common_type = entry->data.getLeastCommonType();
|
||||
size_t num_dimensions = getNumberOfDimensions(*least_common_type);
|
||||
assert(num_skipped_nested < num_dimensions);
|
||||
|
||||
/// Replace scalars to default values with consistent array sizes.
|
||||
size_t num_dimensions_to_keep = num_dimensions - num_skipped_nested;
|
||||
auto default_scalar = num_skipped_nested
|
||||
? createEmptyArrayField(num_skipped_nested)
|
||||
: getBaseTypeOfArray(least_common_type)->getDefault();
|
||||
|
||||
auto default_field = applyVisitor(FieldVisitorReplaceScalars(default_scalar, num_dimensions_to_keep), last_field);
|
||||
entry->data.insert(std::move(default_field));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
template <typename Reader>
|
||||
void SerializationObject<Parser>::deserializeTextImpl(IColumn & column, Reader && reader) const
|
||||
{
|
||||
auto & column_object = assert_cast<ColumnObject &>(column);
|
||||
|
||||
String buf;
|
||||
reader(buf);
|
||||
|
||||
auto result = parser.parse(buf.data(), buf.size());
|
||||
if (!result)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse object");
|
||||
|
||||
auto & [paths, values] = *result;
|
||||
assert(paths.size() == values.size());
|
||||
|
||||
HashSet<StringRef, StringRefHash> paths_set;
|
||||
size_t column_size = column_object.size();
|
||||
|
||||
for (size_t i = 0; i < paths.size(); ++i)
|
||||
{
|
||||
auto field_info = getFieldInfo(values[i]);
|
||||
if (isNothing(field_info.scalar_type))
|
||||
continue;
|
||||
|
||||
if (!paths_set.insert(paths[i].getPath()).second)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA,
|
||||
"Object has ambiguous path: {}", paths[i].getPath());
|
||||
|
||||
if (!column_object.hasSubcolumn(paths[i]))
|
||||
{
|
||||
if (paths[i].hasNested())
|
||||
column_object.addNestedSubcolumn(paths[i], field_info, column_size);
|
||||
else
|
||||
column_object.addSubcolumn(paths[i], column_size);
|
||||
}
|
||||
|
||||
auto & subcolumn = column_object.getSubcolumn(paths[i]);
|
||||
assert(subcolumn.size() == column_size);
|
||||
|
||||
subcolumn.insert(std::move(values[i]), std::move(field_info));
|
||||
}
|
||||
|
||||
/// Insert default values to missed subcolumns.
|
||||
const auto & subcolumns = column_object.getSubcolumns();
|
||||
for (const auto & entry : subcolumns)
|
||||
{
|
||||
if (!paths_set.has(entry->path.getPath()))
|
||||
{
|
||||
bool inserted = tryInsertDefaultFromNested(entry, subcolumns);
|
||||
if (!inserted)
|
||||
entry->data.insertDefault();
|
||||
}
|
||||
}
|
||||
|
||||
column_object.incrementNumRows();
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
|
||||
{
|
||||
deserializeTextImpl(column, [&](String & s) { readStringInto(s, istr); });
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
|
||||
{
|
||||
deserializeTextImpl(column, [&](String & s) { readEscapedStringInto(s, istr); });
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
|
||||
{
|
||||
deserializeTextImpl(column, [&](String & s) { readQuotedStringInto<true>(s, istr); });
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
|
||||
{
|
||||
deserializeTextImpl(column, [&](String & s) { parser.readJSON(s, istr); });
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
|
||||
{
|
||||
deserializeTextImpl(column, [&](String & s) { readCSVStringInto(s, istr, settings.csv); });
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
template <typename TSettings, typename TStatePtr>
|
||||
void SerializationObject<Parser>::checkSerializationIsSupported(const TSettings & settings, const TStatePtr & state) const
|
||||
{
|
||||
if (settings.position_independent_encoding)
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
|
||||
"DataTypeObject doesn't support serialization with position independent encoding");
|
||||
|
||||
if (state)
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
|
||||
"DataTypeObject doesn't support serialization with non-trivial state");
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::serializeBinaryBulkStatePrefix(
|
||||
SerializeBinaryBulkSettings & settings,
|
||||
SerializeBinaryBulkStatePtr & state) const
|
||||
{
|
||||
checkSerializationIsSupported(settings, state);
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::serializeBinaryBulkStateSuffix(
|
||||
SerializeBinaryBulkSettings & settings,
|
||||
SerializeBinaryBulkStatePtr & state) const
|
||||
{
|
||||
checkSerializationIsSupported(settings, state);
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::deserializeBinaryBulkStatePrefix(
|
||||
DeserializeBinaryBulkSettings & settings,
|
||||
DeserializeBinaryBulkStatePtr & state) const
|
||||
{
|
||||
checkSerializationIsSupported(settings, state);
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::serializeBinaryBulkWithMultipleStreams(
|
||||
const IColumn & column,
|
||||
size_t offset,
|
||||
size_t limit,
|
||||
SerializeBinaryBulkSettings & settings,
|
||||
SerializeBinaryBulkStatePtr & state) const
|
||||
{
|
||||
checkSerializationIsSupported(settings, state);
|
||||
const auto & column_object = assert_cast<const ColumnObject &>(column);
|
||||
|
||||
if (!column_object.isFinalized())
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write non-finalized ColumnObject");
|
||||
|
||||
settings.path.push_back(Substream::ObjectStructure);
|
||||
if (auto * stream = settings.getter(settings.path))
|
||||
writeVarUInt(column_object.getSubcolumns().size(), *stream);
|
||||
|
||||
const auto & subcolumns = column_object.getSubcolumns();
|
||||
for (const auto & entry : subcolumns)
|
||||
{
|
||||
settings.path.back() = Substream::ObjectStructure;
|
||||
settings.path.back().object_key_name = entry->path.getPath();
|
||||
|
||||
const auto & type = entry->data.getLeastCommonType();
|
||||
if (auto * stream = settings.getter(settings.path))
|
||||
{
|
||||
entry->path.writeBinary(*stream);
|
||||
writeStringBinary(type->getName(), *stream);
|
||||
}
|
||||
|
||||
settings.path.back() = Substream::ObjectElement;
|
||||
if (auto * stream = settings.getter(settings.path))
|
||||
{
|
||||
auto serialization = type->getDefaultSerialization();
|
||||
serialization->serializeBinaryBulkWithMultipleStreams(
|
||||
entry->data.getFinalizedColumn(), offset, limit, settings, state);
|
||||
}
|
||||
}
|
||||
|
||||
settings.path.pop_back();
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::deserializeBinaryBulkWithMultipleStreams(
|
||||
ColumnPtr & column,
|
||||
size_t limit,
|
||||
DeserializeBinaryBulkSettings & settings,
|
||||
DeserializeBinaryBulkStatePtr & state,
|
||||
SubstreamsCache * cache) const
|
||||
{
|
||||
checkSerializationIsSupported(settings, state);
|
||||
if (!column->empty())
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
|
||||
"DataTypeObject cannot be deserialized to non-empty column");
|
||||
|
||||
auto mutable_column = column->assumeMutable();
|
||||
auto & column_object = typeid_cast<ColumnObject &>(*mutable_column);
|
||||
|
||||
size_t num_subcolumns = 0;
|
||||
settings.path.push_back(Substream::ObjectStructure);
|
||||
if (auto * stream = settings.getter(settings.path))
|
||||
readVarUInt(num_subcolumns, *stream);
|
||||
|
||||
settings.path.back() = Substream::ObjectElement;
|
||||
for (size_t i = 0; i < num_subcolumns; ++i)
|
||||
{
|
||||
PathInData key;
|
||||
String type_name;
|
||||
|
||||
settings.path.back() = Substream::ObjectStructure;
|
||||
if (auto * stream = settings.getter(settings.path))
|
||||
{
|
||||
key.readBinary(*stream);
|
||||
readStringBinary(type_name, *stream);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA,
|
||||
"Cannot read structure of DataTypeObject, because its stream is missing");
|
||||
}
|
||||
|
||||
settings.path.back() = Substream::ObjectElement;
|
||||
settings.path.back().object_key_name = key.getPath();
|
||||
|
||||
if (auto * stream = settings.getter(settings.path))
|
||||
{
|
||||
auto type = DataTypeFactory::instance().get(type_name);
|
||||
auto serialization = type->getDefaultSerialization();
|
||||
ColumnPtr subcolumn_data = type->createColumn();
|
||||
serialization->deserializeBinaryBulkWithMultipleStreams(subcolumn_data, limit, settings, state, cache);
|
||||
column_object.addSubcolumn(key, subcolumn_data->assumeMutable());
|
||||
}
|
||||
else
|
||||
{
|
||||
throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA,
|
||||
"Cannot read subcolumn '{}' of DataTypeObject, because its stream is missing", key.getPath());
|
||||
}
|
||||
}
|
||||
|
||||
settings.path.pop_back();
|
||||
column_object.checkConsistency();
|
||||
column_object.finalize();
|
||||
column = std::move(mutable_column);
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::serializeBinary(const Field &, WriteBuffer &) const
|
||||
{
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject");
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::deserializeBinary(Field &, ReadBuffer &) const
|
||||
{
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject");
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::serializeBinary(const IColumn &, size_t, WriteBuffer &) const
|
||||
{
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject");
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::deserializeBinary(IColumn &, ReadBuffer &) const
|
||||
{
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented for SerializationObject");
|
||||
}
|
||||
|
||||
/// TODO: use a format other than JSON in text serializations.
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
|
||||
{
|
||||
const auto & column_object = assert_cast<const ColumnObject &>(column);
|
||||
const auto & subcolumns = column_object.getSubcolumns();
|
||||
|
||||
writeChar('{', ostr);
|
||||
for (auto it = subcolumns.begin(); it != subcolumns.end(); ++it)
|
||||
{
|
||||
if (it != subcolumns.begin())
|
||||
writeCString(",", ostr);
|
||||
|
||||
writeDoubleQuoted((*it)->path.getPath(), ostr);
|
||||
writeChar(':', ostr);
|
||||
|
||||
auto serialization = (*it)->data.getLeastCommonType()->getDefaultSerialization();
|
||||
serialization->serializeTextJSON((*it)->data.getFinalizedColumn(), row_num, ostr, settings);
|
||||
}
|
||||
writeChar('}', ostr);
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
|
||||
{
|
||||
serializeTextImpl(column, row_num, ostr, settings);
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
|
||||
{
|
||||
WriteBufferFromOwnString ostr_str;
|
||||
serializeTextImpl(column, row_num, ostr_str, settings);
|
||||
writeEscapedString(ostr_str.str(), ostr);
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
|
||||
{
|
||||
WriteBufferFromOwnString ostr_str;
|
||||
serializeTextImpl(column, row_num, ostr_str, settings);
|
||||
writeQuotedString(ostr_str.str(), ostr);
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
|
||||
{
|
||||
serializeTextImpl(column, row_num, ostr, settings);
|
||||
}
|
||||
|
||||
template <typename Parser>
|
||||
void SerializationObject<Parser>::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
|
||||
{
|
||||
WriteBufferFromOwnString ostr_str;
|
||||
serializeTextImpl(column, row_num, ostr_str, settings);
|
||||
writeCSVString(ostr_str.str(), ostr);
|
||||
}
|
||||
|
||||
/// Creates the serialization of data type Object for the given schema format.
/// Only "json" is known; it is backed by SimdJSON if available, otherwise by RapidJSON.
/// Throws NOT_IMPLEMENTED for any other format, or if neither JSON library is built in.
SerializationPtr getObjectSerialization(const String & schema_format)
{
    if (schema_format != "json")
        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unknown schema format '{}'", schema_format);

#if USE_SIMDJSON
    return std::make_shared<SerializationObject<JSONDataParser<SimdJSONParser>>>();
#elif USE_RAPIDJSON
    return std::make_shared<SerializationObject<JSONDataParser<RapidJSONParser>>>();
#else
    throw Exception(ErrorCodes::NOT_IMPLEMENTED,
        "To use data type Object with JSON format ClickHouse should be built with Simdjson or Rapidjson");
#endif
}
|
||||
|
||||
}
|
73
src/DataTypes/Serializations/SerializationObject.h
Normal file
73
src/DataTypes/Serializations/SerializationObject.h
Normal file
@ -0,0 +1,73 @@
|
||||
#pragma once
|
||||
|
||||
#include <DataTypes/Serializations/SimpleTextSerialization.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/// Serialization for data type Object.
|
||||
/// Supported only test serialization/deserialization.
|
||||
/// and binary bulk serialization/deserialization without position independent
|
||||
/// encoding, i.e. serialization/deserialization into Native format.
|
||||
template <typename Parser>
|
||||
class SerializationObject : public ISerialization
|
||||
{
|
||||
public:
|
||||
void serializeBinaryBulkStatePrefix(
|
||||
SerializeBinaryBulkSettings & settings,
|
||||
SerializeBinaryBulkStatePtr & state) const override;
|
||||
|
||||
void serializeBinaryBulkStateSuffix(
|
||||
SerializeBinaryBulkSettings & settings,
|
||||
SerializeBinaryBulkStatePtr & state) const override;
|
||||
|
||||
void deserializeBinaryBulkStatePrefix(
|
||||
DeserializeBinaryBulkSettings & settings,
|
||||
DeserializeBinaryBulkStatePtr & state) const override;
|
||||
|
||||
void serializeBinaryBulkWithMultipleStreams(
|
||||
const IColumn & column,
|
||||
size_t offset,
|
||||
size_t limit,
|
||||
SerializeBinaryBulkSettings & settings,
|
||||
SerializeBinaryBulkStatePtr & state) const override;
|
||||
|
||||
void deserializeBinaryBulkWithMultipleStreams(
|
||||
ColumnPtr & column,
|
||||
size_t limit,
|
||||
DeserializeBinaryBulkSettings & settings,
|
||||
DeserializeBinaryBulkStatePtr & state,
|
||||
SubstreamsCache * cache) const override;
|
||||
|
||||
void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
|
||||
void deserializeBinary(Field & field, ReadBuffer & istr) const override;
|
||||
void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override;
|
||||
void deserializeBinary(IColumn & column, ReadBuffer & istr) const override;
|
||||
|
||||
void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
|
||||
void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
|
||||
void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
|
||||
void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
|
||||
void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
|
||||
|
||||
void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
|
||||
void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
|
||||
void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
|
||||
void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
|
||||
void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
|
||||
|
||||
private:
|
||||
template <typename TSettings, typename TStatePtr>
|
||||
void checkSerializationIsSupported(const TSettings & settings, const TStatePtr & state) const;
|
||||
|
||||
template <typename Reader>
|
||||
void deserializeTextImpl(IColumn & column, Reader && reader) const;
|
||||
|
||||
void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const;
|
||||
|
||||
mutable Parser parser;
|
||||
};
|
||||
|
||||
SerializationPtr getObjectSerialization(const String & schema_format);
|
||||
|
||||
}
|
209
src/DataTypes/Serializations/SubcolumnsTree.h
Normal file
209
src/DataTypes/Serializations/SubcolumnsTree.h
Normal file
@ -0,0 +1,209 @@
|
||||
#pragma once
|
||||
|
||||
#include <DataTypes/Serializations/PathInData.h>
|
||||
#include <DataTypes/IDataType.h>
|
||||
#include <Columns/IColumn.h>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/// Tree that represents paths in document
|
||||
/// with additional data in nodes.
|
||||
template <typename NodeData>
|
||||
class SubcolumnsTree
|
||||
{
|
||||
public:
|
||||
struct Node
|
||||
{
|
||||
enum Kind
|
||||
{
|
||||
TUPLE,
|
||||
NESTED,
|
||||
SCALAR,
|
||||
};
|
||||
|
||||
explicit Node(Kind kind_) : kind(kind_) {}
|
||||
Node(Kind kind_, const NodeData & data_) : kind(kind_), data(data_) {}
|
||||
Node(Kind kind_, const NodeData & data_, const PathInData & path_)
|
||||
: kind(kind_), data(data_), path(path_) {}
|
||||
|
||||
Kind kind = TUPLE;
|
||||
const Node * parent = nullptr;
|
||||
|
||||
std::map<String, std::shared_ptr<Node>, std::less<>> children;
|
||||
|
||||
NodeData data;
|
||||
PathInData path;
|
||||
|
||||
bool isNested() const { return kind == NESTED; }
|
||||
bool isScalar() const { return kind == SCALAR; }
|
||||
|
||||
void addChild(const String & key, std::shared_ptr<Node> next_node)
|
||||
{
|
||||
next_node->parent = this;
|
||||
children[key] = std::move(next_node);
|
||||
}
|
||||
};
|
||||
|
||||
using NodeKind = typename Node::Kind;
|
||||
using NodePtr = std::shared_ptr<Node>;
|
||||
|
||||
/// Add a leaf without any data in other nodes.
|
||||
bool add(const PathInData & path, const NodeData & leaf_data)
|
||||
{
|
||||
return add(path, [&](NodeKind kind, bool exists) -> NodePtr
|
||||
{
|
||||
if (exists)
|
||||
return nullptr;
|
||||
|
||||
if (kind == Node::SCALAR)
|
||||
return std::make_shared<Node>(kind, leaf_data, path);
|
||||
|
||||
return std::make_shared<Node>(kind);
|
||||
});
|
||||
}
|
||||
|
||||
/// Callback for creation of node. Receives kind of node and
|
||||
/// flag, which is true if node already exists.
|
||||
using NodeCreator = std::function<NodePtr(NodeKind, bool)>;
|
||||
|
||||
bool add(const PathInData & path, const NodeCreator & node_creator)
|
||||
{
|
||||
const auto & parts = path.getParts();
|
||||
|
||||
if (parts.empty())
|
||||
return false;
|
||||
|
||||
if (!root)
|
||||
root = std::make_shared<Node>(Node::TUPLE);
|
||||
|
||||
Node * current_node = root.get();
|
||||
for (size_t i = 0; i < parts.size() - 1; ++i)
|
||||
{
|
||||
assert(current_node->kind != Node::SCALAR);
|
||||
|
||||
auto it = current_node->children.find(parts[i].key);
|
||||
if (it != current_node->children.end())
|
||||
{
|
||||
current_node = it->second.get();
|
||||
node_creator(current_node->kind, true);
|
||||
|
||||
if (current_node->isNested() != parts[i].is_nested)
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto next_kind = parts[i].is_nested ? Node::NESTED : Node::TUPLE;
|
||||
auto next_node = node_creator(next_kind, false);
|
||||
current_node->addChild(String(parts[i].key), next_node);
|
||||
current_node = next_node.get();
|
||||
}
|
||||
}
|
||||
|
||||
auto it = current_node->children.find(parts.back().key);
|
||||
if (it != current_node->children.end())
|
||||
return false;
|
||||
|
||||
auto next_node = node_creator(Node::SCALAR, false);
|
||||
current_node->addChild(String(parts.back().key), next_node);
|
||||
leaves.push_back(std::move(next_node));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Find node that matches the path the best.
|
||||
const Node * findBestMatch(const PathInData & path) const
|
||||
{
|
||||
return findImpl(path, false);
|
||||
}
|
||||
|
||||
/// Find node that matches the path exactly.
|
||||
const Node * findExact(const PathInData & path) const
|
||||
{
|
||||
return findImpl(path, true);
|
||||
}
|
||||
|
||||
/// Find leaf by path.
|
||||
const Node * findLeaf(const PathInData & path) const
|
||||
{
|
||||
const auto * candidate = findExact(path);
|
||||
if (!candidate || !candidate->isScalar())
|
||||
return nullptr;
|
||||
return candidate;
|
||||
}
|
||||
|
||||
using NodePredicate = std::function<bool(const Node &)>;
|
||||
|
||||
/// Finds leaf that satisfies the predicate.
|
||||
const Node * findLeaf(const NodePredicate & predicate)
|
||||
{
|
||||
return findLeaf(root.get(), predicate);
|
||||
}
|
||||
|
||||
static const Node * findLeaf(const Node * node, const NodePredicate & predicate)
|
||||
{
|
||||
if (!node)
|
||||
return nullptr;
|
||||
|
||||
if (node->isScalar())
|
||||
return predicate(*node) ? node : nullptr;
|
||||
|
||||
for (const auto & [_, child] : node->children)
|
||||
if (const auto * leaf = findLeaf(child.get(), predicate))
|
||||
return leaf;
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
/// Find first parent node that satisfies the predicate.
|
||||
static const Node * findParent(const Node * node, const NodePredicate & predicate)
|
||||
{
|
||||
while (node && !predicate(*node))
|
||||
node = node->parent;
|
||||
return node;
|
||||
}
|
||||
|
||||
bool empty() const { return root == nullptr; }
|
||||
size_t size() const { return leaves.size(); }
|
||||
|
||||
using Nodes = std::vector<NodePtr>;
|
||||
|
||||
const Nodes & getLeaves() const { return leaves; }
|
||||
const Node * getRoot() const { return root.get(); }
|
||||
|
||||
using iterator = typename Nodes::iterator;
|
||||
using const_iterator = typename Nodes::const_iterator;
|
||||
|
||||
iterator begin() { return leaves.begin(); }
|
||||
iterator end() { return leaves.end(); }
|
||||
|
||||
const_iterator begin() const { return leaves.begin(); }
|
||||
const_iterator end() const { return leaves.end(); }
|
||||
|
||||
private:
|
||||
const Node * findImpl(const PathInData & path, bool find_exact) const
|
||||
{
|
||||
if (!root)
|
||||
return nullptr;
|
||||
|
||||
const auto & parts = path.getParts();
|
||||
const Node * current_node = root.get();
|
||||
|
||||
for (const auto & part : parts)
|
||||
{
|
||||
auto it = current_node->children.find(part.key);
|
||||
if (it == current_node->children.end())
|
||||
return find_exact ? nullptr : current_node;
|
||||
|
||||
current_node = it->second.get();
|
||||
}
|
||||
|
||||
return current_node;
|
||||
}
|
||||
|
||||
NodePtr root;
|
||||
Nodes leaves;
|
||||
};
|
||||
|
||||
}
|
0
src/DataTypes/Serializations/tests/CMakeLists.txt
Normal file
0
src/DataTypes/Serializations/tests/CMakeLists.txt
Normal file
216
src/DataTypes/Serializations/tests/gtest_json_parser.cpp
Normal file
216
src/DataTypes/Serializations/tests/gtest_json_parser.cpp
Normal file
@ -0,0 +1,216 @@
|
||||
#include <DataTypes/Serializations/JSONDataParser.h>
|
||||
#include <Common/JSONParsers/SimdJSONParser.h>
|
||||
#include <IO/ReadBufferFromString.h>
|
||||
#include <Common/FieldVisitorToString.h>
|
||||
|
||||
#include <ostream>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#if USE_SIMDJSON
|
||||
|
||||
using namespace DB;
|
||||
|
||||
const String json1 = R"({"k1" : 1, "k2" : {"k3" : "aa", "k4" : 2}})";
|
||||
|
||||
/// Nested(k2 String, k3 Nested(k4 String))
|
||||
const String json2 =
|
||||
R"({"k1" : [
|
||||
{
|
||||
"k2" : "aaa",
|
||||
"k3" : [{ "k4" : "bbb" }, { "k4" : "ccc" }]
|
||||
},
|
||||
{
|
||||
"k2" : "ddd",
|
||||
"k3" : [{ "k4" : "eee" }, { "k4" : "fff" }]
|
||||
}
|
||||
]
|
||||
})";
|
||||
|
||||
TEST(JSONDataParser, ReadJSON)
{
    /// Each document is followed by trailing garbage; readJSON is expected to
    /// produce exactly the document text without the appended tail.
    for (const String * document : {&json1, &json2})
    {
        String json_bad = *document + "aaaaaaa";

        JSONDataParser<SimdJSONParser> parser;
        ReadBufferFromString buf(json_bad);

        String res;
        parser.readJSON(res, buf);

        ASSERT_EQ(*document, res);
    }
}
|
||||
|
||||
struct JSONPathAndValue
|
||||
{
|
||||
PathInData path;
|
||||
Field value;
|
||||
|
||||
JSONPathAndValue(const PathInData & path_, const Field & value_)
|
||||
: path(path_), value(value_)
|
||||
{
|
||||
}
|
||||
|
||||
bool operator==(const JSONPathAndValue & other) const = default;
|
||||
bool operator<(const JSONPathAndValue & other) const { return path.getPath() < other.path.getPath(); }
|
||||
};
|
||||
|
||||
static std::ostream & operator<<(std::ostream & ostr, const JSONPathAndValue & path_and_value)
|
||||
{
|
||||
ostr << "{ PathInData{";
|
||||
bool first = true;
|
||||
for (const auto & part : path_and_value.path.getParts())
|
||||
{
|
||||
ostr << (first ? "{" : ", {") << part.key << ", " << part.is_nested << ", " << part.anonymous_array_level << "}";
|
||||
first = false;
|
||||
}
|
||||
|
||||
ostr << "}, Field{" << applyVisitor(FieldVisitorToString(), path_and_value.value) << "} }";
|
||||
return ostr;
|
||||
}
|
||||
|
||||
using JSONValues = std::vector<JSONPathAndValue>;
|
||||
|
||||
static void check(
|
||||
const String & json_str,
|
||||
const String & tag,
|
||||
JSONValues expected_values)
|
||||
{
|
||||
JSONDataParser<SimdJSONParser> parser;
|
||||
auto res = parser.parse(json_str.data(), json_str.size());
|
||||
ASSERT_TRUE(res.has_value()) << tag;
|
||||
|
||||
const auto & [paths, values] = *res;
|
||||
|
||||
ASSERT_EQ(paths.size(), expected_values.size()) << tag;
|
||||
ASSERT_EQ(values.size(), expected_values.size()) << tag;
|
||||
|
||||
JSONValues result_values;
|
||||
for (size_t i = 0; i < paths.size(); ++i)
|
||||
result_values.emplace_back(paths[i], values[i]);
|
||||
|
||||
std::sort(expected_values.begin(), expected_values.end());
|
||||
std::sort(result_values.begin(), result_values.end());
|
||||
|
||||
ASSERT_EQ(result_values, expected_values) << tag;
|
||||
}
|
||||
|
||||
/// Checks the (path, value) pairs produced by the parser for several document shapes.
/// Each path part is written as {key, is_nested, anonymous_array_level}.
TEST(JSONDataParser, Parse)
{
    check(json1, "json1",
        {
            { PathInData{{{"k1", false, 0}}}, 1 },
            { PathInData{{{"k2", false, 0}, {"k3", false, 0}}}, "aa" },
            { PathInData{{{"k2", false, 0}, {"k4", false, 0}}}, 2 },
        });

    check(json2, "json2",
        {
            { PathInData{{{"k1", true, 0}, {"k2", false, 0}}}, Array{"aaa", "ddd"} },
            { PathInData{{{"k1", true, 0}, {"k3", true, 0}, {"k4", false, 0}}}, Array{Array{"bbb", "ccc"}, Array{"eee", "fff"}} },
        });

    /// Nested(k2 Tuple(k3 Array(Int), k4 Array(Int)), k5 String)
    const String nested_with_tuple =
R"({"k1": [
{
"k2": {
"k3": [1, 2],
"k4": [3, 4]
},
"k5": "foo"
},
{
"k2": {
"k3": [5, 6],
"k4": [7, 8]
},
"k5": "bar"
}
]})";

    check(nested_with_tuple, "json3",
        {
            { PathInData{{{"k1", true, 0}, {"k5", false, 0}}}, Array{"foo", "bar"} },
            { PathInData{{{"k1", true, 0}, {"k2", false, 0}, {"k3", false, 0}}}, Array{Array{1, 2}, Array{5, 6}} },
            { PathInData{{{"k1", true, 0}, {"k2", false, 0}, {"k4", false, 0}}}, Array{Array{3, 4}, Array{7, 8}} },
        });

    /// Nested(k2 Nested(k3 Int, k4 Int), k5 String)
    const String nested_in_nested =
R"({"k1": [
{
"k2": [{"k3": 1, "k4": 3}, {"k3": 2, "k4": 4}],
"k5": "foo"
},
{
"k2": [{"k3": 5, "k4": 7}, {"k3": 6, "k4": 8}],
"k5": "bar"
}
]})";

    check(nested_in_nested, "json4",
        {
            { PathInData{{{"k1", true, 0}, {"k5", false, 0}}}, Array{"foo", "bar"} },
            { PathInData{{{"k1", true, 0}, {"k2", true, 0}, {"k3", false, 0}}}, Array{Array{1, 2}, Array{5, 6}} },
            { PathInData{{{"k1", true, 0}, {"k2", true, 0}, {"k4", false, 0}}}, Array{Array{3, 4}, Array{7, 8}} },
        });

    /// An array of plain arrays stays a single non-nested path.
    const String array_of_arrays = R"({"k1": [[1, 2, 3], [4, 5], [6]]})";
    check(array_of_arrays, "json5",
        {
            { PathInData{{{"k1", false, 0}}}, Array{Array{1, 2, 3}, Array{4, 5}, Array{6}} }
        });

    /// Array(Nested(k2 Int, k3 Int))
    const String array_of_nested = R"({
"k1": [
[{"k2": 1, "k3": 2}, {"k2": 3, "k3": 4}],
[{"k2": 5, "k3": 6}]
]
})";

    check(array_of_nested, "json6",
        {
            { PathInData{{{"k1", true, 0}, {"k2", false, 1}}}, Array{Array{1, 3}, Array{5}} },
            { PathInData{{{"k1", true, 0}, {"k3", false, 1}}}, Array{Array{2, 4}, Array{6}} },
        });

    /// Nested(k2 Array(Int), k3 Array(Int))
    const String nested_with_arrays = R"({
"k1": [
{"k2": [1, 3], "k3": [2, 4]},
{"k2": [5], "k3": [6]}
]
})";

    check(nested_with_arrays, "json7",
        {
            { PathInData{{{"k1", true, 0}, {"k2", false, 0}}}, Array{Array{1, 3}, Array{5}} },
            { PathInData{{{"k1", true, 0}, {"k3", false, 0}}}, Array{Array{2, 4}, Array{6}} },
        });
}
|
||||
|
||||
#endif
|
@ -18,6 +18,8 @@
|
||||
#include <DataTypes/DataTypeEnum.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <DataTypes/DataTypesDecimal.h>
|
||||
#include <DataTypes/DataTypeFactory.h>
|
||||
#include <base/EnumReflection.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -30,6 +32,11 @@ namespace ErrorCodes
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
/// Textual name of a data type object, as returned by its getName().
String typeToString(const DataTypePtr & type)
{
    return type->getName();
}

/// Textual name of a type index, taken from the enumerator name via magic_enum.
String typeToString(const TypeIndex & type)
{
    const auto enumerator_name = magic_enum::enum_name(type);
    return String(enumerator_name);
}
|
||||
|
||||
template <typename DataTypes>
|
||||
String getExceptionMessagePrefix(const DataTypes & types)
|
||||
{
|
||||
WriteBufferFromOwnString res;
|
||||
@ -42,16 +49,164 @@ namespace
|
||||
res << ", ";
|
||||
first = false;
|
||||
|
||||
res << type->getName();
|
||||
res << typeToString(type);
|
||||
}
|
||||
|
||||
return res.str();
|
||||
}
|
||||
|
||||
/// Common supertype for a set of numeric type ids (Enum8/Enum16 are counted as Int8/Int16).
/// Returns an empty pointer when the set contains no numeric type at all.
/// When numeric types are present but no common type exists, either throws NO_COMMON_TYPE
/// or, if 'allow_conversion_to_string' is true, returns String.
DataTypePtr getNumericType(const TypeIndexSet & types, bool allow_conversion_to_string)
{
    auto throw_or_return = [&](std::string_view message, int error_code)
    {
        if (allow_conversion_to_string)
            return std::make_shared<DataTypeString>();

        throw Exception(String(message), error_code);
    };

    bool all_numbers = true;

    /// Widest signed / unsigned integer and widest floating-point mantissa seen so far (0 = none seen).
    size_t max_bits_of_signed_integer = 0;
    size_t max_bits_of_unsigned_integer = 0;
    size_t max_mantissa_bits_of_floating = 0;

    for (const auto & type : types)
    {
        switch (type)
        {
            case TypeIndex::UInt8:
                max_bits_of_unsigned_integer = std::max<size_t>(max_bits_of_unsigned_integer, 8);
                break;
            case TypeIndex::UInt16:
                max_bits_of_unsigned_integer = std::max<size_t>(max_bits_of_unsigned_integer, 16);
                break;
            case TypeIndex::UInt32:
                max_bits_of_unsigned_integer = std::max<size_t>(max_bits_of_unsigned_integer, 32);
                break;
            case TypeIndex::UInt64:
                max_bits_of_unsigned_integer = std::max<size_t>(max_bits_of_unsigned_integer, 64);
                break;
            case TypeIndex::UInt128:
                max_bits_of_unsigned_integer = std::max<size_t>(max_bits_of_unsigned_integer, 128);
                break;
            case TypeIndex::UInt256:
                max_bits_of_unsigned_integer = std::max<size_t>(max_bits_of_unsigned_integer, 256);
                break;
            case TypeIndex::Int8:
            case TypeIndex::Enum8:
                max_bits_of_signed_integer = std::max<size_t>(max_bits_of_signed_integer, 8);
                break;
            case TypeIndex::Int16:
            case TypeIndex::Enum16:
                max_bits_of_signed_integer = std::max<size_t>(max_bits_of_signed_integer, 16);
                break;
            case TypeIndex::Int32:
                max_bits_of_signed_integer = std::max<size_t>(max_bits_of_signed_integer, 32);
                break;
            case TypeIndex::Int64:
                max_bits_of_signed_integer = std::max<size_t>(max_bits_of_signed_integer, 64);
                break;
            case TypeIndex::Int128:
                max_bits_of_signed_integer = std::max<size_t>(max_bits_of_signed_integer, 128);
                break;
            case TypeIndex::Int256:
                max_bits_of_signed_integer = std::max<size_t>(max_bits_of_signed_integer, 256);
                break;
            case TypeIndex::Float32:
                max_mantissa_bits_of_floating = std::max<size_t>(max_mantissa_bits_of_floating, 24);
                break;
            case TypeIndex::Float64:
                max_mantissa_bits_of_floating = std::max<size_t>(max_mantissa_bits_of_floating, 53);
                break;
            default:
                all_numbers = false;
                break;
        }
    }

    /// No numeric type in the set: nothing to decide here.
    if (!max_bits_of_signed_integer && !max_bits_of_unsigned_integer && !max_mantissa_bits_of_floating)
        return {};

    if (!all_numbers)
        return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are numbers and some of them are not", ErrorCodes::NO_COMMON_TYPE);

    /// If there are signed and unsigned types of same bit-width, the result must be signed number with at least one more bit.
    /// Example, common of Int32, UInt32 = Int64.

    size_t min_bit_width_of_integer = std::max(max_bits_of_signed_integer, max_bits_of_unsigned_integer);

    /// If unsigned is not covered by signed.
    if (max_bits_of_signed_integer && max_bits_of_unsigned_integer >= max_bits_of_signed_integer) //-V1051
    {
        // Because 128 and 256 bit integers are significantly slower, we should not promote to them.
        // But if we already have wide numbers, promotion is necessary.
        if (min_bit_width_of_integer == 64)
            return throw_or_return(
                getExceptionMessagePrefix(types)
                    + " because some of them are signed integers and some are unsigned integers,"
                    " but there is no signed integer type, that can exactly represent all required unsigned integer values",
                ErrorCodes::NO_COMMON_TYPE);

        ++min_bit_width_of_integer;
    }

    /// If the result must be floating.
    if (max_mantissa_bits_of_floating)
    {
        const size_t min_mantissa_bits = std::max(min_bit_width_of_integer, max_mantissa_bits_of_floating);

        if (min_mantissa_bits <= 24)
            return std::make_shared<DataTypeFloat32>();
        if (min_mantissa_bits <= 53)
            return std::make_shared<DataTypeFloat64>();

        return throw_or_return(getExceptionMessagePrefix(types)
            + " because some of them are integers and some are floating point,"
            " but there is no floating point type, that can exactly represent all required integers", ErrorCodes::NO_COMMON_TYPE);
    }

    /// If the result must be signed integer.
    if (max_bits_of_signed_integer)
    {
        if (min_bit_width_of_integer <= 8)
            return std::make_shared<DataTypeInt8>();
        if (min_bit_width_of_integer <= 16)
            return std::make_shared<DataTypeInt16>();
        if (min_bit_width_of_integer <= 32)
            return std::make_shared<DataTypeInt32>();
        if (min_bit_width_of_integer <= 64)
            return std::make_shared<DataTypeInt64>();
        if (min_bit_width_of_integer <= 128)
            return std::make_shared<DataTypeInt128>();
        if (min_bit_width_of_integer <= 256)
            return std::make_shared<DataTypeInt256>();

        return throw_or_return(getExceptionMessagePrefix(types)
            + " because some of them are signed integers and some are unsigned integers,"
            " but there is no signed integer type, that can exactly represent all required unsigned integer values", ErrorCodes::NO_COMMON_TYPE);
    }

    /// All unsigned.
    if (min_bit_width_of_integer <= 8)
        return std::make_shared<DataTypeUInt8>();
    if (min_bit_width_of_integer <= 16)
        return std::make_shared<DataTypeUInt16>();
    if (min_bit_width_of_integer <= 32)
        return std::make_shared<DataTypeUInt32>();
    if (min_bit_width_of_integer <= 64)
        return std::make_shared<DataTypeUInt64>();
    if (min_bit_width_of_integer <= 128)
        return std::make_shared<DataTypeUInt128>();
    if (min_bit_width_of_integer <= 256)
        return std::make_shared<DataTypeUInt256>();

    return throw_or_return("Logical error: " + getExceptionMessagePrefix(types)
        + " but as all data types are unsigned integers, we must have found maximum unsigned integer type", ErrorCodes::NO_COMMON_TYPE);
}
|
||||
|
||||
}
|
||||
|
||||
DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_conversion_to_string)
|
||||
{
|
||||
auto throw_or_return = [&](std::string_view message, int error_code)
|
||||
{
|
||||
if (allow_conversion_to_string)
|
||||
return std::make_shared<DataTypeString>();
|
||||
|
||||
throw Exception(String(message), error_code);
|
||||
};
|
||||
|
||||
/// Trivial cases
|
||||
|
||||
if (types.empty())
|
||||
@ -88,7 +243,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
|
||||
non_nothing_types.emplace_back(type);
|
||||
|
||||
if (non_nothing_types.size() < types.size())
|
||||
return getLeastSupertype(non_nothing_types);
|
||||
return getLeastSupertype(non_nothing_types, allow_conversion_to_string);
|
||||
}
|
||||
|
||||
/// For Arrays
|
||||
@ -113,9 +268,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
|
||||
if (have_array)
|
||||
{
|
||||
if (!all_arrays)
|
||||
throw Exception(getExceptionMessagePrefix(types) + " because some of them are Array and some of them are not", ErrorCodes::NO_COMMON_TYPE);
|
||||
return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are Array and some of them are not", ErrorCodes::NO_COMMON_TYPE);
|
||||
|
||||
return std::make_shared<DataTypeArray>(getLeastSupertype(nested_types));
|
||||
return std::make_shared<DataTypeArray>(getLeastSupertype(nested_types, allow_conversion_to_string));
|
||||
}
|
||||
}
|
||||
|
||||
@ -139,7 +294,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
|
||||
nested_types[elem_idx].reserve(types.size());
|
||||
}
|
||||
else if (tuple_size != type_tuple->getElements().size())
|
||||
throw Exception(getExceptionMessagePrefix(types) + " because Tuples have different sizes", ErrorCodes::NO_COMMON_TYPE);
|
||||
return throw_or_return(getExceptionMessagePrefix(types) + " because Tuples have different sizes", ErrorCodes::NO_COMMON_TYPE);
|
||||
|
||||
have_tuple = true;
|
||||
|
||||
@ -153,11 +308,11 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
|
||||
if (have_tuple)
|
||||
{
|
||||
if (!all_tuples)
|
||||
throw Exception(getExceptionMessagePrefix(types) + " because some of them are Tuple and some of them are not", ErrorCodes::NO_COMMON_TYPE);
|
||||
return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are Tuple and some of them are not", ErrorCodes::NO_COMMON_TYPE);
|
||||
|
||||
DataTypes common_tuple_types(tuple_size);
|
||||
for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx)
|
||||
common_tuple_types[elem_idx] = getLeastSupertype(nested_types[elem_idx]);
|
||||
common_tuple_types[elem_idx] = getLeastSupertype(nested_types[elem_idx], allow_conversion_to_string);
|
||||
|
||||
return std::make_shared<DataTypeTuple>(common_tuple_types);
|
||||
}
|
||||
@ -187,9 +342,11 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
|
||||
if (have_maps)
|
||||
{
|
||||
if (!all_maps)
|
||||
throw Exception(getExceptionMessagePrefix(types) + " because some of them are Maps and some of them are not", ErrorCodes::NO_COMMON_TYPE);
|
||||
return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are Maps and some of them are not", ErrorCodes::NO_COMMON_TYPE);
|
||||
|
||||
return std::make_shared<DataTypeMap>(getLeastSupertype(key_types), getLeastSupertype(value_types));
|
||||
return std::make_shared<DataTypeMap>(
|
||||
getLeastSupertype(key_types, allow_conversion_to_string),
|
||||
getLeastSupertype(value_types, allow_conversion_to_string));
|
||||
}
|
||||
}
|
||||
|
||||
@ -220,9 +377,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
|
||||
if (have_low_cardinality)
|
||||
{
|
||||
if (have_not_low_cardinality)
|
||||
return getLeastSupertype(nested_types);
|
||||
return getLeastSupertype(nested_types, allow_conversion_to_string);
|
||||
else
|
||||
return std::make_shared<DataTypeLowCardinality>(getLeastSupertype(nested_types));
|
||||
return std::make_shared<DataTypeLowCardinality>(getLeastSupertype(nested_types, allow_conversion_to_string));
|
||||
}
|
||||
}
|
||||
|
||||
@ -248,13 +405,13 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
|
||||
|
||||
if (have_nullable)
|
||||
{
|
||||
return std::make_shared<DataTypeNullable>(getLeastSupertype(nested_types));
|
||||
return std::make_shared<DataTypeNullable>(getLeastSupertype(nested_types, allow_conversion_to_string));
|
||||
}
|
||||
}
|
||||
|
||||
/// Non-recursive rules
|
||||
|
||||
std::unordered_set<TypeIndex> type_ids;
|
||||
TypeIndexSet type_ids;
|
||||
for (const auto & type : types)
|
||||
type_ids.insert(type->getTypeId());
|
||||
|
||||
@ -268,7 +425,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
|
||||
{
|
||||
bool all_strings = type_ids.size() == (have_string + have_fixed_string);
|
||||
if (!all_strings)
|
||||
throw Exception(getExceptionMessagePrefix(types) + " because some of them are String/FixedString and some of them are not", ErrorCodes::NO_COMMON_TYPE);
|
||||
return throw_or_return(getExceptionMessagePrefix(types) + " because some of them are String/FixedString and some of them are not", ErrorCodes::NO_COMMON_TYPE);
|
||||
|
||||
return std::make_shared<DataTypeString>();
|
||||
}
|
||||
@ -285,7 +442,8 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
|
||||
{
|
||||
bool all_date_or_datetime = type_ids.size() == (have_date + have_date32 + have_datetime + have_datetime64);
|
||||
if (!all_date_or_datetime)
|
||||
throw Exception(getExceptionMessagePrefix(types) + " because some of them are Date/Date32/DateTime/DateTime64 and some of them are not",
|
||||
return throw_or_return(getExceptionMessagePrefix(types)
|
||||
+ " because some of them are Date/Date32/DateTime/DateTime64 and some of them are not",
|
||||
ErrorCodes::NO_COMMON_TYPE);
|
||||
|
||||
if (have_datetime64 == 0 && have_date32 == 0)
|
||||
@ -362,7 +520,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
|
||||
}
|
||||
|
||||
if (num_supported != type_ids.size())
|
||||
throw Exception(getExceptionMessagePrefix(types) + " because some of them have no lossless conversion to Decimal",
|
||||
return throw_or_return(getExceptionMessagePrefix(types) + " because some of them have no lossless conversion to Decimal",
|
||||
ErrorCodes::NO_COMMON_TYPE);
|
||||
|
||||
UInt32 max_scale = 0;
|
||||
@ -385,7 +543,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
|
||||
}
|
||||
|
||||
if (min_precision > DataTypeDecimal<Decimal128>::maxPrecision())
|
||||
throw Exception(getExceptionMessagePrefix(types) + " because the least supertype is Decimal("
|
||||
return throw_or_return(getExceptionMessagePrefix(types) + " because the least supertype is Decimal("
|
||||
+ toString(min_precision) + ',' + toString(max_scale) + ')',
|
||||
ErrorCodes::NO_COMMON_TYPE);
|
||||
|
||||
@ -399,135 +557,56 @@ DataTypePtr getLeastSupertype(const DataTypes & types)
|
||||
|
||||
/// For numeric types, the most complicated part.
|
||||
{
|
||||
bool all_numbers = true;
|
||||
|
||||
size_t max_bits_of_signed_integer = 0;
|
||||
size_t max_bits_of_unsigned_integer = 0;
|
||||
size_t max_mantissa_bits_of_floating = 0;
|
||||
|
||||
auto maximize = [](size_t & what, size_t value)
|
||||
{
|
||||
if (value > what)
|
||||
what = value;
|
||||
};
|
||||
|
||||
for (const auto & type : types)
|
||||
{
|
||||
if (typeid_cast<const DataTypeUInt8 *>(type.get()))
|
||||
maximize(max_bits_of_unsigned_integer, 8);
|
||||
else if (typeid_cast<const DataTypeUInt16 *>(type.get()))
|
||||
maximize(max_bits_of_unsigned_integer, 16);
|
||||
else if (typeid_cast<const DataTypeUInt32 *>(type.get()))
|
||||
maximize(max_bits_of_unsigned_integer, 32);
|
||||
else if (typeid_cast<const DataTypeUInt64 *>(type.get()))
|
||||
maximize(max_bits_of_unsigned_integer, 64);
|
||||
else if (typeid_cast<const DataTypeUInt128 *>(type.get()))
|
||||
maximize(max_bits_of_unsigned_integer, 128);
|
||||
else if (typeid_cast<const DataTypeUInt256 *>(type.get()))
|
||||
maximize(max_bits_of_unsigned_integer, 256);
|
||||
else if (typeid_cast<const DataTypeInt8 *>(type.get()) || typeid_cast<const DataTypeEnum8 *>(type.get()))
|
||||
maximize(max_bits_of_signed_integer, 8);
|
||||
else if (typeid_cast<const DataTypeInt16 *>(type.get()) || typeid_cast<const DataTypeEnum16 *>(type.get()))
|
||||
maximize(max_bits_of_signed_integer, 16);
|
||||
else if (typeid_cast<const DataTypeInt32 *>(type.get()))
|
||||
maximize(max_bits_of_signed_integer, 32);
|
||||
else if (typeid_cast<const DataTypeInt64 *>(type.get()))
|
||||
maximize(max_bits_of_signed_integer, 64);
|
||||
else if (typeid_cast<const DataTypeInt128 *>(type.get()))
|
||||
maximize(max_bits_of_signed_integer, 128);
|
||||
else if (typeid_cast<const DataTypeInt256 *>(type.get()))
|
||||
maximize(max_bits_of_signed_integer, 256);
|
||||
else if (typeid_cast<const DataTypeFloat32 *>(type.get()))
|
||||
maximize(max_mantissa_bits_of_floating, 24);
|
||||
else if (typeid_cast<const DataTypeFloat64 *>(type.get()))
|
||||
maximize(max_mantissa_bits_of_floating, 53);
|
||||
else
|
||||
all_numbers = false;
|
||||
}
|
||||
|
||||
if (max_bits_of_signed_integer || max_bits_of_unsigned_integer || max_mantissa_bits_of_floating)
|
||||
{
|
||||
if (!all_numbers)
|
||||
throw Exception(getExceptionMessagePrefix(types) + " because some of them are numbers and some of them are not", ErrorCodes::NO_COMMON_TYPE);
|
||||
|
||||
/// If there are signed and unsigned types of same bit-width, the result must be signed number with at least one more bit.
|
||||
/// Example, common of Int32, UInt32 = Int64.
|
||||
|
||||
size_t min_bit_width_of_integer = std::max(max_bits_of_signed_integer, max_bits_of_unsigned_integer);
|
||||
|
||||
/// If unsigned is not covered by signed.
|
||||
if (max_bits_of_signed_integer && max_bits_of_unsigned_integer >= max_bits_of_signed_integer) //-V1051
|
||||
{
|
||||
// Because 128 and 256 bit integers are significantly slower, we should not promote to them.
|
||||
// But if we already have wide numbers, promotion is necessary.
|
||||
if (min_bit_width_of_integer != 64)
|
||||
++min_bit_width_of_integer;
|
||||
else
|
||||
throw Exception(
|
||||
getExceptionMessagePrefix(types)
|
||||
+ " because some of them are signed integers and some are unsigned integers,"
|
||||
" but there is no signed integer type, that can exactly represent all required unsigned integer values",
|
||||
ErrorCodes::NO_COMMON_TYPE);
|
||||
}
|
||||
|
||||
/// If the result must be floating.
|
||||
if (max_mantissa_bits_of_floating)
|
||||
{
|
||||
size_t min_mantissa_bits = std::max(min_bit_width_of_integer, max_mantissa_bits_of_floating);
|
||||
if (min_mantissa_bits <= 24)
|
||||
return std::make_shared<DataTypeFloat32>();
|
||||
else if (min_mantissa_bits <= 53)
|
||||
return std::make_shared<DataTypeFloat64>();
|
||||
else
|
||||
throw Exception(getExceptionMessagePrefix(types)
|
||||
+ " because some of them are integers and some are floating point,"
|
||||
" but there is no floating point type, that can exactly represent all required integers", ErrorCodes::NO_COMMON_TYPE);
|
||||
}
|
||||
|
||||
/// If the result must be signed integer.
|
||||
if (max_bits_of_signed_integer)
|
||||
{
|
||||
if (min_bit_width_of_integer <= 8)
|
||||
return std::make_shared<DataTypeInt8>();
|
||||
else if (min_bit_width_of_integer <= 16)
|
||||
return std::make_shared<DataTypeInt16>();
|
||||
else if (min_bit_width_of_integer <= 32)
|
||||
return std::make_shared<DataTypeInt32>();
|
||||
else if (min_bit_width_of_integer <= 64)
|
||||
return std::make_shared<DataTypeInt64>();
|
||||
else if (min_bit_width_of_integer <= 128)
|
||||
return std::make_shared<DataTypeInt128>();
|
||||
else if (min_bit_width_of_integer <= 256)
|
||||
return std::make_shared<DataTypeInt256>();
|
||||
else
|
||||
throw Exception(getExceptionMessagePrefix(types)
|
||||
+ " because some of them are signed integers and some are unsigned integers,"
|
||||
" but there is no signed integer type, that can exactly represent all required unsigned integer values", ErrorCodes::NO_COMMON_TYPE);
|
||||
}
|
||||
|
||||
/// All unsigned.
|
||||
{
|
||||
if (min_bit_width_of_integer <= 8)
|
||||
return std::make_shared<DataTypeUInt8>();
|
||||
else if (min_bit_width_of_integer <= 16)
|
||||
return std::make_shared<DataTypeUInt16>();
|
||||
else if (min_bit_width_of_integer <= 32)
|
||||
return std::make_shared<DataTypeUInt32>();
|
||||
else if (min_bit_width_of_integer <= 64)
|
||||
return std::make_shared<DataTypeUInt64>();
|
||||
else if (min_bit_width_of_integer <= 128)
|
||||
return std::make_shared<DataTypeUInt128>();
|
||||
else if (min_bit_width_of_integer <= 256)
|
||||
return std::make_shared<DataTypeUInt256>();
|
||||
else
|
||||
throw Exception("Logical error: " + getExceptionMessagePrefix(types)
|
||||
+ " but as all data types are unsigned integers, we must have found maximum unsigned integer type", ErrorCodes::NO_COMMON_TYPE);
|
||||
}
|
||||
}
|
||||
auto numeric_type = getNumericType(type_ids, allow_conversion_to_string);
|
||||
if (numeric_type)
|
||||
return numeric_type;
|
||||
}
|
||||
|
||||
/// All other data types (UUID, AggregateFunction, Enum...) are compatible only if they are the same (checked in trivial cases).
|
||||
throw Exception(getExceptionMessagePrefix(types), ErrorCodes::NO_COMMON_TYPE);
|
||||
return throw_or_return(getExceptionMessagePrefix(types), ErrorCodes::NO_COMMON_TYPE);
|
||||
}
|
||||
|
||||
/** Least common supertype for a set of type ids (no full data type objects are available here).
  *
  * Nothing is skipped. Any other type id for which WhichDataType::isSimple() is false is rejected
  * with NO_COMMON_TYPE regardless of 'allow_conversion_to_string'.
  * Returns Nothing if no type remains after skipping, String if String is the only remaining type,
  * otherwise whatever getNumericType() resolves for the remaining types.
  * If no common type is found, throws NO_COMMON_TYPE, or returns String when
  * 'allow_conversion_to_string' is true.
  */
DataTypePtr getLeastSupertype(const TypeIndexSet & types, bool allow_conversion_to_string)
{
    auto throw_or_return = [&](std::string_view message, int error_code)
    {
        if (allow_conversion_to_string)
            return std::make_shared<DataTypeString>();

        throw Exception(String(message), error_code);
    };

    TypeIndexSet types_set;
    for (const auto & type : types)
    {
        if (WhichDataType(type).isNothing())
            continue;

        if (!WhichDataType(type).isSimple())
            throw Exception(ErrorCodes::NO_COMMON_TYPE,
                "Cannot get common type by type ids with parametric type {}", typeToString(type));

        types_set.insert(type);
    }

    if (types_set.empty())
        return std::make_shared<DataTypeNothing>();

    /// From here on only the filtered set is consulted. Using the unfiltered input would make
    /// a skipped Nothing count as "not String" / "not a number" and wrongly report no common type.
    if (types_set.count(TypeIndex::String))
    {
        if (types_set.size() != 1)
            return throw_or_return(getExceptionMessagePrefix(types_set) + " because some of them are String and some of them are not", ErrorCodes::NO_COMMON_TYPE);

        return std::make_shared<DataTypeString>();
    }

    /// For numeric types, the most complicated part.
    auto numeric_type = getNumericType(types_set, allow_conversion_to_string);
    if (numeric_type)
        return numeric_type;

    /// Nothing else is resolved from type ids alone.
    return throw_or_return(getExceptionMessagePrefix(types_set), ErrorCodes::NO_COMMON_TYPE);
}
|
||||
|
||||
DataTypePtr tryGetLeastSupertype(const DataTypes & types)
|
||||
|
@ -7,12 +7,16 @@ namespace DB
|
||||
{
|
||||
|
||||
/** Get data type that covers all possible values of passed data types.
|
||||
* If there is no such data type, throws an exception.
|
||||
* If there is no such data type, throws an exception
|
||||
* or if 'allow_conversion_to_string' is true returns String as common type.
|
||||
*
|
||||
* Examples: least common supertype for UInt8, Int8 - Int16.
|
||||
* Examples: there is no least common supertype for Array(UInt8), Int8.
|
||||
*/
|
||||
DataTypePtr getLeastSupertype(const DataTypes & types);
|
||||
DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_conversion_to_string = false);
|
||||
|
||||
using TypeIndexSet = std::unordered_set<TypeIndex>;
|
||||
DataTypePtr getLeastSupertype(const TypeIndexSet & types, bool allow_conversion_to_string = false);
|
||||
|
||||
/// Same as above but return nullptr instead of throwing exception.
|
||||
DataTypePtr tryGetLeastSupertype(const DataTypes & types);
|
||||
|
@ -406,13 +406,24 @@ ASTPtr DatabasePostgreSQL::getCreateTableQueryImpl(const String & table_name, Co
|
||||
ASTs storage_children = ast_storage->children;
|
||||
auto storage_engine_arguments = ast_storage->engine->arguments;
|
||||
|
||||
if (storage_engine_arguments->children.empty())
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected number of arguments: {}", storage_engine_arguments->children.size());
|
||||
|
||||
/// Check for named collection.
|
||||
if (typeid_cast<ASTIdentifier *>(storage_engine_arguments->children[0].get()))
|
||||
{
|
||||
storage_engine_arguments->children.push_back(makeASTFunction("equals", std::make_shared<ASTIdentifier>("table"), std::make_shared<ASTLiteral>(table_id.table_name)));
|
||||
}
|
||||
else
|
||||
{
|
||||
/// Remove extra engine argument (`schema` and `use_table_cache`)
|
||||
if (storage_engine_arguments->children.size() >= 5)
|
||||
storage_engine_arguments->children.resize(4);
|
||||
|
||||
/// Add table_name to engine arguments
|
||||
assert(storage_engine_arguments->children.size() >= 2);
|
||||
/// Add table_name to engine arguments.
|
||||
if (storage_engine_arguments->children.size() >= 2)
|
||||
storage_engine_arguments->children.insert(storage_engine_arguments->children.begin() + 2, std::make_shared<ASTLiteral>(table_id.table_name));
|
||||
}
|
||||
|
||||
return create_table_query;
|
||||
}
|
||||
|
@ -9,9 +9,9 @@
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <DataTypes/DataTypeTuple.h>
|
||||
#include <DataTypes/DataTypeMap.h>
|
||||
#include <Functions/SimdJSONParser.h>
|
||||
#include <Functions/RapidJSONParser.h>
|
||||
#include <Functions/DummyJSONParser.h>
|
||||
#include <Common/JSONParsers/SimdJSONParser.h>
|
||||
#include <Common/JSONParsers/RapidJSONParser.h>
|
||||
#include <Common/JSONParsers/DummyJSONParser.h>
|
||||
|
||||
#include <base/find_symbols.h>
|
||||
|
||||
@ -169,6 +169,10 @@ DataTypePtr getDataTypeFromJSONFieldImpl(const Element & field)
|
||||
|
||||
value_type = type;
|
||||
}
|
||||
|
||||
if (!value_type)
|
||||
return nullptr;
|
||||
|
||||
return std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), value_type);
|
||||
}
|
||||
|
||||
|
@ -74,6 +74,7 @@ void registerOutputFormatCapnProto(FormatFactory & factory);
|
||||
|
||||
void registerInputFormatRegexp(FormatFactory & factory);
|
||||
void registerInputFormatJSONAsString(FormatFactory & factory);
|
||||
void registerInputFormatJSONAsObject(FormatFactory & factory);
|
||||
void registerInputFormatLineAsString(FormatFactory & factory);
|
||||
void registerInputFormatCapnProto(FormatFactory & factory);
|
||||
|
||||
@ -84,6 +85,7 @@ void registerInputFormatHiveText(FormatFactory & factory);
|
||||
/// Non trivial prefix and suffix checkers for disabling parallel parsing.
|
||||
void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory);
|
||||
void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factory);
|
||||
void registerNonTrivialPrefixAndSuffixCheckerJSONAsObject(FormatFactory & factory);
|
||||
|
||||
void registerArrowSchemaReader(FormatFactory & factory);
|
||||
void registerParquetSchemaReader(FormatFactory & factory);
|
||||
@ -175,6 +177,7 @@ void registerFormats()
|
||||
registerInputFormatRegexp(factory);
|
||||
registerInputFormatJSONAsString(factory);
|
||||
registerInputFormatLineAsString(factory);
|
||||
registerInputFormatJSONAsObject(factory);
|
||||
#if USE_HIVE
|
||||
registerInputFormatHiveText(factory);
|
||||
#endif
|
||||
@ -183,6 +186,7 @@ void registerFormats()
|
||||
|
||||
registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(factory);
|
||||
registerNonTrivialPrefixAndSuffixCheckerJSONAsString(factory);
|
||||
registerNonTrivialPrefixAndSuffixCheckerJSONAsObject(factory);
|
||||
|
||||
registerArrowSchemaReader(factory);
|
||||
registerParquetSchemaReader(factory);
|
||||
|
@ -33,22 +33,27 @@ public:
|
||||
|
||||
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
|
||||
|
||||
explicit CastOverloadResolverImpl(std::optional<Diagnostic> diagnostic_, bool keep_nullable_)
|
||||
: diagnostic(std::move(diagnostic_)), keep_nullable(keep_nullable_)
|
||||
explicit CastOverloadResolverImpl(std::optional<Diagnostic> diagnostic_, bool keep_nullable_, bool cast_ipv4_ipv6_default_on_conversion_error_)
|
||||
: diagnostic(std::move(diagnostic_))
|
||||
, keep_nullable(keep_nullable_)
|
||||
, cast_ipv4_ipv6_default_on_conversion_error(cast_ipv4_ipv6_default_on_conversion_error_)
|
||||
{
|
||||
}
|
||||
|
||||
static FunctionOverloadResolverPtr create(ContextPtr context)
|
||||
{
|
||||
const auto & settings_ref = context->getSettingsRef();
|
||||
|
||||
if constexpr (internal)
|
||||
return createImpl();
|
||||
return createImpl({}, context->getSettingsRef().cast_keep_nullable);
|
||||
return createImpl({}, false /*keep_nullable*/, false /*cast_ipv4_ipv6_default_on_conversion_error*/);
|
||||
|
||||
return createImpl({}, settings_ref.cast_keep_nullable, settings_ref.cast_ipv4_ipv6_default_on_conversion_error);
|
||||
}
|
||||
|
||||
static FunctionOverloadResolverPtr createImpl(std::optional<Diagnostic> diagnostic = {}, bool keep_nullable = false)
|
||||
static FunctionOverloadResolverPtr createImpl(std::optional<Diagnostic> diagnostic = {}, bool keep_nullable = false, bool cast_ipv4_ipv6_default_on_conversion_error = false)
|
||||
{
|
||||
assert(!internal || !keep_nullable);
|
||||
return std::make_unique<CastOverloadResolverImpl>(std::move(diagnostic), keep_nullable);
|
||||
return std::make_unique<CastOverloadResolverImpl>(std::move(diagnostic), keep_nullable, cast_ipv4_ipv6_default_on_conversion_error);
|
||||
}
|
||||
|
||||
protected:
|
||||
@ -61,7 +66,7 @@ protected:
|
||||
data_types[i] = arguments[i].type;
|
||||
|
||||
auto monotonicity = MonotonicityHelper::getMonotonicityInformation(arguments.front().type, return_type.get());
|
||||
return std::make_unique<FunctionCast<FunctionName>>(name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type);
|
||||
return std::make_unique<FunctionCast<FunctionName>>(name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type, cast_ipv4_ipv6_default_on_conversion_error);
|
||||
}
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
|
||||
@ -98,6 +103,7 @@ protected:
|
||||
private:
|
||||
std::optional<Diagnostic> diagnostic;
|
||||
bool keep_nullable;
|
||||
bool cast_ipv4_ipv6_default_on_conversion_error;
|
||||
};
|
||||
|
||||
|
||||
@ -115,7 +121,10 @@ struct CastInternalOverloadName
|
||||
static constexpr auto accurate_cast_or_null_name = "accurate_CastOrNull";
|
||||
};
|
||||
|
||||
template <CastType cast_type> using CastOverloadResolver = CastOverloadResolverImpl<cast_type, false, CastOverloadName, CastName>;
|
||||
template <CastType cast_type> using CastInternalOverloadResolver = CastOverloadResolverImpl<cast_type, true, CastInternalOverloadName, CastInternalName>;
|
||||
template <CastType cast_type>
|
||||
using CastOverloadResolver = CastOverloadResolverImpl<cast_type, false, CastOverloadName, CastName>;
|
||||
|
||||
template <CastType cast_type>
|
||||
using CastInternalOverloadResolver = CastOverloadResolverImpl<cast_type, true, CastInternalOverloadName, CastInternalName>;
|
||||
|
||||
}
|
||||
|
@ -8,13 +8,13 @@
|
||||
#include <Core/Settings.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <Functions/DummyJSONParser.h>
|
||||
#include <Common/JSONParsers/DummyJSONParser.h>
|
||||
#include <Functions/IFunction.h>
|
||||
#include <Functions/JSONPath/ASTs/ASTJSONPath.h>
|
||||
#include <Functions/JSONPath/Generator/GeneratorJSONPath.h>
|
||||
#include <Functions/JSONPath/Parsers/ParserJSONPath.h>
|
||||
#include <Functions/RapidJSONParser.h>
|
||||
#include <Functions/SimdJSONParser.h>
|
||||
#include <Common/JSONParsers/RapidJSONParser.h>
|
||||
#include <Common/JSONParsers/SimdJSONParser.h>
|
||||
#include <Interpreters/Context.h>
|
||||
#include <Parsers/IParser.h>
|
||||
#include <Parsers/Lexer.h>
|
||||
|
@ -8,26 +8,21 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/* Determine language and charset of text data. For each text, we build the distribution of byte bigrams.
|
||||
* Then we use marked-up dictionaries with distributions of bigram bytes of various languages and charsets.
|
||||
* Using a naive Bayesian classifier, find the most likely charset and language and return it
|
||||
*/
|
||||
|
||||
template <bool detect_language>
|
||||
struct CharsetClassificationImpl
|
||||
namespace
|
||||
{
|
||||
/* We need to solve the zero-frequency problem for the naive Bayes classifier.
|
||||
* If the bigram is not found in the text, we assume that the probability of its occurrence is 1e-06.
|
||||
* 1e-06 is the minimal value in our marked-up dictionary.
|
||||
*/
|
||||
static constexpr Float64 zero_frequency = 1e-06;
|
||||
constexpr Float64 zero_frequency = 1e-06;
|
||||
|
||||
/// If the data size is bigger than this, behaviour is unspecified for this function.
|
||||
static constexpr size_t max_string_size = 1u << 15;
|
||||
constexpr size_t max_string_size = 1UL << 15;
|
||||
|
||||
static ALWAYS_INLINE inline Float64 naiveBayes(
|
||||
template <typename ModelMap>
|
||||
ALWAYS_INLINE inline Float64 naiveBayes(
|
||||
const FrequencyHolder::EncodingMap & standard,
|
||||
const HashMap<UInt16, UInt64> & model,
|
||||
const ModelMap & model,
|
||||
Float64 max_result)
|
||||
{
|
||||
Float64 res = 0;
|
||||
@ -52,10 +47,11 @@ struct CharsetClassificationImpl
|
||||
}
|
||||
|
||||
/// Count how many times each bigram occurs in the text.
|
||||
static ALWAYS_INLINE inline void calculateStats(
|
||||
template <typename ModelMap>
|
||||
ALWAYS_INLINE inline void calculateStats(
|
||||
const UInt8 * data,
|
||||
const size_t size,
|
||||
HashMap<UInt16, UInt64> & model)
|
||||
ModelMap & model)
|
||||
{
|
||||
UInt16 hash = 0;
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
@ -65,7 +61,15 @@ struct CharsetClassificationImpl
|
||||
++model[hash];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Determine language and charset of text data. For each text, we build the distribution of byte bigrams.
|
||||
* Then we use marked-up dictionaries with distributions of bigram bytes of various languages and charsets.
|
||||
* Using a naive Bayesian classifier, find the most likely charset and language and return it
|
||||
*/
|
||||
template <bool detect_language>
|
||||
struct CharsetClassificationImpl
|
||||
{
|
||||
static void vector(
|
||||
const ColumnString::Chars & data,
|
||||
const ColumnString::Offsets & offsets,
|
||||
@ -74,7 +78,7 @@ struct CharsetClassificationImpl
|
||||
{
|
||||
const auto & encodings_freq = FrequencyHolder::getInstance().getEncodingsFrequency();
|
||||
|
||||
if (detect_language)
|
||||
if constexpr (detect_language)
|
||||
/// 2 chars for ISO code + 1 zero byte
|
||||
res_data.reserve(offsets.size() * 3);
|
||||
else
|
||||
@ -83,37 +87,43 @@ struct CharsetClassificationImpl
|
||||
|
||||
res_offsets.resize(offsets.size());
|
||||
|
||||
size_t res_offset = 0;
|
||||
size_t current_result_offset = 0;
|
||||
|
||||
double zero_frequency_log = log(zero_frequency);
|
||||
|
||||
for (size_t i = 0; i < offsets.size(); ++i)
|
||||
{
|
||||
const UInt8 * str = data.data() + offsets[i - 1];
|
||||
const size_t str_len = offsets[i] - offsets[i - 1] - 1;
|
||||
|
||||
std::string_view res;
|
||||
|
||||
HashMap<UInt16, UInt64> model;
|
||||
HashMapWithStackMemory<UInt16, UInt64, DefaultHash<UInt16>, 4> model;
|
||||
calculateStats(str, str_len, model);
|
||||
|
||||
std::string_view result_value;
|
||||
|
||||
/// Go through the dictionary and find the charset with the highest weight
|
||||
Float64 max_result = log(zero_frequency) * (max_string_size);
|
||||
Float64 max_result = zero_frequency_log * (max_string_size);
|
||||
for (const auto & item : encodings_freq)
|
||||
{
|
||||
Float64 score = naiveBayes(item.map, model, max_result);
|
||||
if (max_result < score)
|
||||
{
|
||||
max_result = score;
|
||||
res = detect_language ? item.lang : item.name;
|
||||
|
||||
if constexpr (detect_language)
|
||||
result_value = item.lang;
|
||||
else
|
||||
result_value = item.name;
|
||||
}
|
||||
}
|
||||
|
||||
res_data.resize(res_offset + res.size() + 1);
|
||||
memcpy(&res_data[res_offset], res.data(), res.size());
|
||||
size_t result_value_size = result_value.size();
|
||||
res_data.resize(current_result_offset + result_value_size + 1);
|
||||
memcpy(&res_data[current_result_offset], result_value.data(), result_value_size);
|
||||
res_data[current_result_offset + result_value_size] = '\0';
|
||||
current_result_offset += result_value_size + 1;
|
||||
|
||||
res_data[res_offset + res.size()] = 0;
|
||||
res_offset += res.size() + 1;
|
||||
|
||||
res_offsets[i] = res_offset;
|
||||
res_offsets[i] = current_result_offset;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user