mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-19 16:20:50 +00:00
Merge branch 'ClickHouse:master' into master
This commit is contained in:
commit
55eb2953c7
@ -54,6 +54,7 @@ Other upcoming meetups
|
||||
* [Sydney Meetup](https://www.meetup.com/clickhouse-australia-user-group/events/302862966/) - September 5
|
||||
* [Zurich Meetup](https://www.meetup.com/clickhouse-switzerland-meetup-group/events/302267429/) - September 5
|
||||
* [Toronto Meetup (Shopify)](https://www.meetup.com/clickhouse-toronto-user-group/events/301490855/) - September 10
|
||||
* [Austin Meetup](https://www.meetup.com/clickhouse-austin-user-group/events/302558689/) - September 17
|
||||
* [London Meetup](https://www.meetup.com/clickhouse-london-user-group/events/302977267) - September 17
|
||||
|
||||
## Recent Recordings
|
||||
|
@ -311,6 +311,14 @@ int SecureSocketImpl::sendBytes(const void* buffer, int length, int flags)
|
||||
while (mustRetry(rc, remaining_time));
|
||||
if (rc <= 0)
|
||||
{
|
||||
// At this stage we still can have last not yet received SSL message containing SSL error
|
||||
// so make a read to force SSL to process possible SSL error
|
||||
if (SSL_get_error(_pSSL, rc) == SSL_ERROR_SYSCALL && SocketImpl::lastError() == POCO_ECONNRESET)
|
||||
{
|
||||
char c = 0;
|
||||
SSL_read(_pSSL, &c, 1);
|
||||
}
|
||||
|
||||
rc = handleError(rc);
|
||||
if (rc == 0) throw SSLConnectionUnexpectedlyClosedException();
|
||||
}
|
||||
|
@ -8,4 +8,7 @@ set (CMAKE_CXX_COMPILER_TARGET "x86_64-pc-freebsd11")
|
||||
set (CMAKE_ASM_COMPILER_TARGET "x86_64-pc-freebsd11")
|
||||
set (CMAKE_SYSROOT "${CMAKE_CURRENT_LIST_DIR}/../../contrib/sysroot/freebsd-x86_64")
|
||||
|
||||
# dprintf is used in a patched version of replxx
|
||||
add_compile_definitions(_WITH_DPRINTF)
|
||||
|
||||
set (CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) # disable linkage check - it doesn't work in CMake
|
||||
|
2
contrib/replxx
vendored
2
contrib/replxx
vendored
@ -1 +1 @@
|
||||
Subproject commit 5d04501f93a4fb7f0bb8b73b8f614bc986f9e25b
|
||||
Subproject commit 711c18e7f4d951255aa8b0851e5a55d5a5fb0ddb
|
@ -112,3 +112,5 @@ wadllib==1.3.6
|
||||
websocket-client==0.59.0
|
||||
wheel==0.37.1
|
||||
zipp==1.0.0
|
||||
deltalake==0.16.0
|
||||
|
||||
|
@ -6,28 +6,34 @@ sidebar_label: Iceberg
|
||||
|
||||
# Iceberg Table Engine
|
||||
|
||||
This engine provides a read-only integration with existing Apache [Iceberg](https://iceberg.apache.org/) tables in Amazon S3.
|
||||
This engine provides a read-only integration with existing Apache [Iceberg](https://iceberg.apache.org/) tables in Amazon S3, Azure and locally stored tables.
|
||||
|
||||
## Create Table
|
||||
|
||||
Note that the Iceberg table must already exist in S3, this command does not take DDL parameters to create a new table.
|
||||
Note that the Iceberg table must already exist in the storage; this command does not take DDL parameters to create a new table.
|
||||
|
||||
``` sql
|
||||
CREATE TABLE iceberg_table
|
||||
ENGINE = Iceberg(url, [aws_access_key_id, aws_secret_access_key,])
|
||||
CREATE TABLE iceberg_table_s3
|
||||
ENGINE = IcebergS3(url, [, NOSIGN | access_key_id, secret_access_key, [session_token]], format, [,compression])
|
||||
|
||||
CREATE TABLE iceberg_table_azure
|
||||
ENGINE = IcebergAzure(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression])
|
||||
|
||||
CREATE TABLE iceberg_table_local
|
||||
ENGINE = IcebergLocal(path_to_table, [,format] [,compression_method])
|
||||
```
|
||||
|
||||
**Engine parameters**
|
||||
**Engine arguments**
|
||||
|
||||
- `url` — url with the path to an existing Iceberg table.
|
||||
- `aws_access_key_id`, `aws_secret_access_key` - Long-term credentials for the [AWS](https://aws.amazon.com/) account user. You can use these to authenticate your requests. Parameter is optional. If credentials are not specified, they are used from the configuration file.
|
||||
The description of the arguments coincides with the description of the arguments in the engines `S3`, `AzureBlobStorage` and `File`, respectively.
|
||||
`format` stands for the format of data files in the Iceberg table.
|
||||
|
||||
Engine parameters can be specified using [Named Collections](../../../operations/named-collections.md)
|
||||
|
||||
**Example**
|
||||
|
||||
```sql
|
||||
CREATE TABLE iceberg_table ENGINE=Iceberg('http://test.s3.amazonaws.com/clickhouse-bucket/test_table', 'test', 'test')
|
||||
CREATE TABLE iceberg_table ENGINE=IcebergS3('http://test.s3.amazonaws.com/clickhouse-bucket/test_table', 'test', 'test')
|
||||
```
|
||||
|
||||
Using named collections:
|
||||
@ -45,9 +51,15 @@ Using named collections:
|
||||
```
|
||||
|
||||
```sql
|
||||
CREATE TABLE iceberg_table ENGINE=Iceberg(iceberg_conf, filename = 'test_table')
|
||||
CREATE TABLE iceberg_table ENGINE=IcebergS3(iceberg_conf, filename = 'test_table')
|
||||
|
||||
```
|
||||
|
||||
**Aliases**
|
||||
|
||||
|
||||
Table engine `Iceberg` is an alias to `IcebergS3` now.
|
||||
|
||||
## See also
|
||||
|
||||
- [iceberg table function](/docs/en/sql-reference/table-functions/iceberg.md)
|
||||
|
@ -1389,7 +1389,7 @@ DESC format(JSONEachRow, '{"id" : 1, "age" : 25, "name" : "Josh", "status" : nul
|
||||
#### schema_inference_make_columns_nullable
|
||||
|
||||
Controls making inferred types `Nullable` in schema inference for formats without information about nullability.
|
||||
If the setting is enabled, all inferred type will be `Nullable`, if disabled, the inferred type will be `Nullable` only if `input_format_null_as_default` is disabled and the column contains `NULL` in a sample that is parsed during schema inference.
|
||||
If the setting is enabled, all inferred types will be `Nullable`; if disabled, the inferred types will never be `Nullable`; if set to `auto`, an inferred type will be `Nullable` only if the column contains `NULL` in a sample that is parsed during schema inference or the file metadata contains information about column nullability.
|
||||
|
||||
Enabled by default.
|
||||
|
||||
@ -1412,15 +1412,13 @@ DESC format(JSONEachRow, $$
|
||||
└─────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
|
||||
```
|
||||
```sql
|
||||
SET schema_inference_make_columns_nullable = 0;
|
||||
SET input_format_null_as_default = 0;
|
||||
SET schema_inference_make_columns_nullable = 'auto';
|
||||
DESC format(JSONEachRow, $$
|
||||
{"id" : 1, "age" : 25, "name" : "Josh", "status" : null, "hobbies" : ["football", "cooking"]}
|
||||
{"id" : 2, "age" : 19, "name" : "Alan", "status" : "married", "hobbies" : ["tennis", "art"]}
|
||||
$$)
|
||||
```
|
||||
```response
|
||||
|
||||
┌─name────┬─type─────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
|
||||
│ id │ Int64 │ │ │ │ │ │
|
||||
│ age │ Int64 │ │ │ │ │ │
|
||||
@ -1432,7 +1430,6 @@ DESC format(JSONEachRow, $$
|
||||
|
||||
```sql
|
||||
SET schema_inference_make_columns_nullable = 0;
|
||||
SET input_format_null_as_default = 1;
|
||||
DESC format(JSONEachRow, $$
|
||||
{"id" : 1, "age" : 25, "name" : "Josh", "status" : null, "hobbies" : ["football", "cooking"]}
|
||||
{"id" : 2, "age" : 19, "name" : "Alan", "status" : "married", "hobbies" : ["tennis", "art"]}
|
||||
|
@ -171,8 +171,8 @@ If the `schema_inference_hints` is not formated properly, or if there is a typo
|
||||
|
||||
## schema_inference_make_columns_nullable {#schema_inference_make_columns_nullable}
|
||||
|
||||
Controls making inferred types `Nullable` in schema inference for formats without information about nullability.
|
||||
If the setting is enabled, the inferred type will be `Nullable` only if column contains `NULL` in a sample that is parsed during schema inference.
|
||||
Controls making inferred types `Nullable` in schema inference.
|
||||
If the setting is enabled, all inferred types will be `Nullable`; if disabled, the inferred types will never be `Nullable`; if set to `auto`, an inferred type will be `Nullable` only if the column contains `NULL` in a sample that is parsed during schema inference or the file metadata contains information about column nullability.
|
||||
|
||||
Default value: `true`.
|
||||
|
||||
|
@ -6,7 +6,7 @@ title: "Functions for Working with Geohash"
|
||||
|
||||
## Geohash
|
||||
|
||||
[Geohash](https://en.wikipedia.org/wiki/Geohash) is the geocode system, which subdivides Earth’s surface into buckets of grid shape and encodes each cell into a short string of letters and digits. It is a hierarchical data structure, so the longer is the geohash string, the more precise is the geographic location.
|
||||
[Geohash](https://en.wikipedia.org/wiki/Geohash) is the geocode system, which subdivides Earth’s surface into buckets of grid shape and encodes each cell into a short string of letters and digits. It is a hierarchical data structure, so the longer the geohash string is, the more precise the geographic location will be.
|
||||
|
||||
If you need to manually convert geographic coordinates to geohash strings, you can use [geohash.org](http://geohash.org/).
|
||||
|
||||
@ -14,26 +14,37 @@ If you need to manually convert geographic coordinates to geohash strings, you c
|
||||
|
||||
Encodes latitude and longitude as a [geohash](#geohash)-string.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
geohashEncode(longitude, latitude, [precision])
|
||||
```
|
||||
|
||||
**Input values**
|
||||
|
||||
- longitude - longitude part of the coordinate you want to encode. Floating in range`[-180°, 180°]`
|
||||
- latitude - latitude part of the coordinate you want to encode. Floating in range `[-90°, 90°]`
|
||||
- precision - Optional, length of the resulting encoded string, defaults to `12`. Integer in range `[1, 12]`. Any value less than `1` or greater than `12` is silently converted to `12`.
|
||||
- `longitude` — Longitude part of the coordinate you want to encode. Floating in range `[-180°, 180°]`. [Float](../../data-types/float.md).
|
||||
- `latitude` — Latitude part of the coordinate you want to encode. Floating in range `[-90°, 90°]`. [Float](../../data-types/float.md).
|
||||
- `precision` (optional) — Length of the resulting encoded string. Defaults to `12`. Integer in the range `[1, 12]`. [Int8](../../data-types/int-uint.md).
|
||||
|
||||
:::note
|
||||
- All coordinate parameters must be of the same type: either `Float32` or `Float64`.
|
||||
- For the `precision` parameter, any value less than `1` or greater than `12` is silently converted to `12`.
|
||||
:::
|
||||
|
||||
**Returned values**
|
||||
|
||||
- alphanumeric `String` of encoded coordinate (modified version of the base32-encoding alphabet is used).
|
||||
- Alphanumeric string of the encoded coordinate (modified version of the base32-encoding alphabet is used). [String](../../data-types/string.md).
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT geohashEncode(-5.60302734375, 42.593994140625, 0) AS res;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─res──────────┐
|
||||
│ ezs42d000000 │
|
||||
@ -44,13 +55,19 @@ SELECT geohashEncode(-5.60302734375, 42.593994140625, 0) AS res;
|
||||
|
||||
Decodes any [geohash](#geohash)-encoded string into longitude and latitude.
|
||||
|
||||
**Syntax**
|
||||
|
||||
```sql
|
||||
geohashDecode(hash_str)
|
||||
```
|
||||
|
||||
**Input values**
|
||||
|
||||
- encoded string - geohash-encoded string.
|
||||
- `hash_str` — Geohash-encoded string.
|
||||
|
||||
**Returned values**
|
||||
|
||||
- (longitude, latitude) - 2-tuple of `Float64` values of longitude and latitude.
|
||||
- Tuple `(longitude, latitude)` of `Float64` values of longitude and latitude. [Tuple](../../data-types/tuple.md)([Float64](../../data-types/float.md))
|
||||
|
||||
**Example**
|
||||
|
||||
|
@ -688,6 +688,40 @@ SELECT kostikConsistentHash(16045690984833335023, 2);
|
||||
└───────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## ripeMD160
|
||||
|
||||
Produces [RIPEMD-160](https://en.wikipedia.org/wiki/RIPEMD) hash value.
|
||||
|
||||
**Syntax**
|
||||
|
||||
```sql
|
||||
ripeMD160(input)
|
||||
```
|
||||
|
||||
**Parameters**
|
||||
|
||||
- `input`: Input string. [String](../data-types/string.md)
|
||||
|
||||
**Returned value**
|
||||
|
||||
- A [UInt256](../data-types/int-uint.md) hash value where the 160-bit RIPEMD-160 hash is stored in the first 20 bytes. The remaining 12 bytes are zero-padded.
|
||||
|
||||
**Example**
|
||||
|
||||
Use the [hex](../functions/encoding-functions.md/#hex) function to represent the result as a hex-encoded string.
|
||||
|
||||
Query:
|
||||
|
||||
```sql
|
||||
SELECT hex(ripeMD160('The quick brown fox jumps over the lazy dog'));
|
||||
```
|
||||
|
||||
```response
|
||||
┌─hex(ripeMD160('The quick brown fox jumps over the lazy dog'))─┐
|
||||
│ 37F332F68DB77BD9D7EDD4969571AD671CF9DD3B │
|
||||
└───────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## murmurHash2_32, murmurHash2_64
|
||||
|
||||
Produces a [MurmurHash2](https://github.com/aappleby/smhasher) hash value.
|
||||
|
@ -6,35 +6,37 @@ sidebar_label: iceberg
|
||||
|
||||
# iceberg Table Function
|
||||
|
||||
Provides a read-only table-like interface to Apache [Iceberg](https://iceberg.apache.org/) tables in Amazon S3.
|
||||
Provides a read-only table-like interface to Apache [Iceberg](https://iceberg.apache.org/) tables stored in Amazon S3, in Azure, or locally.
|
||||
|
||||
## Syntax
|
||||
|
||||
``` sql
|
||||
iceberg(url [,aws_access_key_id, aws_secret_access_key] [,format] [,structure])
|
||||
icebergS3(url [, NOSIGN | access_key_id, secret_access_key, [session_token]] [,format] [,compression_method])
|
||||
icebergS3(named_collection[, option=value [,..]])
|
||||
|
||||
icebergAzure(connection_string|storage_account_url, container_name, blobpath, [,account_name], [,account_key] [,format] [,compression_method])
|
||||
icebergAzure(named_collection[, option=value [,..]])
|
||||
|
||||
icebergLocal(path_to_table, [,format] [,compression_method])
|
||||
icebergLocal(named_collection[, option=value [,..]])
|
||||
```
|
||||
|
||||
## Arguments
|
||||
|
||||
- `url` — Bucket url with the path to an existing Iceberg table in S3.
|
||||
- `aws_access_key_id`, `aws_secret_access_key` - Long-term credentials for the [AWS](https://aws.amazon.com/) account user. You can use these to authenticate your requests. These parameters are optional. If credentials are not specified, they are used from the ClickHouse configuration. For more information see [Using S3 for Data Storage](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-s3).
|
||||
- `format` — The [format](/docs/en/interfaces/formats.md/#formats) of the file. By default `Parquet` is used.
|
||||
- `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`.
|
||||
|
||||
Engine parameters can be specified using [Named Collections](/docs/en/operations/named-collections.md).
|
||||
The description of the arguments coincides with the description of the arguments in the table functions `s3`, `azureBlobStorage` and `file`, respectively.
|
||||
`format` stands for the format of data files in the Iceberg table.
|
||||
|
||||
**Returned value**
|
||||
|
||||
A table with the specified structure for reading data in the specified Iceberg table in S3.
|
||||
A table with the specified structure for reading data in the specified Iceberg table.
|
||||
|
||||
**Example**
|
||||
|
||||
```sql
|
||||
SELECT * FROM iceberg('http://test.s3.amazonaws.com/clickhouse-bucket/test_table', 'test', 'test')
|
||||
SELECT * FROM icebergS3('http://test.s3.amazonaws.com/clickhouse-bucket/test_table', 'test', 'test')
|
||||
```
|
||||
|
||||
:::important
|
||||
ClickHouse currently supports reading v1 (v2 support is coming soon!) of the Iceberg format via the `iceberg` table function and `Iceberg` table engine.
|
||||
ClickHouse currently supports reading v1 and v2 of the Iceberg format via the `icebergS3`, `icebergAzure` and `icebergLocal` table functions and `IcebergS3`, `IcebergAzure` and `IcebergLocal` table engines.
|
||||
:::
|
||||
|
||||
## Defining a named collection
|
||||
@ -56,10 +58,14 @@ Here is an example of configuring a named collection for storing the URL and cre
|
||||
```
|
||||
|
||||
```sql
|
||||
SELECT * FROM iceberg(iceberg_conf, filename = 'test_table')
|
||||
DESCRIBE iceberg(iceberg_conf, filename = 'test_table')
|
||||
SELECT * FROM icebergS3(iceberg_conf, filename = 'test_table')
|
||||
DESCRIBE icebergS3(iceberg_conf, filename = 'test_table')
|
||||
```
|
||||
|
||||
**Aliases**
|
||||
|
||||
Table function `iceberg` is an alias to `icebergS3` now.
|
||||
|
||||
**See Also**
|
||||
|
||||
- [Iceberg engine](/docs/en/engines/table-engines/integrations/iceberg.md)
|
||||
|
@ -124,6 +124,40 @@ SELECT hex(sipHash128('foo', '\x01', 3));
|
||||
└──────────────────────────────────┘
|
||||
```
|
||||
|
||||
## ripeMD160
|
||||
|
||||
Генерирует [RIPEMD-160](https://en.wikipedia.org/wiki/RIPEMD) хеш строки.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
```sql
|
||||
ripeMD160(input)
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
|
||||
- `input`: Строка [String](../data-types/string.md)
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- [UInt256](../data-types/int-uint.md), где 160-битный хеш RIPEMD-160 хранится в первых 20 байтах. Оставшиеся 12 байт заполняются нулями.
|
||||
|
||||
**Пример**
|
||||
|
||||
Используйте функцию [hex](../functions/encoding-functions.md#hex) для представления результата в виде строки с шестнадцатеричной кодировкой
|
||||
|
||||
Запрос:
|
||||
|
||||
```sql
|
||||
SELECT hex(ripeMD160('The quick brown fox jumps over the lazy dog'));
|
||||
```
|
||||
Результат:
|
||||
```response
|
||||
┌─hex(ripeMD160('The quick brown fox jumps over the lazy dog'))─┐
|
||||
│ 37F332F68DB77BD9D7EDD4969571AD671CF9DD3B │
|
||||
└───────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## cityHash64 {#cityhash64}
|
||||
|
||||
Генерирует 64-х битное значение [CityHash](https://github.com/google/cityhash).
|
||||
|
@ -978,6 +978,7 @@ try
|
||||
/** Explicitly destroy Context. It is more convenient than in destructor of Server, because logger is still available.
|
||||
* At this moment, no one could own shared part of Context.
|
||||
*/
|
||||
global_context->resetSharedContext();
|
||||
global_context.reset();
|
||||
shared_context.reset();
|
||||
LOG_DEBUG(log, "Destroyed global context.");
|
||||
|
@ -692,7 +692,7 @@ QueryTreeNodePtr IdentifierResolver::tryResolveIdentifierFromStorage(
|
||||
result_column_node = it->second;
|
||||
}
|
||||
/// Check if it's a dynamic subcolumn
|
||||
else
|
||||
else if (table_expression_data.supports_subcolumns)
|
||||
{
|
||||
auto [column_name, dynamic_subcolumn_name] = Nested::splitName(identifier_full_name);
|
||||
auto jt = table_expression_data.column_name_to_column_node.find(column_name);
|
||||
|
@ -4379,7 +4379,10 @@ void QueryAnalyzer::initializeTableExpressionData(const QueryTreeNodePtr & table
|
||||
|
||||
auto get_column_options = GetColumnsOptions(GetColumnsOptions::All).withExtendedObjects().withVirtuals();
|
||||
if (storage_snapshot->storage.supportsSubcolumns())
|
||||
{
|
||||
get_column_options.withSubcolumns();
|
||||
table_expression_data.supports_subcolumns = true;
|
||||
}
|
||||
|
||||
auto column_names_and_types = storage_snapshot->getColumns(get_column_options);
|
||||
table_expression_data.column_names_and_types = NamesAndTypes(column_names_and_types.begin(), column_names_and_types.end());
|
||||
|
@ -36,6 +36,7 @@ struct AnalysisTableExpressionData
|
||||
std::string database_name;
|
||||
std::string table_name;
|
||||
bool should_qualify_columns = true;
|
||||
bool supports_subcolumns = false;
|
||||
NamesAndTypes column_names_and_types;
|
||||
ColumnNameToColumnNodeMap column_name_to_column_node;
|
||||
std::unordered_set<std::string> subcolumn_names; /// Subset columns that are subcolumns of other columns
|
||||
|
@ -100,6 +100,7 @@ protected:
|
||||
auto buf = BuilderRWBufferFromHTTP(getPingURI())
|
||||
.withConnectionGroup(HTTPConnectionGroupType::STORAGE)
|
||||
.withTimeouts(getHTTPTimeouts())
|
||||
.withSettings(getContext()->getReadSettings())
|
||||
.create(credentials);
|
||||
|
||||
return checkString(PING_OK_ANSWER, *buf);
|
||||
@ -206,6 +207,7 @@ protected:
|
||||
.withConnectionGroup(HTTPConnectionGroupType::STORAGE)
|
||||
.withMethod(Poco::Net::HTTPRequest::HTTP_POST)
|
||||
.withTimeouts(getHTTPTimeouts())
|
||||
.withSettings(getContext()->getReadSettings())
|
||||
.create(credentials);
|
||||
|
||||
bool res = false;
|
||||
@ -232,6 +234,7 @@ protected:
|
||||
.withConnectionGroup(HTTPConnectionGroupType::STORAGE)
|
||||
.withMethod(Poco::Net::HTTPRequest::HTTP_POST)
|
||||
.withTimeouts(getHTTPTimeouts())
|
||||
.withSettings(getContext()->getReadSettings())
|
||||
.create(credentials);
|
||||
|
||||
std::string character;
|
||||
|
@ -111,6 +111,7 @@ add_headers_and_sources(dbms Storages/ObjectStorage)
|
||||
add_headers_and_sources(dbms Storages/ObjectStorage/Azure)
|
||||
add_headers_and_sources(dbms Storages/ObjectStorage/S3)
|
||||
add_headers_and_sources(dbms Storages/ObjectStorage/HDFS)
|
||||
add_headers_and_sources(dbms Storages/ObjectStorage/Local)
|
||||
add_headers_and_sources(dbms Storages/ObjectStorage/DataLakes)
|
||||
add_headers_and_sources(dbms Common/NamedCollections)
|
||||
|
||||
|
@ -145,6 +145,9 @@ void Connection::connect(const ConnectionTimeouts & timeouts)
|
||||
/// work we need to pass host name separately. It will be send into TLS Hello packet to let
|
||||
/// the server know which host we want to talk with (single IP can process requests for multiple hosts using SNI).
|
||||
static_cast<Poco::Net::SecureStreamSocket*>(socket.get())->setPeerHostName(host);
|
||||
/// we want to postpone SSL handshake until first read or write operation
|
||||
/// so any errors during negotiation would be properly processed
|
||||
static_cast<Poco::Net::SecureStreamSocket*>(socket.get())->setLazyHandshake(true);
|
||||
#else
|
||||
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "tcp_secure protocol is disabled because poco library was built without NetSSL support.");
|
||||
#endif
|
||||
|
@ -299,13 +299,14 @@ ReplxxLineReader::ReplxxLineReader(
|
||||
Patterns delimiters_,
|
||||
const char word_break_characters_[],
|
||||
replxx::Replxx::highlighter_callback_t highlighter_,
|
||||
[[ maybe_unused ]] std::istream & input_stream_,
|
||||
[[ maybe_unused ]] std::ostream & output_stream_,
|
||||
[[ maybe_unused ]] int in_fd_,
|
||||
[[ maybe_unused ]] int out_fd_,
|
||||
[[ maybe_unused ]] int err_fd_
|
||||
std::istream & input_stream_,
|
||||
std::ostream & output_stream_,
|
||||
int in_fd_,
|
||||
int out_fd_,
|
||||
int err_fd_
|
||||
)
|
||||
: LineReader(history_file_path_, multiline_, std::move(extenders_), std::move(delimiters_), input_stream_, output_stream_, in_fd_)
|
||||
, rx(input_stream_, output_stream_, in_fd_, out_fd_, err_fd_)
|
||||
, highlighter(std::move(highlighter_))
|
||||
, word_break_characters(word_break_characters_)
|
||||
, editor(getEditor())
|
||||
@ -516,7 +517,7 @@ void ReplxxLineReader::addToHistory(const String & line)
|
||||
rx.history_add(line);
|
||||
|
||||
// flush changes to the disk
|
||||
if (!rx.history_save(history_file_path))
|
||||
if (history_file_fd >= 0 && !rx.history_save(history_file_path))
|
||||
rx.print("Saving history failed: %s\n", errnoToString().c_str());
|
||||
|
||||
if (history_file_fd >= 0 && locked && 0 != flock(history_file_fd, LOCK_UN))
|
||||
|
@ -1181,13 +1181,14 @@ void ColumnDynamic::takeDynamicStructureFromSourceColumns(const Columns & source
|
||||
/// Check if the number of all dynamic types exceeds the limit.
|
||||
if (!canAddNewVariants(0, all_variants.size()))
|
||||
{
|
||||
/// Create list of variants with their sizes and sort it.
|
||||
std::vector<std::pair<size_t, DataTypePtr>> variants_with_sizes;
|
||||
/// Create a list of variants with their sizes and names and then sort it.
|
||||
std::vector<std::tuple<size_t, String, DataTypePtr>> variants_with_sizes;
|
||||
variants_with_sizes.reserve(all_variants.size());
|
||||
for (const auto & variant : all_variants)
|
||||
{
|
||||
if (variant->getName() != getSharedVariantTypeName())
|
||||
variants_with_sizes.emplace_back(total_sizes[variant->getName()], variant);
|
||||
auto variant_name = variant->getName();
|
||||
if (variant_name != getSharedVariantTypeName())
|
||||
variants_with_sizes.emplace_back(total_sizes[variant_name], variant_name, variant);
|
||||
}
|
||||
std::sort(variants_with_sizes.begin(), variants_with_sizes.end(), std::greater());
|
||||
|
||||
@ -1196,14 +1197,14 @@ void ColumnDynamic::takeDynamicStructureFromSourceColumns(const Columns & source
|
||||
result_variants.reserve(max_dynamic_types + 1); /// +1 for shared variant.
|
||||
/// Add shared variant.
|
||||
result_variants.push_back(getSharedVariantDataType());
|
||||
for (const auto & [size, variant] : variants_with_sizes)
|
||||
for (const auto & [size, variant_name, variant_type] : variants_with_sizes)
|
||||
{
|
||||
/// Add variant to the resulting variants list until we reach max_dynamic_types.
|
||||
if (canAddNewVariant(result_variants.size()))
|
||||
result_variants.push_back(variant);
|
||||
result_variants.push_back(variant_type);
|
||||
/// Add all remaining variants into shared_variants_statistics until we reach its max size.
|
||||
else if (new_statistics.shared_variants_statistics.size() < Statistics::MAX_SHARED_VARIANT_STATISTICS_SIZE)
|
||||
new_statistics.shared_variants_statistics[variant->getName()] = size;
|
||||
new_statistics.shared_variants_statistics[variant_name] = size;
|
||||
else
|
||||
break;
|
||||
}
|
||||
|
@ -127,7 +127,7 @@ std::string ColumnObject::getName() const
|
||||
{
|
||||
WriteBufferFromOwnString ss;
|
||||
ss << "Object(";
|
||||
ss << "max_dynamic_paths=" << max_dynamic_paths;
|
||||
ss << "max_dynamic_paths=" << global_max_dynamic_paths;
|
||||
ss << ", max_dynamic_types=" << max_dynamic_types;
|
||||
std::vector<String> sorted_typed_paths;
|
||||
sorted_typed_paths.reserve(typed_paths.size());
|
||||
@ -1045,9 +1045,9 @@ void ColumnObject::forEachSubcolumnRecursively(DB::IColumn::RecursiveMutableColu
|
||||
|
||||
bool ColumnObject::structureEquals(const IColumn & rhs) const
|
||||
{
|
||||
/// 2 Object columns have equal structure if they have the same typed paths and max_dynamic_paths/max_dynamic_types.
|
||||
/// 2 Object columns have equal structure if they have the same typed paths and global_max_dynamic_paths/max_dynamic_types.
|
||||
const auto * rhs_object = typeid_cast<const ColumnObject *>(&rhs);
|
||||
if (!rhs_object || typed_paths.size() != rhs_object->typed_paths.size() || max_dynamic_paths != rhs_object->max_dynamic_paths || max_dynamic_types != rhs_object->max_dynamic_types)
|
||||
if (!rhs_object || typed_paths.size() != rhs_object->typed_paths.size() || global_max_dynamic_paths != rhs_object->global_max_dynamic_paths || max_dynamic_types != rhs_object->max_dynamic_types)
|
||||
return false;
|
||||
|
||||
for (const auto & [path, column] : typed_paths)
|
||||
|
@ -953,7 +953,7 @@ ColumnPtr ColumnVariant::index(const IColumn & indexes, size_t limit) const
|
||||
{
|
||||
/// If we have only NULLs, index will take no effect, just return resized column.
|
||||
if (hasOnlyNulls())
|
||||
return cloneResized(limit);
|
||||
return cloneResized(limit == 0 ? indexes.size(): limit);
|
||||
|
||||
/// Optimization when we have only one non empty variant and no NULLs.
|
||||
/// In this case local_discriminators column is filled with identical values and offsets column
|
||||
@ -1009,8 +1009,16 @@ ColumnPtr ColumnVariant::indexImpl(const PaddedPODArray<Type> & indexes, size_t
|
||||
new_variants.reserve(num_variants);
|
||||
for (size_t i = 0; i != num_variants; ++i)
|
||||
{
|
||||
size_t nested_limit = nested_perms[i].size() == variants[i]->size() ? 0 : nested_perms[i].size();
|
||||
new_variants.emplace_back(variants[i]->permute(nested_perms[i], nested_limit));
|
||||
/// Check if no values from this variant were selected.
|
||||
if (nested_perms[i].empty())
|
||||
{
|
||||
new_variants.emplace_back(variants[i]->cloneEmpty());
|
||||
}
|
||||
else
|
||||
{
|
||||
size_t nested_limit = nested_perms[i].size() == variants[i]->size() ? 0 : nested_perms[i].size();
|
||||
new_variants.emplace_back(variants[i]->permute(nested_perms[i], nested_limit));
|
||||
}
|
||||
}
|
||||
|
||||
/// We cannot use new_offsets column as an offset column, because it became invalid after variants permutation.
|
||||
|
@ -1120,7 +1120,7 @@ class IColumn;
|
||||
M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \
|
||||
M(String, schema_inference_hints, "", "The list of column names and types to use in schema inference for formats without column names. The format: 'column_name1 column_type1, column_name2 column_type2, ...'", 0) \
|
||||
M(SchemaInferenceMode, schema_inference_mode, "default", "Mode of schema inference. 'default' - assume that all files have the same schema and schema can be inferred from any file, 'union' - files can have different schemas and the resulting schema should be the a union of schemas of all files", 0) \
|
||||
M(Bool, schema_inference_make_columns_nullable, true, "If set to true, all inferred types will be Nullable in schema inference for formats without information about nullability.", 0) \
|
||||
M(UInt64Auto, schema_inference_make_columns_nullable, 1, "If set to true, all inferred types will be Nullable in schema inference. When set to false, no columns will be converted to Nullable. When set to 'auto', ClickHouse will use information about nullability from the data.", 0) \
|
||||
M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \
|
||||
M(Bool, input_format_json_read_bools_as_strings, true, "Allow to parse bools as strings in JSON input formats", 0) \
|
||||
M(Bool, input_format_json_try_infer_numbers_from_strings, false, "Try to infer numbers from string fields while schema inference", 0) \
|
||||
|
@ -22,7 +22,6 @@
|
||||
#include <cstring>
|
||||
#include <unistd.h>
|
||||
#include <algorithm>
|
||||
#include <typeinfo>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
|
||||
|
@ -43,39 +43,21 @@ bool LocalObjectStorage::exists(const StoredObject & object) const
|
||||
std::unique_ptr<ReadBufferFromFileBase> LocalObjectStorage::readObjects( /// NOLINT
|
||||
const StoredObjects & objects,
|
||||
const ReadSettings & read_settings,
|
||||
std::optional<size_t> read_hint,
|
||||
std::optional<size_t> file_size) const
|
||||
std::optional<size_t>,
|
||||
std::optional<size_t>) const
|
||||
{
|
||||
auto modified_settings = patchSettings(read_settings);
|
||||
auto global_context = Context::getGlobalContextInstance();
|
||||
auto read_buffer_creator =
|
||||
[=] (bool /* restricted_seek */, const StoredObject & object)
|
||||
-> std::unique_ptr<ReadBufferFromFileBase>
|
||||
{
|
||||
return createReadBufferFromFileBase(object.remote_path, modified_settings, read_hint, file_size);
|
||||
};
|
||||
auto read_buffer_creator = [=](bool /* restricted_seek */, const StoredObject & object) -> std::unique_ptr<ReadBufferFromFileBase>
|
||||
{ return std::make_unique<ReadBufferFromFile>(object.remote_path); };
|
||||
|
||||
switch (read_settings.remote_fs_method)
|
||||
{
|
||||
case RemoteFSReadMethod::read:
|
||||
{
|
||||
return std::make_unique<ReadBufferFromRemoteFSGather>(
|
||||
std::move(read_buffer_creator), objects, "file:", modified_settings,
|
||||
global_context->getFilesystemCacheLog(), /* use_external_buffer */false);
|
||||
}
|
||||
case RemoteFSReadMethod::threadpool:
|
||||
{
|
||||
auto impl = std::make_unique<ReadBufferFromRemoteFSGather>(
|
||||
std::move(read_buffer_creator), objects, "file:", modified_settings,
|
||||
global_context->getFilesystemCacheLog(), /* use_external_buffer */true);
|
||||
|
||||
auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER);
|
||||
return std::make_unique<AsynchronousBoundedReadBuffer>(
|
||||
std::move(impl), reader, read_settings,
|
||||
global_context->getAsyncReadCounters(),
|
||||
global_context->getFilesystemReadPrefetchesLog());
|
||||
}
|
||||
}
|
||||
return std::make_unique<ReadBufferFromRemoteFSGather>(
|
||||
std::move(read_buffer_creator),
|
||||
objects,
|
||||
"file:",
|
||||
modified_settings,
|
||||
global_context->getFilesystemCacheLog(),
|
||||
/* use_external_buffer */ false);
|
||||
}
|
||||
|
||||
ReadSettings LocalObjectStorage::patchSettings(const ReadSettings & read_settings) const
|
||||
|
@ -257,7 +257,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
|
||||
format_settings.max_bytes_to_read_for_schema_inference = settings.input_format_max_bytes_to_read_for_schema_inference;
|
||||
format_settings.column_names_for_schema_inference = settings.column_names_for_schema_inference;
|
||||
format_settings.schema_inference_hints = settings.schema_inference_hints;
|
||||
format_settings.schema_inference_make_columns_nullable = settings.schema_inference_make_columns_nullable;
|
||||
format_settings.schema_inference_make_columns_nullable = settings.schema_inference_make_columns_nullable.valueOr(2);
|
||||
format_settings.mysql_dump.table_name = settings.input_format_mysql_dump_table_name;
|
||||
format_settings.mysql_dump.map_column_names = settings.input_format_mysql_dump_map_column_names;
|
||||
format_settings.sql_insert.max_batch_size = settings.output_format_sql_insert_max_batch_size;
|
||||
|
@ -77,7 +77,7 @@ struct FormatSettings
|
||||
Raw
|
||||
};
|
||||
|
||||
bool schema_inference_make_columns_nullable = true;
|
||||
UInt64 schema_inference_make_columns_nullable = 1;
|
||||
|
||||
DateTimeOutputFormat date_time_output_format = DateTimeOutputFormat::Simple;
|
||||
|
||||
|
@ -1344,7 +1344,11 @@ namespace
|
||||
if (checkCharCaseInsensitive('n', buf))
|
||||
{
|
||||
if (checkStringCaseInsensitive("ull", buf))
|
||||
return std::make_shared<DataTypeNullable>(std::make_shared<DataTypeNothing>());
|
||||
{
|
||||
if (settings.schema_inference_make_columns_nullable == 0)
|
||||
return std::make_shared<DataTypeNothing>();
|
||||
return makeNullable(std::make_shared<DataTypeNothing>());
|
||||
}
|
||||
else if (checkStringCaseInsensitive("an", buf))
|
||||
return std::make_shared<DataTypeFloat64>();
|
||||
}
|
||||
|
@ -19,7 +19,9 @@
|
||||
#include <Common/HashTable/Hash.h>
|
||||
|
||||
#if USE_SSL
|
||||
# include <openssl/evp.h>
|
||||
# include <openssl/md5.h>
|
||||
# include <openssl/ripemd.h>
|
||||
#endif
|
||||
|
||||
#include <bit>
|
||||
@ -196,6 +198,34 @@ T combineHashesFunc(T t1, T t2)
|
||||
return HashFunction::apply(reinterpret_cast<const char *>(hashes), sizeof(hashes));
|
||||
}
|
||||
|
||||
#if USE_SSL
|
||||
/// RIPEMD-160 hash implementation, used as the Impl parameter of FunctionAnyHash
/// (see the FunctionRipeMD160Hash alias). The digest is returned packed into a UInt256.
struct RipeMD160Impl
|
||||
{
|
||||
static constexpr auto name = "ripeMD160";
|
||||
using ReturnType = UInt256;
|
||||
|
||||
/// Hashes `size` bytes starting at `begin` with RIPEMD160().
/// The RIPEMD160_DIGEST_LENGTH-byte digest is byte-reversed and copied into the
/// first bytes of a zero-initialized UInt256; the remaining bytes stay zero.
static UInt256 apply(const char * begin, size_t size)
|
||||
{
|
||||
UInt8 digest[RIPEMD160_DIGEST_LENGTH];
|
||||
|
||||
RIPEMD160(reinterpret_cast<const unsigned char *>(begin), size, reinterpret_cast<unsigned char *>(digest));
|
||||
|
||||
/// NOTE(review): presumably reversed so that the integer value reads like the
/// canonical digest on little-endian hosts - confirm.
std::reverse(digest, digest + RIPEMD160_DIGEST_LENGTH);
|
||||
|
||||
UInt256 res = 0;
|
||||
std::memcpy(&res, digest, RIPEMD160_DIGEST_LENGTH);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
/// Combines two hash values by delegating to combineHashesFunc with this implementation.
static UInt256 combineHashes(UInt256 h1, UInt256 h2)
|
||||
{
|
||||
return combineHashesFunc<UInt256, RipeMD160Impl>(h1, h2);
|
||||
}
|
||||
|
||||
static constexpr bool use_int_hash_for_pods = false;
|
||||
};
|
||||
#endif
|
||||
|
||||
struct SipHash64Impl
|
||||
{
|
||||
@ -1624,6 +1654,7 @@ using FunctionIntHash32 = FunctionIntHash<IntHash32Impl, NameIntHash32>;
|
||||
using FunctionIntHash64 = FunctionIntHash<IntHash64Impl, NameIntHash64>;
|
||||
#if USE_SSL
|
||||
using FunctionHalfMD5 = FunctionAnyHash<HalfMD5Impl>;
|
||||
using FunctionRipeMD160Hash = FunctionAnyHash<RipeMD160Impl>;
|
||||
#endif
|
||||
using FunctionSipHash128 = FunctionAnyHash<SipHash128Impl>;
|
||||
using FunctionSipHash128Keyed = FunctionAnyHash<SipHash128KeyedImpl, true, SipHash128KeyedImpl::Key, SipHash128KeyedImpl::KeyColumns>;
|
||||
@ -1652,6 +1683,7 @@ using FunctionXxHash64 = FunctionAnyHash<ImplXxHash64>;
|
||||
using FunctionXXH3 = FunctionAnyHash<ImplXXH3>;
|
||||
|
||||
using FunctionWyHash64 = FunctionAnyHash<ImplWyHash64>;
|
||||
|
||||
}
|
||||
|
||||
#pragma clang diagnostic pop
|
||||
|
23
src/Functions/FunctionsHashingRipe.cpp
Normal file
23
src/Functions/FunctionsHashingRipe.cpp
Normal file
@ -0,0 +1,23 @@
|
||||
#include "FunctionsHashing.h"
|
||||
|
||||
#include <Functions/FunctionFactory.h>
|
||||
|
||||
/// FunctionsHashing instantiations are separated into files FunctionsHashing*.cpp
|
||||
/// to better parallelize the build procedure and avoid MSan build failure
|
||||
/// due to excessive resource consumption.
|
||||
namespace DB
|
||||
{
|
||||
#if USE_SSL
|
||||
/// Registers FunctionRipeMD160Hash in the function factory together with its
/// documentation: a description, one example query with its output, and the "Hash" category.
REGISTER_FUNCTION(HashingRipe)
|
||||
{
|
||||
factory.registerFunction<FunctionRipeMD160Hash>(FunctionDocumentation{
|
||||
.description = "RIPEMD-160 hash function, primarily used in Bitcoin address generation.",
|
||||
.examples{{"", "SELECT hex(ripeMD160('The quick brown fox jumps over the lazy dog'));", R"(
|
||||
┌─hex(ripeMD160('The quick brown fox jumps over the lazy dog'))─┐
|
||||
│ 37F332F68DB77BD9D7EDD4969571AD671CF9DD3B │
|
||||
└───────────────────────────────────────────────────────────────┘
|
||||
)"}},
|
||||
.categories{"Hash"}});
|
||||
}
|
||||
#endif
|
||||
}
|
@ -1598,6 +1598,9 @@ ColumnPtr FunctionArrayElement::executeTuple(const ColumnsWithTypeAndName & argu
|
||||
const auto & tuple_columns = col_nested->getColumns();
|
||||
size_t tuple_size = tuple_columns.size();
|
||||
|
||||
if (tuple_size == 0)
|
||||
return ColumnTuple::create(input_rows_count);
|
||||
|
||||
const DataTypes & tuple_types = typeid_cast<const DataTypeTuple &>(
|
||||
*typeid_cast<const DataTypeArray &>(*arguments[0].type).getNestedType()).getElements();
|
||||
|
||||
|
@ -787,7 +787,7 @@ S3CredentialsProviderChain::S3CredentialsProviderChain(
|
||||
/// The EC2MetadataService delay is on the order of seconds, so it only makes sense to retry after a couple of seconds.
|
||||
/// But the connection timeout should be small because there is the case when there is no IMDS at all,
|
||||
/// like outside of the cloud, on your own machines.
|
||||
aws_client_configuration.connectTimeoutMs = 10;
|
||||
aws_client_configuration.connectTimeoutMs = 50;
|
||||
aws_client_configuration.requestTimeoutMs = 1000;
|
||||
|
||||
aws_client_configuration.retryStrategy = std::make_shared<Aws::Client::DefaultRetryStrategy>(1, 1000);
|
||||
|
@ -893,6 +893,12 @@ ContextData::ContextData(const ContextData &o) :
|
||||
{
|
||||
}
|
||||
|
||||
/// Clears this context's `shared` pointer while holding mutex_shared_context.
/// Context::handleCrash() takes the same mutex and returns early when `shared` is null,
/// so after this call it no longer touches the shared part.
void ContextData::resetSharedContext()
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_shared_context);
|
||||
shared = nullptr;
|
||||
}
|
||||
|
||||
Context::Context() = default;
|
||||
Context::Context(const Context & rhs) : ContextData(rhs), std::enable_shared_from_this<Context>(rhs) {}
|
||||
|
||||
@ -914,14 +920,6 @@ ContextMutablePtr Context::createGlobal(ContextSharedPart * shared_part)
|
||||
return res;
|
||||
}
|
||||
|
||||
void Context::initGlobal()
|
||||
{
|
||||
assert(!global_context_instance);
|
||||
global_context_instance = shared_from_this();
|
||||
DatabaseCatalog::init(shared_from_this());
|
||||
EventNotifier::init();
|
||||
}
|
||||
|
||||
SharedContextHolder Context::createShared()
|
||||
{
|
||||
return SharedContextHolder(std::make_unique<ContextSharedPart>());
|
||||
@ -2692,7 +2690,11 @@ void Context::makeSessionContext()
|
||||
|
||||
void Context::makeGlobalContext()
|
||||
{
|
||||
initGlobal();
|
||||
assert(!global_context_instance);
|
||||
global_context_instance = shared_from_this();
|
||||
DatabaseCatalog::init(shared_from_this());
|
||||
EventNotifier::init();
|
||||
|
||||
global_context = shared_from_this();
|
||||
}
|
||||
|
||||
@ -4088,8 +4090,13 @@ void Context::initializeTraceCollector()
|
||||
}
|
||||
|
||||
/// Call after an unexpected crash happens.
|
||||
void Context::handleCrash() const TSA_NO_THREAD_SAFETY_ANALYSIS
|
||||
void Context::handleCrash() const
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_shared_context);
|
||||
if (!shared)
|
||||
return;
|
||||
|
||||
SharedLockGuard lock2(shared->mutex);
|
||||
if (shared->system_logs)
|
||||
shared->system_logs->handleCrash();
|
||||
}
|
||||
|
@ -492,6 +492,8 @@ public:
|
||||
|
||||
KitchenSink kitchen_sink;
|
||||
|
||||
void resetSharedContext();
|
||||
|
||||
protected:
|
||||
using SampleBlockCache = std::unordered_map<std::string, Block>;
|
||||
mutable SampleBlockCache sample_block_cache;
|
||||
@ -529,6 +531,10 @@ protected:
|
||||
mutable ThrottlerPtr local_write_query_throttler; /// A query-wide throttler for local IO writes
|
||||
|
||||
mutable ThrottlerPtr backups_query_throttler; /// A query-wide throttler for BACKUPs
|
||||
|
||||
mutable std::mutex mutex_shared_context; /// mutex to avoid accessing destroyed shared context pointer
|
||||
/// some Context methods can be called after the shared context is destroyed
|
||||
/// for example, the Context::handleCrash() method, which is called from a signal handler
|
||||
};
|
||||
|
||||
/** A set of known objects that can be used in the query.
|
||||
@ -1387,8 +1393,6 @@ private:
|
||||
|
||||
ExternalUserDefinedExecutableFunctionsLoader & getExternalUserDefinedExecutableFunctionsLoaderWithLock(const std::lock_guard<std::mutex> & lock);
|
||||
|
||||
void initGlobal();
|
||||
|
||||
void setUserID(const UUID & user_id_);
|
||||
void setCurrentRolesImpl(const std::vector<UUID> & new_current_roles, bool throw_if_not_granted, bool skip_if_not_granted, const std::shared_ptr<const User> & user);
|
||||
|
||||
|
@ -701,7 +701,6 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription(
|
||||
col_decl.codec, column.type, sanity_check_compression_codecs, allow_experimental_codecs, enable_deflate_qpl_codec, enable_zstd_qat_codec);
|
||||
}
|
||||
|
||||
column.statistics.column_name = column.name; /// We assign column name here for better exception error message.
|
||||
if (col_decl.statistics_desc)
|
||||
{
|
||||
if (!skip_checks && !context_->getSettingsRef().allow_experimental_statistics)
|
||||
|
@ -54,13 +54,8 @@ void checkFinalInferredType(
|
||||
type = default_type;
|
||||
}
|
||||
|
||||
if (settings.schema_inference_make_columns_nullable)
|
||||
if (settings.schema_inference_make_columns_nullable == 1)
|
||||
type = makeNullableRecursively(type);
|
||||
/// In case when data for some column could contain nulls and regular values,
|
||||
/// resulting inferred type is Nullable.
|
||||
/// If input_format_null_as_default is enabled, we should remove Nullable type.
|
||||
else if (settings.null_as_default)
|
||||
type = removeNullable(type);
|
||||
}
|
||||
|
||||
void ISchemaReader::transformTypesIfNeeded(DB::DataTypePtr & type, DB::DataTypePtr & new_type)
|
||||
|
@ -204,8 +204,11 @@ NamesAndTypesList ArrowSchemaReader::readSchema()
|
||||
schema = file_reader->schema();
|
||||
|
||||
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(
|
||||
*schema, stream ? "ArrowStream" : "Arrow", format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference);
|
||||
if (format_settings.schema_inference_make_columns_nullable)
|
||||
*schema,
|
||||
stream ? "ArrowStream" : "Arrow",
|
||||
format_settings.arrow.skip_columns_with_unsupported_types_in_schema_inference,
|
||||
format_settings.schema_inference_make_columns_nullable != 0);
|
||||
if (format_settings.schema_inference_make_columns_nullable == 1)
|
||||
return getNamesAndRecursivelyNullableTypes(header);
|
||||
return header.getNamesAndTypesList();
|
||||
}
|
||||
|
@ -727,6 +727,7 @@ struct ReadColumnFromArrowColumnSettings
|
||||
FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior;
|
||||
bool allow_arrow_null_type;
|
||||
bool skip_columns_with_unsupported_types;
|
||||
bool allow_inferring_nullable_columns;
|
||||
};
|
||||
|
||||
static ColumnWithTypeAndName readColumnFromArrowColumn(
|
||||
@ -1109,7 +1110,7 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
|
||||
bool is_map_nested_column,
|
||||
const ReadColumnFromArrowColumnSettings & settings)
|
||||
{
|
||||
bool read_as_nullable_column = arrow_column->null_count() || is_nullable_column || (type_hint && type_hint->isNullable());
|
||||
bool read_as_nullable_column = (arrow_column->null_count() || is_nullable_column || (type_hint && type_hint->isNullable())) && settings.allow_inferring_nullable_columns;
|
||||
if (read_as_nullable_column &&
|
||||
arrow_column->type()->id() != arrow::Type::LIST &&
|
||||
arrow_column->type()->id() != arrow::Type::LARGE_LIST &&
|
||||
@ -1173,14 +1174,16 @@ static std::shared_ptr<arrow::ChunkedArray> createArrowColumn(const std::shared_
|
||||
Block ArrowColumnToCHColumn::arrowSchemaToCHHeader(
|
||||
const arrow::Schema & schema,
|
||||
const std::string & format_name,
|
||||
bool skip_columns_with_unsupported_types)
|
||||
bool skip_columns_with_unsupported_types,
|
||||
bool allow_inferring_nullable_columns)
|
||||
{
|
||||
ReadColumnFromArrowColumnSettings settings
|
||||
{
|
||||
.format_name = format_name,
|
||||
.date_time_overflow_behavior = FormatSettings::DateTimeOverflowBehavior::Ignore,
|
||||
.allow_arrow_null_type = false,
|
||||
.skip_columns_with_unsupported_types = skip_columns_with_unsupported_types
|
||||
.skip_columns_with_unsupported_types = skip_columns_with_unsupported_types,
|
||||
.allow_inferring_nullable_columns = allow_inferring_nullable_columns,
|
||||
};
|
||||
|
||||
ColumnsWithTypeAndName sample_columns;
|
||||
@ -1254,7 +1257,8 @@ Chunk ArrowColumnToCHColumn::arrowColumnsToCHChunk(const NameToArrowColumn & nam
|
||||
.format_name = format_name,
|
||||
.date_time_overflow_behavior = date_time_overflow_behavior,
|
||||
.allow_arrow_null_type = true,
|
||||
.skip_columns_with_unsupported_types = false
|
||||
.skip_columns_with_unsupported_types = false,
|
||||
.allow_inferring_nullable_columns = true
|
||||
};
|
||||
|
||||
Columns columns;
|
||||
|
@ -34,7 +34,8 @@ public:
|
||||
static Block arrowSchemaToCHHeader(
|
||||
const arrow::Schema & schema,
|
||||
const std::string & format_name,
|
||||
bool skip_columns_with_unsupported_types = false);
|
||||
bool skip_columns_with_unsupported_types = false,
|
||||
bool allow_inferring_nullable_columns = true);
|
||||
|
||||
struct DictionaryInfo
|
||||
{
|
||||
|
@ -15,8 +15,8 @@ namespace ErrorCodes
|
||||
}
|
||||
|
||||
template <bool with_defaults>
|
||||
BinaryRowInputFormat<with_defaults>::BinaryRowInputFormat(ReadBuffer & in_, const Block & header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
|
||||
: RowInputFormatWithNamesAndTypes(
|
||||
BinaryRowInputFormat<with_defaults>::BinaryRowInputFormat(ReadBuffer & in_, const Block & header, IRowInputFormat::Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
|
||||
: RowInputFormatWithNamesAndTypes<BinaryFormatReader<with_defaults>>(
|
||||
header,
|
||||
in_,
|
||||
params_,
|
||||
|
@ -10,13 +10,16 @@ namespace DB
|
||||
|
||||
class ReadBuffer;
|
||||
|
||||
template <bool>
|
||||
class BinaryFormatReader;
|
||||
|
||||
/** A stream for inputting data in a binary line-by-line format.
|
||||
*/
|
||||
template <bool with_defaults = false>
|
||||
class BinaryRowInputFormat final : public RowInputFormatWithNamesAndTypes
|
||||
class BinaryRowInputFormat final : public RowInputFormatWithNamesAndTypes<BinaryFormatReader<with_defaults>>
|
||||
{
|
||||
public:
|
||||
BinaryRowInputFormat(ReadBuffer & in_, const Block & header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_);
|
||||
BinaryRowInputFormat(ReadBuffer & in_, const Block & header, IRowInputFormat::Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_);
|
||||
|
||||
String getName() const override { return "BinaryRowInputFormat"; }
|
||||
|
||||
|
@ -61,7 +61,7 @@ CSVRowInputFormat::CSVRowInputFormat(
|
||||
bool with_names_,
|
||||
bool with_types_,
|
||||
const FormatSettings & format_settings_,
|
||||
std::unique_ptr<FormatWithNamesAndTypesReader> format_reader_)
|
||||
std::unique_ptr<CSVFormatReader> format_reader_)
|
||||
: RowInputFormatWithNamesAndTypes(
|
||||
header_,
|
||||
*in_,
|
||||
|
@ -1,7 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <optional>
|
||||
#include <unordered_map>
|
||||
|
||||
#include <Core/Block.h>
|
||||
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
|
||||
@ -13,10 +12,12 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class CSVFormatReader;
|
||||
|
||||
/** A stream for inputting data in csv format.
|
||||
* Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values.
|
||||
*/
|
||||
class CSVRowInputFormat : public RowInputFormatWithNamesAndTypes
|
||||
class CSVRowInputFormat : public RowInputFormatWithNamesAndTypes<CSVFormatReader>
|
||||
{
|
||||
public:
|
||||
/** with_names - in the first line the header with column names
|
||||
@ -32,7 +33,7 @@ public:
|
||||
|
||||
protected:
|
||||
CSVRowInputFormat(const Block & header_, std::shared_ptr<PeekableReadBuffer> in_, const Params & params_,
|
||||
bool with_names_, bool with_types_, const FormatSettings & format_settings_, std::unique_ptr<FormatWithNamesAndTypesReader> format_reader_);
|
||||
bool with_names_, bool with_types_, const FormatSettings & format_settings_, std::unique_ptr<CSVFormatReader> format_reader_);
|
||||
|
||||
CSVRowInputFormat(const Block & header_, std::shared_ptr<PeekableReadBuffer> in_buf_, const Params & params_,
|
||||
bool with_names_, bool with_types_, const FormatSettings & format_settings_);
|
||||
|
@ -9,7 +9,8 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class CustomSeparatedRowInputFormat final : public RowInputFormatWithNamesAndTypes
|
||||
class CustomSeparatedFormatReader;
|
||||
class CustomSeparatedRowInputFormat final : public RowInputFormatWithNamesAndTypes<CustomSeparatedFormatReader>
|
||||
{
|
||||
public:
|
||||
CustomSeparatedRowInputFormat(
|
||||
|
@ -11,7 +11,7 @@ namespace DB
|
||||
{
|
||||
|
||||
class ReadBuffer;
|
||||
|
||||
class JSONCompactEachRowFormatReader;
|
||||
|
||||
/** A stream for reading data in a bunch of formats:
|
||||
* - JSONCompactEachRow
|
||||
@ -20,7 +20,7 @@ class ReadBuffer;
|
||||
* - JSONCompactStringsEachRowWithNamesAndTypes
|
||||
*
|
||||
*/
|
||||
class JSONCompactEachRowRowInputFormat final : public RowInputFormatWithNamesAndTypes
|
||||
class JSONCompactEachRowRowInputFormat final : public RowInputFormatWithNamesAndTypes<JSONCompactEachRowFormatReader>
|
||||
{
|
||||
public:
|
||||
JSONCompactEachRowRowInputFormat(
|
||||
|
@ -14,7 +14,7 @@ namespace ErrorCodes
|
||||
|
||||
JSONCompactRowInputFormat::JSONCompactRowInputFormat(
|
||||
const Block & header_, ReadBuffer & in_, Params params_, const FormatSettings & format_settings_)
|
||||
: RowInputFormatWithNamesAndTypes(
|
||||
: RowInputFormatWithNamesAndTypes<JSONCompactFormatReader>(
|
||||
header_, in_, params_, false, false, false, format_settings_, std::make_unique<JSONCompactFormatReader>(in_, format_settings_))
|
||||
{
|
||||
}
|
||||
|
@ -5,8 +5,8 @@
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class JSONCompactRowInputFormat final : public RowInputFormatWithNamesAndTypes
|
||||
class JSONCompactFormatReader;
|
||||
class JSONCompactRowInputFormat final : public RowInputFormatWithNamesAndTypes<JSONCompactFormatReader>
|
||||
{
|
||||
public:
|
||||
JSONCompactRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_, const FormatSettings & format_settings_);
|
||||
|
@ -1002,7 +1002,7 @@ NamesAndTypesList NativeORCSchemaReader::readSchema()
|
||||
header.insert(ColumnWithTypeAndName{type, name});
|
||||
}
|
||||
|
||||
if (format_settings.schema_inference_make_columns_nullable)
|
||||
if (format_settings.schema_inference_make_columns_nullable == 1)
|
||||
return getNamesAndRecursivelyNullableTypes(header);
|
||||
return header.getNamesAndTypesList();
|
||||
}
|
||||
|
@ -160,8 +160,11 @@ NamesAndTypesList ORCSchemaReader::readSchema()
|
||||
{
|
||||
initializeIfNeeded();
|
||||
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(
|
||||
*schema, "ORC", format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference);
|
||||
if (format_settings.schema_inference_make_columns_nullable)
|
||||
*schema,
|
||||
"ORC",
|
||||
format_settings.orc.skip_columns_with_unsupported_types_in_schema_inference,
|
||||
format_settings.schema_inference_make_columns_nullable != 0);
|
||||
if (format_settings.schema_inference_make_columns_nullable == 1)
|
||||
return getNamesAndRecursivelyNullableTypes(header);
|
||||
return header.getNamesAndTypesList();
|
||||
}
|
||||
|
@ -869,8 +869,11 @@ NamesAndTypesList ParquetSchemaReader::readSchema()
|
||||
THROW_ARROW_NOT_OK(parquet::arrow::FromParquetSchema(metadata->schema(), &schema));
|
||||
|
||||
auto header = ArrowColumnToCHColumn::arrowSchemaToCHHeader(
|
||||
*schema, "Parquet", format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference);
|
||||
if (format_settings.schema_inference_make_columns_nullable)
|
||||
*schema,
|
||||
"Parquet",
|
||||
format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference,
|
||||
format_settings.schema_inference_make_columns_nullable != 0);
|
||||
if (format_settings.schema_inference_make_columns_nullable == 1)
|
||||
return getNamesAndRecursivelyNullableTypes(header);
|
||||
return header.getNamesAndTypesList();
|
||||
}
|
||||
|
@ -10,9 +10,11 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class TabSeparatedFormatReader;
|
||||
|
||||
/** A stream to input data in tsv format.
|
||||
*/
|
||||
class TabSeparatedRowInputFormat final : public RowInputFormatWithNamesAndTypes
|
||||
class TabSeparatedRowInputFormat final : public RowInputFormatWithNamesAndTypes<TabSeparatedFormatReader>
|
||||
{
|
||||
public:
|
||||
/** with_names - the first line is the header with the names of the columns
|
||||
|
@ -1,14 +1,20 @@
|
||||
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
|
||||
#include <Processors/Formats/ISchemaReader.h>
|
||||
#include <DataTypes/DataTypeNothing.h>
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <DataTypes/DataTypeFactory.h>
|
||||
#include <DataTypes/DataTypeLowCardinality.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <IO/Operators.h>
|
||||
#include <IO/ReadBufferFromString.h>
|
||||
#include <IO/PeekableReadBuffer.h>
|
||||
#include <DataTypes/DataTypeNothing.h>
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <Formats/EscapingRuleUtils.h>
|
||||
#include <IO/Operators.h>
|
||||
#include <IO/PeekableReadBuffer.h>
|
||||
#include <IO/ReadBufferFromString.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <Processors/Formats/ISchemaReader.h>
|
||||
#include <Processors/Formats/Impl/BinaryRowInputFormat.h>
|
||||
#include <Processors/Formats/Impl/CSVRowInputFormat.h>
|
||||
#include <Processors/Formats/Impl/CustomSeparatedRowInputFormat.h>
|
||||
#include <Processors/Formats/Impl/HiveTextRowInputFormat.h>
|
||||
#include <Processors/Formats/Impl/JSONCompactRowInputFormat.h>
|
||||
#include <Processors/Formats/Impl/TabSeparatedRowInputFormat.h>
|
||||
#include <Processors/Formats/RowInputFormatWithNamesAndTypes.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -44,7 +50,8 @@ namespace
|
||||
}
|
||||
}
|
||||
|
||||
RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes(
|
||||
template <typename FormatReaderImpl>
|
||||
RowInputFormatWithNamesAndTypes<FormatReaderImpl>::RowInputFormatWithNamesAndTypes(
|
||||
const Block & header_,
|
||||
ReadBuffer & in_,
|
||||
const Params & params_,
|
||||
@ -52,7 +59,7 @@ RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes(
|
||||
bool with_names_,
|
||||
bool with_types_,
|
||||
const FormatSettings & format_settings_,
|
||||
std::unique_ptr<FormatWithNamesAndTypesReader> format_reader_,
|
||||
std::unique_ptr<FormatReaderImpl> format_reader_,
|
||||
bool try_detect_header_)
|
||||
: RowInputFormatWithDiagnosticInfo(header_, in_, params_)
|
||||
, format_settings(format_settings_)
|
||||
@ -66,7 +73,8 @@ RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes(
|
||||
column_indexes_by_names = getPort().getHeader().getNamesToIndexesMap();
|
||||
}
|
||||
|
||||
void RowInputFormatWithNamesAndTypes::readPrefix()
|
||||
template <typename FormatReaderImpl>
|
||||
void RowInputFormatWithNamesAndTypes<FormatReaderImpl>::readPrefix()
|
||||
{
|
||||
/// Search and remove BOM only in textual formats (CSV, TSV etc), not in binary ones (RowBinary*).
|
||||
/// Also, we assume that column name or type cannot contain BOM, so, if format has header,
|
||||
@ -138,7 +146,8 @@ void RowInputFormatWithNamesAndTypes::readPrefix()
|
||||
}
|
||||
}
|
||||
|
||||
void RowInputFormatWithNamesAndTypes::tryDetectHeader(std::vector<String> & column_names_out, std::vector<String> & type_names_out)
|
||||
template <typename FormatReaderImpl>
|
||||
void RowInputFormatWithNamesAndTypes<FormatReaderImpl>::tryDetectHeader(std::vector<String> & column_names_out, std::vector<String> & type_names_out)
|
||||
{
|
||||
auto & read_buf = getReadBuffer();
|
||||
PeekableReadBuffer * peekable_buf = dynamic_cast<PeekableReadBuffer *>(&read_buf);
|
||||
@ -201,7 +210,8 @@ void RowInputFormatWithNamesAndTypes::tryDetectHeader(std::vector<String> & colu
|
||||
peekable_buf->dropCheckpoint();
|
||||
}
|
||||
|
||||
bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadExtension & ext)
|
||||
template <typename FormatReaderImpl>
|
||||
bool RowInputFormatWithNamesAndTypes<FormatReaderImpl>::readRow(MutableColumns & columns, RowReadExtension & ext)
|
||||
{
|
||||
if (unlikely(end_of_stream))
|
||||
return false;
|
||||
@ -280,7 +290,8 @@ bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadE
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t RowInputFormatWithNamesAndTypes::countRows(size_t max_block_size)
|
||||
template <typename FormatReaderImpl>
|
||||
size_t RowInputFormatWithNamesAndTypes<FormatReaderImpl>::countRows(size_t max_block_size)
|
||||
{
|
||||
if (unlikely(end_of_stream))
|
||||
return 0;
|
||||
@ -304,7 +315,8 @@ size_t RowInputFormatWithNamesAndTypes::countRows(size_t max_block_size)
|
||||
return num_rows;
|
||||
}
|
||||
|
||||
void RowInputFormatWithNamesAndTypes::resetParser()
|
||||
template <typename FormatReaderImpl>
|
||||
void RowInputFormatWithNamesAndTypes<FormatReaderImpl>::resetParser()
|
||||
{
|
||||
RowInputFormatWithDiagnosticInfo::resetParser();
|
||||
column_mapping->column_indexes_for_input_fields.clear();
|
||||
@ -313,7 +325,8 @@ void RowInputFormatWithNamesAndTypes::resetParser()
|
||||
end_of_stream = false;
|
||||
}
|
||||
|
||||
void RowInputFormatWithNamesAndTypes::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column)
|
||||
template <typename FormatReaderImpl>
|
||||
void RowInputFormatWithNamesAndTypes<FormatReaderImpl>::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column)
|
||||
{
|
||||
const auto & index = column_mapping->column_indexes_for_input_fields[file_column];
|
||||
if (index)
|
||||
@ -328,7 +341,8 @@ void RowInputFormatWithNamesAndTypes::tryDeserializeField(const DataTypePtr & ty
|
||||
}
|
||||
}
|
||||
|
||||
bool RowInputFormatWithNamesAndTypes::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
|
||||
template <typename FormatReaderImpl>
|
||||
bool RowInputFormatWithNamesAndTypes<FormatReaderImpl>::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out)
|
||||
{
|
||||
if (in->eof())
|
||||
{
|
||||
@ -374,12 +388,14 @@ bool RowInputFormatWithNamesAndTypes::parseRowAndPrintDiagnosticInfo(MutableColu
|
||||
return format_reader->parseRowEndWithDiagnosticInfo(out);
|
||||
}
|
||||
|
||||
bool RowInputFormatWithNamesAndTypes::isGarbageAfterField(size_t index, ReadBuffer::Position pos)
|
||||
template <typename FormatReaderImpl>
|
||||
bool RowInputFormatWithNamesAndTypes<FormatReaderImpl>::isGarbageAfterField(size_t index, ReadBuffer::Position pos)
|
||||
{
|
||||
return format_reader->isGarbageAfterField(index, pos);
|
||||
}
|
||||
|
||||
void RowInputFormatWithNamesAndTypes::setReadBuffer(ReadBuffer & in_)
|
||||
template <typename FormatReaderImpl>
|
||||
void RowInputFormatWithNamesAndTypes<FormatReaderImpl>::setReadBuffer(ReadBuffer & in_)
|
||||
{
|
||||
format_reader->setReadBuffer(in_);
|
||||
IInputFormat::setReadBuffer(in_);
|
||||
@ -582,5 +598,12 @@ void FormatWithNamesAndTypesSchemaReader::transformTypesIfNeeded(DB::DataTypePtr
|
||||
transformInferredTypesIfNeeded(type, new_type, format_settings);
|
||||
}
|
||||
|
||||
template class RowInputFormatWithNamesAndTypes<JSONCompactFormatReader>;
|
||||
template class RowInputFormatWithNamesAndTypes<JSONCompactEachRowFormatReader>;
|
||||
template class RowInputFormatWithNamesAndTypes<TabSeparatedFormatReader>;
|
||||
template class RowInputFormatWithNamesAndTypes<CSVFormatReader>;
|
||||
template class RowInputFormatWithNamesAndTypes<CustomSeparatedFormatReader>;
|
||||
template class RowInputFormatWithNamesAndTypes<BinaryFormatReader<true>>;
|
||||
template class RowInputFormatWithNamesAndTypes<BinaryFormatReader<false>>;
|
||||
}
|
||||
|
||||
|
@ -26,6 +26,7 @@ class FormatWithNamesAndTypesReader;
|
||||
/// will be compared types from header.
|
||||
/// It's important that firstly this class reads/skips names and only
|
||||
/// then reads/skips types. So you can rely on this invariant.
|
||||
template <typename FormatReaderImpl>
|
||||
class RowInputFormatWithNamesAndTypes : public RowInputFormatWithDiagnosticInfo
|
||||
{
|
||||
protected:
|
||||
@ -41,7 +42,7 @@ protected:
|
||||
bool with_names_,
|
||||
bool with_types_,
|
||||
const FormatSettings & format_settings_,
|
||||
std::unique_ptr<FormatWithNamesAndTypesReader> format_reader_,
|
||||
std::unique_ptr<FormatReaderImpl> format_reader_,
|
||||
bool try_detect_header_ = false);
|
||||
|
||||
void resetParser() override;
|
||||
@ -70,7 +71,7 @@ private:
|
||||
bool is_header_detected = false;
|
||||
|
||||
protected:
|
||||
std::unique_ptr<FormatWithNamesAndTypesReader> format_reader;
|
||||
std::unique_ptr<FormatReaderImpl> format_reader;
|
||||
Block::NameMap column_indexes_by_names;
|
||||
};
|
||||
|
||||
|
@ -255,7 +255,7 @@ void buildSortingDAG(QueryPlan::Node & node, std::optional<ActionsDAG> & dag, Fi
|
||||
|
||||
/// Add more functions to fixed columns.
|
||||
/// Functions result is fixed if all arguments are fixed or constants.
|
||||
void enreachFixedColumns(const ActionsDAG & dag, FixedColumns & fixed_columns)
|
||||
void enrichFixedColumns(const ActionsDAG & dag, FixedColumns & fixed_columns)
|
||||
{
|
||||
struct Frame
|
||||
{
|
||||
@ -300,20 +300,20 @@ void enreachFixedColumns(const ActionsDAG & dag, FixedColumns & fixed_columns)
|
||||
{
|
||||
if (frame.node->function_base->isDeterministicInScopeOfQuery())
|
||||
{
|
||||
//std::cerr << "*** enreachFixedColumns check " << frame.node->result_name << std::endl;
|
||||
//std::cerr << "*** enrichFixedColumns check " << frame.node->result_name << std::endl;
|
||||
bool all_args_fixed_or_const = true;
|
||||
for (const auto * child : frame.node->children)
|
||||
{
|
||||
if (!child->column && !fixed_columns.contains(child))
|
||||
{
|
||||
//std::cerr << "*** enreachFixedColumns fail " << child->result_name << ' ' << static_cast<const void *>(child) << std::endl;
|
||||
//std::cerr << "*** enrichFixedColumns fail " << child->result_name << ' ' << static_cast<const void *>(child) << std::endl;
|
||||
all_args_fixed_or_const = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (all_args_fixed_or_const)
|
||||
{
|
||||
//std::cerr << "*** enreachFixedColumns add " << frame.node->result_name << ' ' << static_cast<const void *>(frame.node) << std::endl;
|
||||
//std::cerr << "*** enrichFixedColumns add " << frame.node->result_name << ' ' << static_cast<const void *>(frame.node) << std::endl;
|
||||
fixed_columns.insert(frame.node);
|
||||
}
|
||||
}
|
||||
@ -357,7 +357,7 @@ InputOrderInfoPtr buildInputOrderInfo(
|
||||
}
|
||||
}
|
||||
|
||||
enreachFixedColumns(sorting_key_dag, fixed_key_columns);
|
||||
enrichFixedColumns(sorting_key_dag, fixed_key_columns);
|
||||
}
|
||||
|
||||
/// This is a result direction we will read from MergeTree
|
||||
@ -530,7 +530,7 @@ AggregationInputOrder buildInputOrderInfo(
|
||||
}
|
||||
}
|
||||
|
||||
enreachFixedColumns(sorting_key_dag, fixed_key_columns);
|
||||
enrichFixedColumns(sorting_key_dag, fixed_key_columns);
|
||||
|
||||
for (const auto * output : dag->getOutputs())
|
||||
{
|
||||
@ -804,7 +804,7 @@ InputOrderInfoPtr buildInputOrderInfo(SortingStep & sorting, QueryPlan::Node & n
|
||||
buildSortingDAG(node, dag, fixed_columns, limit);
|
||||
|
||||
if (dag && !fixed_columns.empty())
|
||||
enreachFixedColumns(*dag, fixed_columns);
|
||||
enrichFixedColumns(*dag, fixed_columns);
|
||||
|
||||
if (auto * reading = typeid_cast<ReadFromMergeTree *>(reading_node->step.get()))
|
||||
{
|
||||
@ -858,7 +858,7 @@ AggregationInputOrder buildInputOrderInfo(AggregatingStep & aggregating, QueryPl
|
||||
buildSortingDAG(node, dag, fixed_columns, limit);
|
||||
|
||||
if (dag && !fixed_columns.empty())
|
||||
enreachFixedColumns(*dag, fixed_columns);
|
||||
enrichFixedColumns(*dag, fixed_columns);
|
||||
|
||||
if (auto * reading = typeid_cast<ReadFromMergeTree *>(reading_node->step.get()))
|
||||
{
|
||||
|
@ -706,9 +706,9 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context)
|
||||
}
|
||||
|
||||
auto stats_vec = ColumnStatisticsDescription::fromAST(statistics_decl, metadata.columns);
|
||||
for (const auto & stats : stats_vec)
|
||||
for (const auto & [stats_column_name, stats] : stats_vec)
|
||||
{
|
||||
metadata.columns.modify(stats.column_name,
|
||||
metadata.columns.modify(stats_column_name,
|
||||
[&](ColumnDescription & column) { column.statistics.merge(stats, column.name, column.type, if_not_exists); });
|
||||
}
|
||||
}
|
||||
@ -735,14 +735,14 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context)
|
||||
{
|
||||
if (!metadata.columns.has(statistics_column_name))
|
||||
{
|
||||
throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Cannot add statistics for column {}: this column is not found", statistics_column_name);
|
||||
throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Cannot modify statistics for column {}: this column is not found", statistics_column_name);
|
||||
}
|
||||
}
|
||||
|
||||
auto stats_vec = ColumnStatisticsDescription::fromAST(statistics_decl, metadata.columns);
|
||||
for (const auto & stats : stats_vec)
|
||||
for (const auto & [stats_column_name, stats] : stats_vec)
|
||||
{
|
||||
metadata.columns.modify(stats.column_name,
|
||||
metadata.columns.modify(stats_column_name,
|
||||
[&](ColumnDescription & column) { column.statistics.assign(stats); });
|
||||
}
|
||||
}
|
||||
@ -867,8 +867,6 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context)
|
||||
rename_visitor.visit(column_to_modify.default_desc.expression);
|
||||
if (column_to_modify.ttl)
|
||||
rename_visitor.visit(column_to_modify.ttl);
|
||||
if (column_to_modify.name == column_name && !column_to_modify.statistics.empty())
|
||||
column_to_modify.statistics.column_name = rename_to;
|
||||
});
|
||||
}
|
||||
if (metadata.table_ttl.definition_ast)
|
||||
|
@ -218,11 +218,7 @@ void ColumnDescription::readText(ReadBuffer & buf)
|
||||
settings = col_ast->settings->as<ASTSetQuery &>().changes;
|
||||
|
||||
if (col_ast->statistics_desc)
|
||||
{
|
||||
statistics = ColumnStatisticsDescription::fromColumnDeclaration(*col_ast, type);
|
||||
/// every column has name `x` here, so we have to set the name manually.
|
||||
statistics.column_name = name;
|
||||
}
|
||||
}
|
||||
else
|
||||
throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Cannot parse column description");
|
||||
|
@ -807,7 +807,7 @@ MergeTreeDataPartBuilder IMergeTreeDataPart::getProjectionPartBuilder(const Stri
|
||||
const char * projection_extension = is_temp_projection ? ".tmp_proj" : ".proj";
|
||||
auto projection_storage = getDataPartStorage().getProjection(projection_name + projection_extension, !is_temp_projection);
|
||||
MergeTreeDataPartBuilder builder(storage, projection_name, projection_storage);
|
||||
return builder.withPartInfo({"all", 0, 0, 0}).withParentPart(this);
|
||||
return builder.withPartInfo(MergeListElement::FAKE_RESULT_PART_FOR_PROJECTION).withParentPart(this);
|
||||
}
|
||||
|
||||
void IMergeTreeDataPart::addProjectionPart(
|
||||
@ -1334,17 +1334,6 @@ void IMergeTreeDataPart::loadRowsCount()
|
||||
auto buf = metadata_manager->read("count.txt");
|
||||
readIntText(rows_count, *buf);
|
||||
assertEOF(*buf);
|
||||
|
||||
if (!index_granularity.empty() && rows_count < index_granularity.getTotalRows() && index_granularity_info.fixed_index_granularity)
|
||||
{
|
||||
/// Adjust last granule size to match the number of rows in the part in case of fixed index_granularity.
|
||||
index_granularity.popMark();
|
||||
index_granularity.appendMark(rows_count % index_granularity_info.fixed_index_granularity);
|
||||
if (rows_count != index_granularity.getTotalRows())
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR,
|
||||
"Index granularity total rows in part {} does not match rows_count: {}, instead of {}",
|
||||
name, index_granularity.getTotalRows(), rows_count);
|
||||
}
|
||||
};
|
||||
|
||||
if (index_granularity.empty())
|
||||
|
@ -6,10 +6,18 @@
|
||||
#include <Common/CurrentThread.h>
|
||||
#include <Common/MemoryTracker.h>
|
||||
|
||||
#include <Common/logger_useful.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
const MergeTreePartInfo MergeListElement::FAKE_RESULT_PART_FOR_PROJECTION = {"all", 0, 0, 0};
|
||||
|
||||
MergeListElement::MergeListElement(const StorageID & table_id_, FutureMergedMutatedPartPtr future_part, const ContextPtr & context)
|
||||
: table_id{table_id_}
|
||||
, partition_id{future_part->part_info.partition_id}
|
||||
@ -21,8 +29,23 @@ MergeListElement::MergeListElement(const StorageID & table_id_, FutureMergedMuta
|
||||
, merge_type{future_part->merge_type}
|
||||
, merge_algorithm{MergeAlgorithm::Undecided}
|
||||
{
|
||||
auto format_version = MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING;
|
||||
if (result_part_name != result_part_info.getPartNameV1())
|
||||
format_version = MERGE_TREE_DATA_OLD_FORMAT_VERSION;
|
||||
|
||||
/// FIXME why do we need a merge list element for projection parts at all?
|
||||
bool is_fake_projection_part = future_part->part_info == FAKE_RESULT_PART_FOR_PROJECTION;
|
||||
|
||||
size_t normal_parts_count = 0;
|
||||
for (const auto & source_part : future_part->parts)
|
||||
{
|
||||
if (!is_fake_projection_part && !source_part->getParentPart())
|
||||
{
|
||||
++normal_parts_count;
|
||||
if (!result_part_info.contains(MergeTreePartInfo::fromPartName(source_part->name, format_version)))
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Source part {} is not covered by result part {}", source_part->name, result_part_info.getPartNameV1());
|
||||
}
|
||||
|
||||
source_part_names.emplace_back(source_part->name);
|
||||
source_part_paths.emplace_back(source_part->getDataPartStorage().getFullPath());
|
||||
|
||||
@ -35,13 +58,17 @@ MergeListElement::MergeListElement(const StorageID & table_id_, FutureMergedMuta
|
||||
if (!future_part->parts.empty())
|
||||
{
|
||||
source_data_version = future_part->parts[0]->info.getDataVersion();
|
||||
is_mutation = (result_part_info.getDataVersion() != source_data_version);
|
||||
is_mutation = (result_part_info.level == future_part->parts[0]->info.level) && !is_fake_projection_part;
|
||||
|
||||
WriteBufferFromString out(partition);
|
||||
const auto & part = future_part->parts[0];
|
||||
part->partition.serializeText(part->storage, out, {});
|
||||
}
|
||||
|
||||
if (!is_fake_projection_part && is_mutation && normal_parts_count != 1)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Got {} source parts for mutation {}: {}", future_part->parts.size(),
|
||||
result_part_info.getPartNameV1(), fmt::join(source_part_names, ", "));
|
||||
|
||||
thread_group = ThreadGroup::createForBackgroundProcess(context);
|
||||
}
|
||||
|
||||
|
@ -66,6 +66,8 @@ struct Settings;
|
||||
|
||||
struct MergeListElement : boost::noncopyable
|
||||
{
|
||||
static const MergeTreePartInfo FAKE_RESULT_PART_FOR_PROJECTION;
|
||||
|
||||
const StorageID table_id;
|
||||
std::string partition_id;
|
||||
std::string partition;
|
||||
|
95
src/Storages/MergeTree/MergeProjectionPartsTask.cpp
Normal file
95
src/Storages/MergeTree/MergeProjectionPartsTask.cpp
Normal file
@ -0,0 +1,95 @@
|
||||
#include <Storages/MergeTree/MergeProjectionPartsTask.h>
|
||||
|
||||
#include <Common/TransactionID.h>
|
||||
#include <Storages/MergeTree/MergeList.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
bool MergeProjectionPartsTask::executeStep()
|
||||
{
|
||||
auto & current_level_parts = level_parts[current_level];
|
||||
auto & next_level_parts = level_parts[next_level];
|
||||
|
||||
MergeTreeData::MutableDataPartsVector selected_parts;
|
||||
while (selected_parts.size() < max_parts_to_merge_in_one_level && !current_level_parts.empty())
|
||||
{
|
||||
selected_parts.push_back(std::move(current_level_parts.back()));
|
||||
current_level_parts.pop_back();
|
||||
}
|
||||
|
||||
if (selected_parts.empty())
|
||||
{
|
||||
if (next_level_parts.empty())
|
||||
{
|
||||
LOG_WARNING(log, "There is no projection parts merged");
|
||||
|
||||
/// Task is finished
|
||||
return false;
|
||||
}
|
||||
current_level = next_level;
|
||||
++next_level;
|
||||
}
|
||||
else if (selected_parts.size() == 1)
|
||||
{
|
||||
if (next_level_parts.empty())
|
||||
{
|
||||
LOG_DEBUG(log, "Merged a projection part in level {}", current_level);
|
||||
selected_parts[0]->renameTo(projection.name + ".proj", true);
|
||||
selected_parts[0]->setName(projection.name);
|
||||
selected_parts[0]->is_temp = false;
|
||||
new_data_part->addProjectionPart(name, std::move(selected_parts[0]));
|
||||
|
||||
/// Task is finished
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_DEBUG(log, "Forwarded part {} in level {} to next level", selected_parts[0]->name, current_level);
|
||||
next_level_parts.push_back(std::move(selected_parts[0]));
|
||||
}
|
||||
}
|
||||
else if (selected_parts.size() > 1)
|
||||
{
|
||||
// Generate a unique part name
|
||||
++block_num;
|
||||
auto projection_future_part = std::make_shared<FutureMergedMutatedPart>();
|
||||
MergeTreeData::DataPartsVector const_selected_parts(
|
||||
std::make_move_iterator(selected_parts.begin()), std::make_move_iterator(selected_parts.end()));
|
||||
projection_future_part->assign(std::move(const_selected_parts));
|
||||
projection_future_part->name = fmt::format("{}_{}", projection.name, ++block_num);
|
||||
projection_future_part->part_info = {"all", 0, 0, 0};
|
||||
|
||||
MergeTreeData::MergingParams projection_merging_params;
|
||||
projection_merging_params.mode = MergeTreeData::MergingParams::Ordinary;
|
||||
if (projection.type == ProjectionDescription::Type::Aggregate)
|
||||
projection_merging_params.mode = MergeTreeData::MergingParams::Aggregating;
|
||||
|
||||
LOG_DEBUG(log, "Merged {} parts in level {} to {}", selected_parts.size(), current_level, projection_future_part->name);
|
||||
auto tmp_part_merge_task = mutator->mergePartsToTemporaryPart(
|
||||
projection_future_part,
|
||||
projection.metadata,
|
||||
merge_entry,
|
||||
std::make_unique<MergeListElement>((*merge_entry)->table_id, projection_future_part, context),
|
||||
*table_lock_holder,
|
||||
time_of_merge,
|
||||
context,
|
||||
space_reservation,
|
||||
false, // TODO Do we need deduplicate for projections
|
||||
{},
|
||||
false, // no cleanup
|
||||
projection_merging_params,
|
||||
NO_TRANSACTION_PTR,
|
||||
/* need_prefix */ true,
|
||||
new_data_part.get(),
|
||||
".tmp_proj");
|
||||
|
||||
next_level_parts.push_back(executeHere(tmp_part_merge_task));
|
||||
next_level_parts.back()->is_temp = true;
|
||||
}
|
||||
|
||||
/// Need execute again
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
84
src/Storages/MergeTree/MergeProjectionPartsTask.h
Normal file
84
src/Storages/MergeTree/MergeProjectionPartsTask.h
Normal file
@ -0,0 +1,84 @@
|
||||
#pragma once
|
||||
|
||||
#include <Interpreters/StorageID.h>
|
||||
#include <Storages/MergeTree/IExecutableTask.h>
|
||||
#include <Storages/MergeTree/MergeTreeData.h>
|
||||
#include <Storages/MergeTree/MergeTreeDataMergerMutator.h>
|
||||
#include <Storages/MergeTree/MergeProgress.h>
|
||||
#include <Storages/MergeTree/FutureMergedMutatedPart.h>
|
||||
#include <Storages/ProjectionsDescription.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
class MergeProjectionPartsTask : public IExecutableTask
|
||||
{
|
||||
public:
|
||||
|
||||
MergeProjectionPartsTask(
|
||||
String name_,
|
||||
MergeTreeData::MutableDataPartsVector && parts_,
|
||||
const ProjectionDescription & projection_,
|
||||
size_t & block_num_,
|
||||
ContextPtr context_,
|
||||
TableLockHolder * table_lock_holder_,
|
||||
MergeTreeDataMergerMutator * mutator_,
|
||||
MergeListEntry * merge_entry_,
|
||||
time_t time_of_merge_,
|
||||
MergeTreeData::MutableDataPartPtr new_data_part_,
|
||||
ReservationSharedPtr space_reservation_)
|
||||
: name(std::move(name_))
|
||||
, parts(std::move(parts_))
|
||||
, projection(projection_)
|
||||
, block_num(block_num_)
|
||||
, context(context_)
|
||||
, table_lock_holder(table_lock_holder_)
|
||||
, mutator(mutator_)
|
||||
, merge_entry(merge_entry_)
|
||||
, time_of_merge(time_of_merge_)
|
||||
, new_data_part(new_data_part_)
|
||||
, space_reservation(space_reservation_)
|
||||
, log(getLogger("MergeProjectionPartsTask"))
|
||||
{
|
||||
LOG_DEBUG(log, "Selected {} projection_parts from {} to {}", parts.size(), parts.front()->name, parts.back()->name);
|
||||
level_parts[current_level] = std::move(parts);
|
||||
}
|
||||
|
||||
void onCompleted() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); }
|
||||
StorageID getStorageID() const override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); }
|
||||
Priority getPriority() const override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); }
|
||||
String getQueryId() const override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); }
|
||||
|
||||
bool executeStep() override;
|
||||
|
||||
private:
|
||||
String name;
|
||||
MergeTreeData::MutableDataPartsVector parts;
|
||||
const ProjectionDescription & projection;
|
||||
size_t & block_num;
|
||||
|
||||
ContextPtr context;
|
||||
TableLockHolder * table_lock_holder;
|
||||
MergeTreeDataMergerMutator * mutator;
|
||||
MergeListEntry * merge_entry;
|
||||
time_t time_of_merge;
|
||||
|
||||
MergeTreeData::MutableDataPartPtr new_data_part;
|
||||
ReservationSharedPtr space_reservation;
|
||||
|
||||
LoggerPtr log;
|
||||
|
||||
std::map<size_t, MergeTreeData::MutableDataPartsVector> level_parts;
|
||||
size_t current_level = 0;
|
||||
size_t next_level = 1;
|
||||
|
||||
/// TODO(nikitamikhaylov): make this constant a setting
|
||||
static constexpr size_t max_parts_to_merge_in_one_level = 10;
|
||||
};
|
||||
|
||||
}
|
@ -21,6 +21,8 @@
|
||||
#include <Storages/MergeTree/MergeTreeSettings.h>
|
||||
#include <Storages/MergeTree/FutureMergedMutatedPart.h>
|
||||
#include <Storages/MergeTree/MergeTreeDataMergerMutator.h>
|
||||
#include <Storages/MergeTree/MergeTreeDataWriter.h>
|
||||
#include <Storages/MergeTree/MergeProjectionPartsTask.h>
|
||||
#include <Processors/Transforms/ExpressionTransform.h>
|
||||
#include <Processors/Transforms/MaterializingTransform.h>
|
||||
#include <Processors/Transforms/FilterTransform.h>
|
||||
@ -63,6 +65,7 @@ namespace ErrorCodes
|
||||
extern const int SUPPORT_IS_DISABLED;
|
||||
}
|
||||
|
||||
|
||||
static ColumnsStatistics getStatisticsForColumns(
|
||||
const NamesAndTypesList & columns_to_read,
|
||||
const StorageMetadataPtr & metadata_snapshot)
|
||||
@ -75,7 +78,7 @@ static ColumnsStatistics getStatisticsForColumns(
|
||||
const auto * desc = all_columns.tryGet(column.name);
|
||||
if (desc && !desc->statistics.empty())
|
||||
{
|
||||
auto statistics = MergeTreeStatisticsFactory::instance().get(desc->statistics);
|
||||
auto statistics = MergeTreeStatisticsFactory::instance().get(*desc);
|
||||
all_statistics.push_back(std::move(statistics));
|
||||
}
|
||||
}
|
||||
@ -155,6 +158,13 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::extractMergingAndGatheringColu
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto * projection : global_ctx->projections_to_rebuild)
|
||||
{
|
||||
Names projection_columns_vec = projection->getRequiredColumns();
|
||||
std::copy(projection_columns_vec.cbegin(), projection_columns_vec.cend(),
|
||||
std::inserter(key_columns, key_columns.end()));
|
||||
}
|
||||
|
||||
/// TODO: also force "summing" and "aggregating" columns to make Horizontal merge only for such columns
|
||||
|
||||
for (const auto & column : global_ctx->storage_columns)
|
||||
@ -254,6 +264,8 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
|
||||
extendObjectColumns(global_ctx->storage_columns, object_columns, false);
|
||||
global_ctx->storage_snapshot = std::make_shared<StorageSnapshot>(*global_ctx->data, global_ctx->metadata_snapshot, std::move(object_columns));
|
||||
|
||||
prepareProjectionsToMergeAndRebuild();
|
||||
|
||||
extractMergingAndGatheringColumns();
|
||||
|
||||
global_ctx->new_data_part->uuid = global_ctx->future_part->uuid;
|
||||
@ -517,6 +529,148 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::execute()
|
||||
}
|
||||
|
||||
|
||||
void MergeTask::ExecuteAndFinalizeHorizontalPart::prepareProjectionsToMergeAndRebuild() const
|
||||
{
|
||||
const auto mode = global_ctx->data->getSettings()->deduplicate_merge_projection_mode;
|
||||
/// Under throw mode, we still choose to drop projections due to backward compatibility since some
|
||||
/// users might have projections before this change.
|
||||
if (global_ctx->data->merging_params.mode != MergeTreeData::MergingParams::Ordinary
|
||||
&& (mode == DeduplicateMergeProjectionMode::THROW || mode == DeduplicateMergeProjectionMode::DROP))
|
||||
return;
|
||||
|
||||
/// These merging modes may or may not reduce number of rows. It's not known until the horizontal stage is finished.
|
||||
const bool merge_may_reduce_rows =
|
||||
global_ctx->cleanup ||
|
||||
global_ctx->deduplicate ||
|
||||
ctx->merging_params.mode == MergeTreeData::MergingParams::Collapsing ||
|
||||
ctx->merging_params.mode == MergeTreeData::MergingParams::Replacing ||
|
||||
ctx->merging_params.mode == MergeTreeData::MergingParams::VersionedCollapsing;
|
||||
|
||||
const auto & projections = global_ctx->metadata_snapshot->getProjections();
|
||||
|
||||
for (const auto & projection : projections)
|
||||
{
|
||||
if (merge_may_reduce_rows)
|
||||
{
|
||||
global_ctx->projections_to_rebuild.push_back(&projection);
|
||||
continue;
|
||||
}
|
||||
|
||||
MergeTreeData::DataPartsVector projection_parts;
|
||||
for (const auto & part : global_ctx->future_part->parts)
|
||||
{
|
||||
auto it = part->getProjectionParts().find(projection.name);
|
||||
if (it != part->getProjectionParts().end() && !it->second->is_broken)
|
||||
projection_parts.push_back(it->second);
|
||||
}
|
||||
if (projection_parts.size() == global_ctx->future_part->parts.size())
|
||||
{
|
||||
global_ctx->projections_to_merge.push_back(&projection);
|
||||
global_ctx->projections_to_merge_parts[projection.name].assign(projection_parts.begin(), projection_parts.end());
|
||||
}
|
||||
else
|
||||
{
|
||||
chassert(projection_parts.size() < global_ctx->future_part->parts.size());
|
||||
LOG_DEBUG(ctx->log, "Projection {} is not merged because some parts don't have it", projection.name);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
const auto & settings = global_ctx->context->getSettingsRef();
|
||||
|
||||
for (const auto * projection : global_ctx->projections_to_rebuild)
|
||||
ctx->projection_squashes.emplace_back(projection->sample_block.cloneEmpty(),
|
||||
settings.min_insert_block_size_rows, settings.min_insert_block_size_bytes);
|
||||
}
|
||||
|
||||
|
||||
void MergeTask::ExecuteAndFinalizeHorizontalPart::calculateProjections(const Block & block) const
|
||||
{
|
||||
for (size_t i = 0, size = global_ctx->projections_to_rebuild.size(); i < size; ++i)
|
||||
{
|
||||
const auto & projection = *global_ctx->projections_to_rebuild[i];
|
||||
Block block_to_squash = projection.calculate(block, global_ctx->context);
|
||||
auto & projection_squash_plan = ctx->projection_squashes[i];
|
||||
projection_squash_plan.setHeader(block_to_squash.cloneEmpty());
|
||||
Chunk squashed_chunk = Squashing::squash(projection_squash_plan.add({block_to_squash.getColumns(), block_to_squash.rows()}));
|
||||
if (squashed_chunk)
|
||||
{
|
||||
auto result = projection_squash_plan.getHeader().cloneWithColumns(squashed_chunk.detachColumns());
|
||||
auto tmp_part = MergeTreeDataWriter::writeTempProjectionPart(
|
||||
*global_ctx->data, ctx->log, result, projection, global_ctx->new_data_part.get(), ++ctx->projection_block_num);
|
||||
tmp_part.finalize();
|
||||
tmp_part.part->getDataPartStorage().commitTransaction();
|
||||
ctx->projection_parts[projection.name].emplace_back(std::move(tmp_part.part));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void MergeTask::ExecuteAndFinalizeHorizontalPart::finalizeProjections() const
|
||||
{
|
||||
for (size_t i = 0, size = global_ctx->projections_to_rebuild.size(); i < size; ++i)
|
||||
{
|
||||
const auto & projection = *global_ctx->projections_to_rebuild[i];
|
||||
auto & projection_squash_plan = ctx->projection_squashes[i];
|
||||
auto squashed_chunk = Squashing::squash(projection_squash_plan.flush());
|
||||
if (squashed_chunk)
|
||||
{
|
||||
auto result = projection_squash_plan.getHeader().cloneWithColumns(squashed_chunk.detachColumns());
|
||||
auto temp_part = MergeTreeDataWriter::writeTempProjectionPart(
|
||||
*global_ctx->data, ctx->log, result, projection, global_ctx->new_data_part.get(), ++ctx->projection_block_num);
|
||||
temp_part.finalize();
|
||||
temp_part.part->getDataPartStorage().commitTransaction();
|
||||
ctx->projection_parts[projection.name].emplace_back(std::move(temp_part.part));
|
||||
}
|
||||
}
|
||||
|
||||
ctx->projection_parts_iterator = std::make_move_iterator(ctx->projection_parts.begin());
|
||||
if (ctx->projection_parts_iterator != std::make_move_iterator(ctx->projection_parts.end()))
|
||||
constructTaskForProjectionPartsMerge();
|
||||
}
|
||||
|
||||
|
||||
void MergeTask::ExecuteAndFinalizeHorizontalPart::constructTaskForProjectionPartsMerge() const
|
||||
{
|
||||
auto && [name, parts] = *ctx->projection_parts_iterator;
|
||||
const auto & projection = global_ctx->metadata_snapshot->projections.get(name);
|
||||
|
||||
ctx->merge_projection_parts_task_ptr = std::make_unique<MergeProjectionPartsTask>
|
||||
(
|
||||
name,
|
||||
std::move(parts),
|
||||
projection,
|
||||
ctx->projection_block_num,
|
||||
global_ctx->context,
|
||||
global_ctx->holder,
|
||||
global_ctx->mutator,
|
||||
global_ctx->merge_entry,
|
||||
global_ctx->time_of_merge,
|
||||
global_ctx->new_data_part,
|
||||
global_ctx->space_reservation
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
bool MergeTask::ExecuteAndFinalizeHorizontalPart::executeMergeProjections() // NOLINT
|
||||
{
|
||||
/// In case if there are no projections we didn't construct a task
|
||||
if (!ctx->merge_projection_parts_task_ptr)
|
||||
return false;
|
||||
|
||||
if (ctx->merge_projection_parts_task_ptr->executeStep())
|
||||
return true;
|
||||
|
||||
++ctx->projection_parts_iterator;
|
||||
|
||||
if (ctx->projection_parts_iterator == std::make_move_iterator(ctx->projection_parts.end()))
|
||||
return false;
|
||||
|
||||
constructTaskForProjectionPartsMerge();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool MergeTask::ExecuteAndFinalizeHorizontalPart::executeImpl()
|
||||
{
|
||||
Stopwatch watch(CLOCK_MONOTONIC_COARSE);
|
||||
@ -535,6 +689,8 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::executeImpl()
|
||||
global_ctx->rows_written += block.rows();
|
||||
const_cast<MergedBlockOutputStream &>(*global_ctx->to).write(block);
|
||||
|
||||
calculateProjections(block);
|
||||
|
||||
UInt64 result_rows = 0;
|
||||
UInt64 result_bytes = 0;
|
||||
global_ctx->merged_pipeline.tryGetResultRowsAndBytes(result_rows, result_bytes);
|
||||
@ -558,8 +714,10 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::executeImpl()
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
void MergeTask::ExecuteAndFinalizeHorizontalPart::finalize() const
|
||||
{
|
||||
finalizeProjections();
|
||||
global_ctx->merging_executor.reset();
|
||||
global_ctx->merged_pipeline.reset();
|
||||
|
||||
@ -847,35 +1005,9 @@ bool MergeTask::MergeProjectionsStage::mergeMinMaxIndexAndPrepareProjections() c
|
||||
ReadableSize(global_ctx->merge_list_element_ptr->bytes_read_uncompressed / elapsed_seconds));
|
||||
}
|
||||
|
||||
|
||||
const auto mode = global_ctx->data->getSettings()->deduplicate_merge_projection_mode;
|
||||
/// Under throw mode, we still choose to drop projections due to backward compatibility since some
|
||||
/// users might have projections before this change.
|
||||
if (global_ctx->data->merging_params.mode != MergeTreeData::MergingParams::Ordinary
|
||||
&& (mode == DeduplicateMergeProjectionMode::THROW || mode == DeduplicateMergeProjectionMode::DROP))
|
||||
for (const auto & projection : global_ctx->projections_to_merge)
|
||||
{
|
||||
ctx->projections_iterator = ctx->tasks_for_projections.begin();
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto & projections = global_ctx->metadata_snapshot->getProjections();
|
||||
|
||||
for (const auto & projection : projections)
|
||||
{
|
||||
MergeTreeData::DataPartsVector projection_parts;
|
||||
for (const auto & part : global_ctx->future_part->parts)
|
||||
{
|
||||
auto actual_projection_parts = part->getProjectionParts();
|
||||
auto it = actual_projection_parts.find(projection.name);
|
||||
if (it != actual_projection_parts.end() && !it->second->is_broken)
|
||||
projection_parts.push_back(it->second);
|
||||
}
|
||||
if (projection_parts.size() < global_ctx->future_part->parts.size())
|
||||
{
|
||||
LOG_DEBUG(ctx->log, "Projection {} is not merged because some parts don't have it", projection.name);
|
||||
continue;
|
||||
}
|
||||
|
||||
MergeTreeData::DataPartsVector projection_parts = global_ctx->projections_to_merge_parts[projection->name];
|
||||
LOG_DEBUG(
|
||||
ctx->log,
|
||||
"Selected {} projection_parts from {} to {}",
|
||||
@ -885,24 +1017,25 @@ bool MergeTask::MergeProjectionsStage::mergeMinMaxIndexAndPrepareProjections() c
|
||||
|
||||
auto projection_future_part = std::make_shared<FutureMergedMutatedPart>();
|
||||
projection_future_part->assign(std::move(projection_parts));
|
||||
projection_future_part->name = projection.name;
|
||||
projection_future_part->name = projection->name;
|
||||
// TODO (ab): path in future_part is only for merge process introspection, which is not available for merges of projection parts.
|
||||
// Let's comment this out to avoid code inconsistency and add it back after we implement projection merge introspection.
|
||||
// projection_future_part->path = global_ctx->future_part->path + "/" + projection.name + ".proj/";
|
||||
projection_future_part->part_info = {"all", 0, 0, 0};
|
||||
projection_future_part->part_info = MergeListElement::FAKE_RESULT_PART_FOR_PROJECTION;
|
||||
|
||||
MergeTreeData::MergingParams projection_merging_params;
|
||||
projection_merging_params.mode = MergeTreeData::MergingParams::Ordinary;
|
||||
if (projection.type == ProjectionDescription::Type::Aggregate)
|
||||
if (projection->type == ProjectionDescription::Type::Aggregate)
|
||||
projection_merging_params.mode = MergeTreeData::MergingParams::Aggregating;
|
||||
|
||||
ctx->tasks_for_projections.emplace_back(std::make_shared<MergeTask>(
|
||||
projection_future_part,
|
||||
projection.metadata,
|
||||
projection->metadata,
|
||||
global_ctx->merge_entry,
|
||||
std::make_unique<MergeListElement>((*global_ctx->merge_entry)->table_id, projection_future_part, global_ctx->context),
|
||||
global_ctx->time_of_merge,
|
||||
global_ctx->context,
|
||||
*global_ctx->holder,
|
||||
global_ctx->space_reservation,
|
||||
global_ctx->deduplicate,
|
||||
global_ctx->deduplicate_by_columns,
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include <Compression/CompressedReadBuffer.h>
|
||||
#include <Compression/CompressedReadBufferFromFile.h>
|
||||
|
||||
#include <Interpreters/Squashing.h>
|
||||
#include <Interpreters/TemporaryDataOnDisk.h>
|
||||
|
||||
#include <Processors/Executors/PullingPipelineExecutor.h>
|
||||
@ -72,6 +73,7 @@ public:
|
||||
std::unique_ptr<MergeListElement> projection_merge_list_element_,
|
||||
time_t time_of_merge_,
|
||||
ContextPtr context_,
|
||||
TableLockHolder & holder,
|
||||
ReservationSharedPtr space_reservation_,
|
||||
bool deduplicate_,
|
||||
Names deduplicate_by_columns_,
|
||||
@ -96,6 +98,7 @@ public:
|
||||
= global_ctx->projection_merge_list_element ? global_ctx->projection_merge_list_element.get() : (*global_ctx->merge_entry)->ptr();
|
||||
global_ctx->time_of_merge = std::move(time_of_merge_);
|
||||
global_ctx->context = std::move(context_);
|
||||
global_ctx->holder = &holder;
|
||||
global_ctx->space_reservation = std::move(space_reservation_);
|
||||
global_ctx->deduplicate = std::move(deduplicate_);
|
||||
global_ctx->deduplicate_by_columns = std::move(deduplicate_by_columns_);
|
||||
@ -151,6 +154,7 @@ private:
|
||||
/// Proper initialization is the responsibility of the author
|
||||
struct GlobalRuntimeContext : public IStageRuntimeContext
|
||||
{
|
||||
TableLockHolder * holder;
|
||||
MergeList::Entry * merge_entry{nullptr};
|
||||
/// If not null, use this instead of the global MergeList::Entry. This is for merging projections.
|
||||
std::unique_ptr<MergeListElement> projection_merge_list_element;
|
||||
@ -181,6 +185,10 @@ private:
|
||||
|
||||
MergeAlgorithm chosen_merge_algorithm{MergeAlgorithm::Undecided};
|
||||
|
||||
std::vector<ProjectionDescriptionRawPtr> projections_to_rebuild{};
|
||||
std::vector<ProjectionDescriptionRawPtr> projections_to_merge{};
|
||||
std::map<String, MergeTreeData::DataPartsVector> projections_to_merge_parts{};
|
||||
|
||||
std::unique_ptr<MergeStageProgress> horizontal_stage_progress{nullptr};
|
||||
std::unique_ptr<MergeStageProgress> column_progress{nullptr};
|
||||
|
||||
@ -228,6 +236,14 @@ private:
|
||||
std::unique_ptr<WriteBuffer> rows_sources_write_buf{nullptr};
|
||||
std::optional<ColumnSizeEstimator> column_sizes{};
|
||||
|
||||
/// For projections to rebuild
|
||||
using ProjectionNameToItsBlocks = std::map<String, MergeTreeData::MutableDataPartsVector>;
|
||||
ProjectionNameToItsBlocks projection_parts;
|
||||
std::move_iterator<ProjectionNameToItsBlocks::iterator> projection_parts_iterator;
|
||||
std::vector<Squashing> projection_squashes;
|
||||
size_t projection_block_num = 0;
|
||||
ExecutableTaskPtr merge_projection_parts_task_ptr;
|
||||
|
||||
size_t initial_reservation{0};
|
||||
bool read_with_direct_io{false};
|
||||
|
||||
@ -257,16 +273,23 @@ private:
|
||||
void finalize() const;
|
||||
|
||||
/// NOTE: Using pointer-to-member instead of std::function and lambda makes stacktraces much more concise and readable
|
||||
using ExecuteAndFinalizeHorizontalPartSubtasks = std::array<bool(ExecuteAndFinalizeHorizontalPart::*)(), 2>;
|
||||
using ExecuteAndFinalizeHorizontalPartSubtasks = std::array<bool(ExecuteAndFinalizeHorizontalPart::*)(), 3>;
|
||||
|
||||
const ExecuteAndFinalizeHorizontalPartSubtasks subtasks
|
||||
{
|
||||
&ExecuteAndFinalizeHorizontalPart::prepare,
|
||||
&ExecuteAndFinalizeHorizontalPart::executeImpl
|
||||
&ExecuteAndFinalizeHorizontalPart::executeImpl,
|
||||
&ExecuteAndFinalizeHorizontalPart::executeMergeProjections
|
||||
};
|
||||
|
||||
ExecuteAndFinalizeHorizontalPartSubtasks::const_iterator subtasks_iterator = subtasks.begin();
|
||||
|
||||
void prepareProjectionsToMergeAndRebuild() const;
|
||||
void calculateProjections(const Block & block) const;
|
||||
void finalizeProjections() const;
|
||||
void constructTaskForProjectionPartsMerge() const;
|
||||
bool executeMergeProjections();
|
||||
|
||||
MergeAlgorithm chooseMergeAlgorithm() const;
|
||||
void createMergedStream();
|
||||
void extractMergingAndGatheringColumns() const;
|
||||
|
@ -671,7 +671,7 @@ MergeTaskPtr MergeTreeDataMergerMutator::mergePartsToTemporaryPart(
|
||||
const StorageMetadataPtr & metadata_snapshot,
|
||||
MergeList::Entry * merge_entry,
|
||||
std::unique_ptr<MergeListElement> projection_merge_list_element,
|
||||
TableLockHolder,
|
||||
TableLockHolder & holder,
|
||||
time_t time_of_merge,
|
||||
ContextPtr context,
|
||||
ReservationSharedPtr space_reservation,
|
||||
@ -691,6 +691,7 @@ MergeTaskPtr MergeTreeDataMergerMutator::mergePartsToTemporaryPart(
|
||||
std::move(projection_merge_list_element),
|
||||
time_of_merge,
|
||||
context,
|
||||
holder,
|
||||
space_reservation,
|
||||
deduplicate,
|
||||
deduplicate_by_columns,
|
||||
|
@ -159,7 +159,7 @@ public:
|
||||
const StorageMetadataPtr & metadata_snapshot,
|
||||
MergeListEntry * merge_entry,
|
||||
std::unique_ptr<MergeListElement> projection_merge_list_element,
|
||||
TableLockHolder table_lock_holder,
|
||||
TableLockHolder & table_lock_holder,
|
||||
time_t time_of_merge,
|
||||
ContextPtr context,
|
||||
ReservationSharedPtr space_reservation,
|
||||
|
@ -577,10 +577,7 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const NameAndTypePai
|
||||
|
||||
if (index_granularity_rows != index_granularity.getMarkRows(mark_num))
|
||||
{
|
||||
/// With fixed granularity we can have last mark with less rows than granularity
|
||||
const bool is_last_mark = (mark_num + 1 == index_granularity.getMarksCount());
|
||||
if (!index_granularity_info.fixed_index_granularity || !is_last_mark)
|
||||
throw Exception(
|
||||
throw Exception(
|
||||
ErrorCodes::LOGICAL_ERROR,
|
||||
"Incorrect mark rows for part {} for mark #{}"
|
||||
" (compressed offset {}, decompressed offset {}), in-memory {}, on disk {}, total marks {}",
|
||||
@ -844,14 +841,7 @@ void MergeTreeDataPartWriterWide::adjustLastMarkIfNeedAndFlushToDisk(size_t new_
|
||||
/// Without offset
|
||||
rows_written_in_last_mark = 0;
|
||||
}
|
||||
|
||||
if (compute_granularity)
|
||||
{
|
||||
index_granularity.popMark();
|
||||
index_granularity.appendMark(new_rows_in_last_mark);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include <Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h>
|
||||
#include <Storages/MergeTree/StorageFromMergeTreeDataPart.h>
|
||||
#include <Storages/MergeTree/MergeTreeDataWriter.h>
|
||||
#include <Storages/MergeTree/MergeProjectionPartsTask.h>
|
||||
#include <Storages/MutationCommands.h>
|
||||
#include <Storages/MergeTree/MergeTreeDataMergerMutator.h>
|
||||
#include <Storages/MergeTree/MergeTreeIndexFullText.h>
|
||||
@ -552,7 +553,7 @@ static std::set<ColumnStatisticsPtr> getStatisticsToRecalculate(const StorageMet
|
||||
{
|
||||
if (!col_desc.statistics.empty() && materialized_stats.contains(col_desc.name))
|
||||
{
|
||||
stats_to_recalc.insert(stats_factory.get(col_desc.statistics));
|
||||
stats_to_recalc.insert(stats_factory.get(col_desc));
|
||||
}
|
||||
}
|
||||
return stats_to_recalc;
|
||||
@ -1058,136 +1059,6 @@ struct MutationContext
|
||||
using MutationContextPtr = std::shared_ptr<MutationContext>;
|
||||
|
||||
|
||||
class MergeProjectionPartsTask : public IExecutableTask
|
||||
{
|
||||
public:
|
||||
|
||||
MergeProjectionPartsTask(
|
||||
String name_,
|
||||
MergeTreeData::MutableDataPartsVector && parts_,
|
||||
const ProjectionDescription & projection_,
|
||||
size_t & block_num_,
|
||||
MutationContextPtr ctx_)
|
||||
: name(std::move(name_))
|
||||
, parts(std::move(parts_))
|
||||
, projection(projection_)
|
||||
, block_num(block_num_)
|
||||
, ctx(ctx_)
|
||||
, log(getLogger("MergeProjectionPartsTask"))
|
||||
{
|
||||
LOG_DEBUG(log, "Selected {} projection_parts from {} to {}", parts.size(), parts.front()->name, parts.back()->name);
|
||||
level_parts[current_level] = std::move(parts);
|
||||
}
|
||||
|
||||
void onCompleted() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); }
|
||||
StorageID getStorageID() const override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); }
|
||||
Priority getPriority() const override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); }
|
||||
String getQueryId() const override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); }
|
||||
|
||||
bool executeStep() override
|
||||
{
|
||||
auto & current_level_parts = level_parts[current_level];
|
||||
auto & next_level_parts = level_parts[next_level];
|
||||
|
||||
MergeTreeData::MutableDataPartsVector selected_parts;
|
||||
while (selected_parts.size() < max_parts_to_merge_in_one_level && !current_level_parts.empty())
|
||||
{
|
||||
selected_parts.push_back(std::move(current_level_parts.back()));
|
||||
current_level_parts.pop_back();
|
||||
}
|
||||
|
||||
if (selected_parts.empty())
|
||||
{
|
||||
if (next_level_parts.empty())
|
||||
{
|
||||
LOG_WARNING(log, "There is no projection parts merged");
|
||||
|
||||
/// Task is finished
|
||||
return false;
|
||||
}
|
||||
current_level = next_level;
|
||||
++next_level;
|
||||
}
|
||||
else if (selected_parts.size() == 1)
|
||||
{
|
||||
if (next_level_parts.empty())
|
||||
{
|
||||
LOG_DEBUG(log, "Merged a projection part in level {}", current_level);
|
||||
selected_parts[0]->renameTo(projection.name + ".proj", true);
|
||||
selected_parts[0]->setName(projection.name);
|
||||
selected_parts[0]->is_temp = false;
|
||||
ctx->new_data_part->addProjectionPart(name, std::move(selected_parts[0]));
|
||||
|
||||
/// Task is finished
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
LOG_DEBUG(log, "Forwarded part {} in level {} to next level", selected_parts[0]->name, current_level);
|
||||
next_level_parts.push_back(std::move(selected_parts[0]));
|
||||
}
|
||||
}
|
||||
else if (selected_parts.size() > 1)
|
||||
{
|
||||
// Generate a unique part name
|
||||
++block_num;
|
||||
auto projection_future_part = std::make_shared<FutureMergedMutatedPart>();
|
||||
MergeTreeData::DataPartsVector const_selected_parts(
|
||||
std::make_move_iterator(selected_parts.begin()), std::make_move_iterator(selected_parts.end()));
|
||||
projection_future_part->assign(std::move(const_selected_parts));
|
||||
projection_future_part->name = fmt::format("{}_{}", projection.name, ++block_num);
|
||||
projection_future_part->part_info = {"all", 0, 0, 0};
|
||||
|
||||
MergeTreeData::MergingParams projection_merging_params;
|
||||
projection_merging_params.mode = MergeTreeData::MergingParams::Ordinary;
|
||||
if (projection.type == ProjectionDescription::Type::Aggregate)
|
||||
projection_merging_params.mode = MergeTreeData::MergingParams::Aggregating;
|
||||
|
||||
LOG_DEBUG(log, "Merged {} parts in level {} to {}", selected_parts.size(), current_level, projection_future_part->name);
|
||||
auto tmp_part_merge_task = ctx->mutator->mergePartsToTemporaryPart(
|
||||
projection_future_part,
|
||||
projection.metadata,
|
||||
ctx->mutate_entry,
|
||||
std::make_unique<MergeListElement>((*ctx->mutate_entry)->table_id, projection_future_part, ctx->context),
|
||||
*ctx->holder,
|
||||
ctx->time_of_mutation,
|
||||
ctx->context,
|
||||
ctx->space_reservation,
|
||||
false, // TODO Do we need deduplicate for projections
|
||||
{},
|
||||
false, // no cleanup
|
||||
projection_merging_params,
|
||||
NO_TRANSACTION_PTR,
|
||||
/* need_prefix */ true,
|
||||
ctx->new_data_part.get(),
|
||||
".tmp_proj");
|
||||
|
||||
next_level_parts.push_back(executeHere(tmp_part_merge_task));
|
||||
next_level_parts.back()->is_temp = true;
|
||||
}
|
||||
|
||||
/// Need execute again
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
String name;
|
||||
MergeTreeData::MutableDataPartsVector parts;
|
||||
const ProjectionDescription & projection;
|
||||
size_t & block_num;
|
||||
MutationContextPtr ctx;
|
||||
|
||||
LoggerPtr log;
|
||||
|
||||
std::map<size_t, MergeTreeData::MutableDataPartsVector> level_parts;
|
||||
size_t current_level = 0;
|
||||
size_t next_level = 1;
|
||||
|
||||
/// TODO(nikitamikhaylov): make this constant a setting
|
||||
static constexpr size_t max_parts_to_merge_in_one_level = 10;
|
||||
};
|
||||
|
||||
|
||||
// This class is responsible for:
|
||||
// 1. get projection pipeline and a sink to write parts
|
||||
// 2. build an executor that can write block to the input stream (actually we can write through it to generate as many parts as possible)
|
||||
@ -1406,7 +1277,13 @@ void PartMergerWriter::constructTaskForProjectionPartsMerge()
|
||||
std::move(parts),
|
||||
projection,
|
||||
block_num,
|
||||
ctx
|
||||
ctx->context,
|
||||
ctx->holder,
|
||||
ctx->mutator,
|
||||
ctx->mutate_entry,
|
||||
ctx->time_of_mutation,
|
||||
ctx->new_data_part,
|
||||
ctx->space_reservation
|
||||
);
|
||||
}
|
||||
|
||||
@ -1557,7 +1434,7 @@ private:
|
||||
|
||||
if (ctx->materialized_statistics.contains(col.name))
|
||||
{
|
||||
stats_to_rewrite.push_back(MergeTreeStatisticsFactory::instance().get(col.statistics));
|
||||
stats_to_rewrite.push_back(MergeTreeStatisticsFactory::instance().get(col));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -148,10 +148,12 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context,
|
||||
{
|
||||
if (engine_args.size() < 3 || engine_args.size() > (with_structure ? 8 : 7))
|
||||
{
|
||||
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
|
||||
"Storage AzureBlobStorage requires 3 to 7 arguments: "
|
||||
"AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, "
|
||||
"[account_name, account_key, format, compression, structure)])");
|
||||
throw Exception(
|
||||
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
|
||||
"Storage AzureBlobStorage requires 3 to {} arguments: "
|
||||
"AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, "
|
||||
"[account_name, account_key, format, compression, structure)])",
|
||||
(with_structure ? 8 : 7));
|
||||
}
|
||||
|
||||
for (auto & engine_arg : engine_args)
|
||||
|
@ -3,7 +3,7 @@
|
||||
#include "config.h"
|
||||
#include <set>
|
||||
|
||||
#if USE_AWS_S3 && USE_PARQUET
|
||||
#if USE_PARQUET
|
||||
|
||||
#include <Common/logger_useful.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
@ -425,8 +425,9 @@ struct DeltaLakeMetadataImpl
|
||||
{
|
||||
auto field = fields->getObject(static_cast<Int32>(i));
|
||||
element_names.push_back(field->getValue<String>("name"));
|
||||
auto required = field->getValue<bool>("required");
|
||||
element_types.push_back(getFieldType(field, "type", required));
|
||||
|
||||
auto is_nullable = field->getValue<bool>("nullable");
|
||||
element_types.push_back(getFieldType(field, "type", is_nullable));
|
||||
}
|
||||
|
||||
return std::make_shared<DataTypeTuple>(element_types, element_names);
|
||||
@ -434,16 +435,16 @@ struct DeltaLakeMetadataImpl
|
||||
|
||||
if (type_name == "array")
|
||||
{
|
||||
bool is_nullable = type->getValue<bool>("containsNull");
|
||||
auto element_type = getFieldType(type, "elementType", is_nullable);
|
||||
bool element_nullable = type->getValue<bool>("containsNull");
|
||||
auto element_type = getFieldType(type, "elementType", element_nullable);
|
||||
return std::make_shared<DataTypeArray>(element_type);
|
||||
}
|
||||
|
||||
if (type_name == "map")
|
||||
{
|
||||
bool is_nullable = type->getValue<bool>("containsNull");
|
||||
auto key_type = getFieldType(type, "keyType", /* is_nullable */false);
|
||||
auto value_type = getFieldType(type, "valueType", is_nullable);
|
||||
bool value_nullable = type->getValue<bool>("valueContainsNull");
|
||||
auto value_type = getFieldType(type, "valueType", value_nullable);
|
||||
return std::make_shared<DataTypeMap>(key_type, value_type);
|
||||
}
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#if USE_AWS_S3 && USE_AVRO
|
||||
#if USE_AVRO
|
||||
|
||||
#include <Formats/FormatFactory.h>
|
||||
#include <Storages/IStorage.h>
|
||||
|
@ -1,6 +1,6 @@
|
||||
#include "config.h"
|
||||
|
||||
#if USE_AWS_S3 && USE_AVRO
|
||||
#if USE_AVRO
|
||||
|
||||
#include <Common/logger_useful.h>
|
||||
#include <Core/Settings.h>
|
||||
|
@ -1,6 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#if USE_AWS_S3 && USE_AVRO /// StorageIceberg depending on Avro to parse metadata with Avro format.
|
||||
#if USE_AVRO /// StorageIceberg depending on Avro to parse metadata with Avro format.
|
||||
|
||||
#include <Interpreters/Context_fwd.h>
|
||||
#include <Core/Types.h>
|
||||
|
@ -2,10 +2,12 @@
|
||||
|
||||
#if USE_AWS_S3
|
||||
|
||||
#include <Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h>
|
||||
#include <Storages/ObjectStorage/DataLakes/IStorageDataLake.h>
|
||||
#include <Storages/ObjectStorage/DataLakes/IcebergMetadata.h>
|
||||
#include <Storages/ObjectStorage/S3/Configuration.h>
|
||||
# include <Storages/ObjectStorage/Azure/Configuration.h>
|
||||
# include <Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h>
|
||||
# include <Storages/ObjectStorage/DataLakes/IStorageDataLake.h>
|
||||
# include <Storages/ObjectStorage/DataLakes/IcebergMetadata.h>
|
||||
# include <Storages/ObjectStorage/Local/Configuration.h>
|
||||
# include <Storages/ObjectStorage/S3/Configuration.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -22,6 +24,54 @@ void registerStorageIceberg(StorageFactory & factory)
|
||||
auto configuration = std::make_shared<StorageS3Configuration>();
|
||||
StorageObjectStorage::Configuration::initialize(*configuration, args.engine_args, args.getLocalContext(), false);
|
||||
|
||||
return StorageIceberg::create(
|
||||
configuration, args.getContext(), args.table_id, args.columns, args.constraints, args.comment, std::nullopt, args.mode);
|
||||
},
|
||||
{
|
||||
.supports_settings = false,
|
||||
.supports_schema_inference = true,
|
||||
.source_access_type = AccessType::S3,
|
||||
});
|
||||
|
||||
factory.registerStorage(
|
||||
"IcebergS3",
|
||||
[&](const StorageFactory::Arguments & args)
|
||||
{
|
||||
auto configuration = std::make_shared<StorageS3Configuration>();
|
||||
StorageObjectStorage::Configuration::initialize(*configuration, args.engine_args, args.getLocalContext(), false);
|
||||
|
||||
return StorageIceberg::create(
|
||||
configuration, args.getContext(), args.table_id, args.columns, args.constraints, args.comment, std::nullopt, args.mode);
|
||||
},
|
||||
{
|
||||
.supports_settings = false,
|
||||
.supports_schema_inference = true,
|
||||
.source_access_type = AccessType::S3,
|
||||
});
|
||||
|
||||
factory.registerStorage(
|
||||
"IcebergAzure",
|
||||
[&](const StorageFactory::Arguments & args)
|
||||
{
|
||||
auto configuration = std::make_shared<StorageAzureConfiguration>();
|
||||
StorageObjectStorage::Configuration::initialize(*configuration, args.engine_args, args.getLocalContext(), true);
|
||||
|
||||
return StorageIceberg::create(
|
||||
configuration, args.getContext(), args.table_id, args.columns, args.constraints, args.comment, std::nullopt, args.mode);
|
||||
},
|
||||
{
|
||||
.supports_settings = false,
|
||||
.supports_schema_inference = true,
|
||||
.source_access_type = AccessType::AZURE,
|
||||
});
|
||||
|
||||
factory.registerStorage(
|
||||
"IcebergLocal",
|
||||
[&](const StorageFactory::Arguments & args)
|
||||
{
|
||||
auto configuration = std::make_shared<StorageLocalConfiguration>();
|
||||
StorageObjectStorage::Configuration::initialize(*configuration, args.engine_args, args.getLocalContext(), false);
|
||||
|
||||
return StorageIceberg::create(
|
||||
configuration, args.getContext(), args.table_id, args.columns,
|
||||
args.constraints, args.comment, std::nullopt, args.mode);
|
||||
@ -29,7 +79,7 @@ void registerStorageIceberg(StorageFactory & factory)
|
||||
{
|
||||
.supports_settings = false,
|
||||
.supports_schema_inference = true,
|
||||
.source_access_type = AccessType::S3,
|
||||
.source_access_type = AccessType::FILE,
|
||||
});
|
||||
}
|
||||
|
||||
|
77
src/Storages/ObjectStorage/Local/Configuration.cpp
Normal file
77
src/Storages/ObjectStorage/Local/Configuration.cpp
Normal file
@ -0,0 +1,77 @@
|
||||
#include <Core/Settings.h>
|
||||
#include <Interpreters/Context.h>
|
||||
#include <Interpreters/evaluateConstantExpression.h>
|
||||
#include <Storages/ObjectStorage/Local/Configuration.h>
|
||||
#include <Storages/checkAndGetLiteralArgument.h>
|
||||
#include "Common/NamedCollections/NamedCollections.h"
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||
}
|
||||
|
||||
void StorageLocalConfiguration::fromNamedCollection(const NamedCollection & collection, ContextPtr)
|
||||
{
|
||||
path = collection.get<String>("path");
|
||||
format = collection.getOrDefault<String>("format", "auto");
|
||||
compression_method = collection.getOrDefault<String>("compression_method", collection.getOrDefault<String>("compression", "auto"));
|
||||
structure = collection.getOrDefault<String>("structure", "auto");
|
||||
paths = {path};
|
||||
}
|
||||
|
||||
|
||||
/// Fills the configuration from positional arguments.
///
/// Accepted layouts:
///   with_structure == true : path[, format[, structure[, compression_method]]]
///   with_structure == false: path[, format[, compression_method]]
///
/// Every argument is evaluated to a literal in place, i.e. the elements of `args` are replaced.
/// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH when no arguments or more than the maximum are passed.
void StorageLocalConfiguration::fromAST(ASTs & args, ContextPtr context, bool with_structure)
{
    const size_t max_args_num = with_structure ? 4 : 3;
    if (args.empty() || args.size() > max_args_num)
    {
        /// This branch is also taken when no arguments are given at all (the path is mandatory),
        /// so the message states both bounds and the actual count.
        throw Exception(
            ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
            "Expected from 1 to {} arguments, got {}",
            max_args_num,
            args.size());
    }

    for (auto & arg : args)
        arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context);

    path = checkAndGetLiteralArgument<String>(args[0], "path");

    if (args.size() > 1)
    {
        format = checkAndGetLiteralArgument<String>(args[1], "format_name");
    }

    if (with_structure)
    {
        /// With a structure argument the compression method moves to the fourth position.
        if (args.size() > 2)
        {
            structure = checkAndGetLiteralArgument<String>(args[2], "structure");
        }
        if (args.size() > 3)
        {
            compression_method = checkAndGetLiteralArgument<String>(args[3], "compression_method");
        }
    }
    else if (args.size() > 2)
    {
        compression_method = checkAndGetLiteralArgument<String>(args[2], "compression_method");
    }

    /// The list of paths always consists of the single configured path.
    paths = {path};
}
|
||||
|
||||
/// Builds the query settings for this storage.
/// Truncation on insert, skipping of empty files, schema-cache usage and schema inference mode
/// are taken from the context's settings; the remaining listed fields are fixed constants.
StorageObjectStorage::QuerySettings StorageLocalConfiguration::getQuerySettings(const ContextPtr & context) const
{
    const auto & context_settings = context->getSettingsRef();

    StorageObjectStorage::QuerySettings query_settings{
        .truncate_on_insert = context_settings.engine_file_truncate_on_insert,
        .create_new_file_on_insert = false,
        .schema_inference_use_cache = context_settings.schema_inference_use_cache_for_file,
        .schema_inference_mode = context_settings.schema_inference_mode,
        .skip_empty_files = context_settings.engine_file_skip_empty_files,
        .list_object_keys_size = 0,
        .throw_on_zero_files_match = false,
        .ignore_non_existent_file = false};

    return query_settings;
}
|
||||
|
||||
}
|
52
src/Storages/ObjectStorage/Local/Configuration.h
Normal file
52
src/Storages/ObjectStorage/Local/Configuration.h
Normal file
@ -0,0 +1,52 @@
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include "Disks/ObjectStorages/Local/LocalObjectStorage.h"
|
||||
|
||||
#include <Storages/ObjectStorage/StorageObjectStorage.h>
|
||||
|
||||
#include <filesystem>
|
||||
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class StorageLocalConfiguration : public StorageObjectStorage::Configuration
|
||||
{
|
||||
public:
|
||||
using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr;
|
||||
|
||||
static constexpr auto type_name = "local";
|
||||
|
||||
StorageLocalConfiguration() = default;
|
||||
StorageLocalConfiguration(const StorageLocalConfiguration & other) = default;
|
||||
|
||||
std::string getTypeName() const override { return type_name; }
|
||||
std::string getEngineName() const override { return "Local"; }
|
||||
|
||||
Path getPath() const override { return path; }
|
||||
void setPath(const Path & path_) override { path = path_; }
|
||||
|
||||
const Paths & getPaths() const override { return paths; }
|
||||
void setPaths(const Paths & paths_) override { paths = paths_; }
|
||||
|
||||
String getNamespace() const override { return ""; }
|
||||
String getDataSourceDescription() const override { return ""; }
|
||||
StorageObjectStorage::QuerySettings getQuerySettings(const ContextPtr &) const override;
|
||||
|
||||
ConfigurationPtr clone() override { return std::make_shared<StorageLocalConfiguration>(*this); }
|
||||
|
||||
ObjectStoragePtr createObjectStorage(ContextPtr, bool) override { return std::make_shared<LocalObjectStorage>("/"); }
|
||||
|
||||
void addStructureAndFormatToArgs(ASTs &, const String &, const String &, ContextPtr) override { }
|
||||
|
||||
private:
|
||||
void fromNamedCollection(const NamedCollection & collection, ContextPtr context) override;
|
||||
void fromAST(ASTs & args, ContextPtr context, bool with_structure) override;
|
||||
Path path;
|
||||
Paths paths;
|
||||
};
|
||||
|
||||
}
|
@ -465,6 +465,12 @@ SchemaCache & StorageObjectStorage::getSchemaCache(const ContextPtr & context, c
|
||||
DEFAULT_SCHEMA_CACHE_ELEMENTS));
|
||||
return schema_cache;
|
||||
}
|
||||
else if (storage_type_name == "local")
|
||||
{
|
||||
static SchemaCache schema_cache(
|
||||
context->getConfigRef().getUInt("schema_inference_cache_max_elements_for_local", DEFAULT_SCHEMA_CACHE_ELEMENTS));
|
||||
return schema_cache;
|
||||
}
|
||||
else
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported storage type: {}", storage_type_name);
|
||||
}
|
||||
|
@ -162,7 +162,7 @@ public:
|
||||
ContextPtr local_context,
|
||||
bool with_table_structure);
|
||||
|
||||
/// Storage type: s3, hdfs, azure.
|
||||
/// Storage type: s3, hdfs, azure, local.
|
||||
virtual std::string getTypeName() const = 0;
|
||||
/// Engine name: S3, HDFS, Azure, Local.
|
||||
virtual std::string getEngineName() const = 0;
|
||||
|
@ -417,10 +417,7 @@ std::future<StorageObjectStorageSource::ReaderHolder> StorageObjectStorageSource
|
||||
}
|
||||
|
||||
std::unique_ptr<ReadBuffer> StorageObjectStorageSource::createReadBuffer(
|
||||
const ObjectInfo & object_info,
|
||||
const ObjectStoragePtr & object_storage,
|
||||
const ContextPtr & context_,
|
||||
const LoggerPtr & log)
|
||||
const ObjectInfo & object_info, const ObjectStoragePtr & object_storage, const ContextPtr & context_, const LoggerPtr & log)
|
||||
{
|
||||
const auto & object_size = object_info.metadata->size_bytes;
|
||||
|
||||
|
@ -58,8 +58,8 @@ IStatistics::IStatistics(const SingleStatisticsDescription & stat_)
|
||||
{
|
||||
}
|
||||
|
||||
ColumnStatistics::ColumnStatistics(const ColumnStatisticsDescription & stats_desc_)
|
||||
: stats_desc(stats_desc_)
|
||||
ColumnStatistics::ColumnStatistics(const ColumnStatisticsDescription & stats_desc_, const String & column_name_)
|
||||
: stats_desc(stats_desc_), column_name(column_name_)
|
||||
{
|
||||
}
|
||||
|
||||
@ -176,7 +176,7 @@ String ColumnStatistics::getFileName() const
|
||||
|
||||
const String & ColumnStatistics::columnName() const
|
||||
{
|
||||
return stats_desc.column_name;
|
||||
return column_name;
|
||||
}
|
||||
|
||||
UInt64 ColumnStatistics::rowCount() const
|
||||
@ -227,15 +227,15 @@ void MergeTreeStatisticsFactory::validate(const ColumnStatisticsDescription & st
|
||||
}
|
||||
}
|
||||
|
||||
ColumnStatisticsPtr MergeTreeStatisticsFactory::get(const ColumnStatisticsDescription & stats) const
|
||||
ColumnStatisticsPtr MergeTreeStatisticsFactory::get(const ColumnDescription & column_desc) const
|
||||
{
|
||||
ColumnStatisticsPtr column_stat = std::make_shared<ColumnStatistics>(stats);
|
||||
for (const auto & [type, desc] : stats.types_to_desc)
|
||||
ColumnStatisticsPtr column_stat = std::make_shared<ColumnStatistics>(column_desc.statistics, column_desc.name);
|
||||
for (const auto & [type, desc] : column_desc.statistics.types_to_desc)
|
||||
{
|
||||
auto it = creators.find(type);
|
||||
if (it == creators.end())
|
||||
throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistic type '{}'. Available types: 'tdigest' 'uniq' and 'count_min'", type);
|
||||
auto stat_ptr = (it->second)(desc, stats.data_type);
|
||||
auto stat_ptr = (it->second)(desc, column_desc.type);
|
||||
column_stat->stats[type] = stat_ptr;
|
||||
}
|
||||
return column_stat;
|
||||
@ -246,7 +246,7 @@ ColumnsStatistics MergeTreeStatisticsFactory::getMany(const ColumnsDescription &
|
||||
ColumnsStatistics result;
|
||||
for (const auto & col : columns)
|
||||
if (!col.statistics.empty())
|
||||
result.push_back(get(col.statistics));
|
||||
result.push_back(get(col));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -54,7 +54,7 @@ using StatisticsPtr = std::shared_ptr<IStatistics>;
|
||||
class ColumnStatistics
|
||||
{
|
||||
public:
|
||||
explicit ColumnStatistics(const ColumnStatisticsDescription & stats_desc_);
|
||||
explicit ColumnStatistics(const ColumnStatisticsDescription & stats_desc_, const String & column_name_);
|
||||
|
||||
void serialize(WriteBuffer & buf);
|
||||
void deserialize(ReadBuffer & buf);
|
||||
@ -73,10 +73,12 @@ public:
|
||||
private:
|
||||
friend class MergeTreeStatisticsFactory;
|
||||
ColumnStatisticsDescription stats_desc;
|
||||
String column_name;
|
||||
std::map<StatisticsType, StatisticsPtr> stats;
|
||||
UInt64 rows = 0; /// the number of rows in the column
|
||||
};
|
||||
|
||||
struct ColumnDescription;
|
||||
class ColumnsDescription;
|
||||
using ColumnStatisticsPtr = std::shared_ptr<ColumnStatistics>;
|
||||
using ColumnsStatistics = std::vector<ColumnStatisticsPtr>;
|
||||
@ -91,7 +93,7 @@ public:
|
||||
using Validator = std::function<void(const SingleStatisticsDescription & stats, const DataTypePtr & data_type)>;
|
||||
using Creator = std::function<StatisticsPtr(const SingleStatisticsDescription & stats, const DataTypePtr & data_type)>;
|
||||
|
||||
ColumnStatisticsPtr get(const ColumnStatisticsDescription & stats) const;
|
||||
ColumnStatisticsPtr get(const ColumnDescription & column_desc) const;
|
||||
ColumnsStatistics getMany(const ColumnsDescription & columns) const;
|
||||
|
||||
void registerValidator(StatisticsType type, Validator validator);
|
||||
|
@ -6,7 +6,6 @@
|
||||
#include <Parsers/ASTStatisticsDeclaration.h>
|
||||
#include <Parsers/queryToString.h>
|
||||
#include <Parsers/ParserCreateQuery.h>
|
||||
#include <Poco/Logger.h>
|
||||
#include <Storages/ColumnsDescription.h>
|
||||
|
||||
|
||||
@ -97,16 +96,13 @@ void ColumnStatisticsDescription::merge(const ColumnStatisticsDescription & othe
|
||||
{
|
||||
chassert(merging_column_type);
|
||||
|
||||
if (column_name.empty())
|
||||
column_name = merging_column_name;
|
||||
|
||||
data_type = merging_column_type;
|
||||
|
||||
for (const auto & [stats_type, stats_desc]: other.types_to_desc)
|
||||
{
|
||||
if (!if_not_exists && types_to_desc.contains(stats_type))
|
||||
{
|
||||
throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics type name {} has existed in column {}", stats_type, column_name);
|
||||
throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics type name {} has existed in column {}", stats_type, merging_column_name);
|
||||
}
|
||||
else if (!types_to_desc.contains(stats_type))
|
||||
types_to_desc.emplace(stats_type, stats_desc);
|
||||
@ -115,9 +111,6 @@ void ColumnStatisticsDescription::merge(const ColumnStatisticsDescription & othe
|
||||
|
||||
void ColumnStatisticsDescription::assign(const ColumnStatisticsDescription & other)
|
||||
{
|
||||
if (other.column_name != column_name)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot assign statistics from column {} to {}", column_name, other.column_name);
|
||||
|
||||
types_to_desc = other.types_to_desc;
|
||||
data_type = other.data_type;
|
||||
}
|
||||
@ -127,7 +120,7 @@ void ColumnStatisticsDescription::clear()
|
||||
types_to_desc.clear();
|
||||
}
|
||||
|
||||
std::vector<ColumnStatisticsDescription> ColumnStatisticsDescription::fromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns)
|
||||
std::vector<std::pair<String, ColumnStatisticsDescription>> ColumnStatisticsDescription::fromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns)
|
||||
{
|
||||
const auto * stat_definition_ast = definition_ast->as<ASTStatisticsDeclaration>();
|
||||
if (!stat_definition_ast)
|
||||
@ -145,7 +138,7 @@ std::vector<ColumnStatisticsDescription> ColumnStatisticsDescription::fromAST(co
|
||||
statistics_types.emplace(stat.type, stat);
|
||||
}
|
||||
|
||||
std::vector<ColumnStatisticsDescription> result;
|
||||
std::vector<std::pair<String, ColumnStatisticsDescription>> result;
|
||||
result.reserve(stat_definition_ast->columns->children.size());
|
||||
|
||||
for (const auto & column_ast : stat_definition_ast->columns->children)
|
||||
@ -157,10 +150,9 @@ std::vector<ColumnStatisticsDescription> ColumnStatisticsDescription::fromAST(co
|
||||
throw Exception(ErrorCodes::INCORRECT_QUERY, "Incorrect column name {}", physical_column_name);
|
||||
|
||||
const auto & column = columns.getPhysical(physical_column_name);
|
||||
stats.column_name = column.name;
|
||||
stats.data_type = column.type;
|
||||
stats.types_to_desc = statistics_types;
|
||||
result.push_back(stats);
|
||||
result.emplace_back(physical_column_name, stats);
|
||||
}
|
||||
|
||||
if (result.empty())
|
||||
@ -175,14 +167,13 @@ ColumnStatisticsDescription ColumnStatisticsDescription::fromColumnDeclaration(c
|
||||
if (stat_type_list_ast->children.empty())
|
||||
throw Exception(ErrorCodes::INCORRECT_QUERY, "We expect at least one statistics type for column {}", queryToString(column));
|
||||
ColumnStatisticsDescription stats;
|
||||
stats.column_name = column.name;
|
||||
for (const auto & ast : stat_type_list_ast->children)
|
||||
{
|
||||
const auto & stat_type = ast->as<const ASTFunction &>().name;
|
||||
|
||||
SingleStatisticsDescription stat(stringToStatisticsType(Poco::toLower(stat_type)), ast->clone());
|
||||
if (stats.types_to_desc.contains(stat.type))
|
||||
throw Exception(ErrorCodes::INCORRECT_QUERY, "Column {} already contains statistics type {}", stats.column_name, stat_type);
|
||||
throw Exception(ErrorCodes::INCORRECT_QUERY, "Column {} already contains statistics type {}", column.name, stat_type);
|
||||
stats.types_to_desc.emplace(stat.type, std::move(stat));
|
||||
}
|
||||
stats.data_type = data_type;
|
||||
|
@ -55,12 +55,12 @@ struct ColumnStatisticsDescription
|
||||
|
||||
ASTPtr getAST() const;
|
||||
|
||||
static std::vector<ColumnStatisticsDescription> fromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns);
|
||||
/// Returns a vector of <column name, statistics description> pairs.
|
||||
static std::vector<std::pair<String, ColumnStatisticsDescription>> fromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns);
|
||||
static ColumnStatisticsDescription fromColumnDeclaration(const ASTColumnDeclaration & column, DataTypePtr data_type);
|
||||
|
||||
using StatisticsTypeDescMap = std::map<StatisticsType, SingleStatisticsDescription>;
|
||||
StatisticsTypeDescMap types_to_desc;
|
||||
String column_name;
|
||||
DataTypePtr data_type;
|
||||
};
|
||||
|
||||
|
@ -1051,17 +1051,27 @@ void StorageWindowView::threadFuncFireProc()
|
||||
if (shutdown_called)
|
||||
return;
|
||||
|
||||
/// Acquiring the lock can take seconds (depends on how long it takes to push) so we keep a reference to remember
|
||||
/// what's the starting point where we want to push from
|
||||
UInt32 timestamp_start = now();
|
||||
|
||||
std::lock_guard lock(fire_signal_mutex);
|
||||
/// TODO: consider using time_t instead (for every timestamp in this class)
|
||||
UInt32 timestamp_now = now();
|
||||
|
||||
LOG_TRACE(log, "Now: {}, next fire signal: {}, max watermark: {}", timestamp_now, next_fire_signal, max_watermark);
|
||||
LOG_TRACE(
|
||||
log,
|
||||
"Start: {}, now: {}, next fire signal: {}, max watermark: {}",
|
||||
timestamp_start,
|
||||
timestamp_now,
|
||||
next_fire_signal,
|
||||
max_watermark);
|
||||
|
||||
while (next_fire_signal <= timestamp_now)
|
||||
{
|
||||
try
|
||||
{
|
||||
if (max_watermark >= timestamp_now)
|
||||
if (max_watermark >= timestamp_start)
|
||||
fire(next_fire_signal);
|
||||
}
|
||||
catch (...)
|
||||
@ -1075,11 +1085,18 @@ void StorageWindowView::threadFuncFireProc()
|
||||
slide_interval *= 86400;
|
||||
next_fire_signal += slide_interval;
|
||||
|
||||
LOG_TRACE(log, "Now: {}, next fire signal: {}, max watermark: {}, max fired watermark: {}, slide interval: {}",
|
||||
timestamp_now, next_fire_signal, max_watermark, max_fired_watermark, slide_interval);
|
||||
LOG_TRACE(
|
||||
log,
|
||||
"Start: {}, now: {}, next fire signal: {}, max watermark: {}, max fired watermark: {}, slide interval: {}",
|
||||
timestamp_start,
|
||||
timestamp_now,
|
||||
next_fire_signal,
|
||||
max_watermark,
|
||||
max_fired_watermark,
|
||||
slide_interval);
|
||||
}
|
||||
|
||||
if (max_watermark >= timestamp_now)
|
||||
if (max_watermark >= timestamp_start)
|
||||
clean_cache_task->schedule();
|
||||
|
||||
UInt64 next_fire_ms = static_cast<UInt64>(next_fire_signal) * 1000;
|
||||
|
@ -76,6 +76,21 @@ struct TableFunctionIcebergName
|
||||
static constexpr auto name = "iceberg";
|
||||
};
|
||||
|
||||
/// Name tag for the `icebergS3` table function; it is passed as the first
/// template argument of ITableFunctionDataLake and only carries the name string.
struct TableFunctionIcebergS3Name
{
    static constexpr auto name = "icebergS3";
};
|
||||
|
||||
/// Name tag for the `icebergAzure` table function; it is passed as the first
/// template argument of ITableFunctionDataLake and only carries the name string.
struct TableFunctionIcebergAzureName
{
    static constexpr auto name = "icebergAzure";
};
|
||||
|
||||
/// Name tag for the `icebergLocal` table function; it is passed as the first
/// template argument of ITableFunctionDataLake and only carries the name string.
struct TableFunctionIcebergLocalName
{
    static constexpr auto name = "icebergLocal";
};
|
||||
|
||||
struct TableFunctionDeltaLakeName
|
||||
{
|
||||
static constexpr auto name = "deltaLake";
|
||||
@ -86,14 +101,20 @@ struct TableFunctionHudiName
|
||||
static constexpr auto name = "hudi";
|
||||
};
|
||||
|
||||
#if USE_AWS_S3
|
||||
#if USE_AVRO
|
||||
# if USE_AWS_S3
|
||||
using TableFunctionIceberg = ITableFunctionDataLake<TableFunctionIcebergName, StorageIceberg, TableFunctionS3>;
|
||||
using TableFunctionIcebergS3 = ITableFunctionDataLake<TableFunctionIcebergS3Name, StorageIceberg, TableFunctionS3>;
|
||||
# endif
|
||||
# if USE_AZURE_BLOB_STORAGE
|
||||
using TableFunctionIcebergAzure = ITableFunctionDataLake<TableFunctionIcebergAzureName, StorageIceberg, TableFunctionAzureBlob>;
|
||||
# endif
|
||||
using TableFunctionIcebergLocal = ITableFunctionDataLake<TableFunctionIcebergLocalName, StorageIceberg, TableFunctionLocal>;
|
||||
#endif
|
||||
#if USE_PARQUET
|
||||
#if USE_AWS_S3
|
||||
# if USE_PARQUET
|
||||
using TableFunctionDeltaLake = ITableFunctionDataLake<TableFunctionDeltaLakeName, StorageDeltaLake, TableFunctionS3>;
|
||||
#endif
|
||||
using TableFunctionHudi = ITableFunctionDataLake<TableFunctionHudiName, StorageHudi, TableFunctionS3>;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
@ -14,10 +14,11 @@
|
||||
|
||||
#include <Storages/ObjectStorage/Utils.h>
|
||||
#include <Storages/NamedCollectionsHelpers.h>
|
||||
#include <Storages/ObjectStorage/S3/Configuration.h>
|
||||
#include <Storages/ObjectStorage/HDFS/Configuration.h>
|
||||
#include <Storages/ObjectStorage/StorageObjectStorage.h>
|
||||
#include <Storages/ObjectStorage/Azure/Configuration.h>
|
||||
#include <Storages/ObjectStorage/HDFS/Configuration.h>
|
||||
#include <Storages/ObjectStorage/Local/Configuration.h>
|
||||
#include <Storages/ObjectStorage/S3/Configuration.h>
|
||||
#include <Storages/ObjectStorage/StorageObjectStorage.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -223,5 +224,5 @@ template class TableFunctionObjectStorage<OSSDefinition, StorageS3Configuration>
|
||||
template class TableFunctionObjectStorage<HDFSDefinition, StorageHDFSConfiguration>;
|
||||
template class TableFunctionObjectStorage<HDFSClusterDefinition, StorageHDFSConfiguration>;
|
||||
#endif
|
||||
|
||||
template class TableFunctionObjectStorage<LocalDefinition, StorageLocalConfiguration>;
|
||||
}
|
||||
|
@ -1,11 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
#include "config.h"
|
||||
#include <TableFunctions/ITableFunction.h>
|
||||
#include <Formats/FormatFactory.h>
|
||||
#include <Disks/ObjectStorages/IObjectStorage_fwd.h>
|
||||
#include <Storages/VirtualColumnUtils.h>
|
||||
#include <Formats/FormatFactory.h>
|
||||
#include <Storages/ObjectStorage/StorageObjectStorage.h>
|
||||
#include <Storages/VirtualColumnUtils.h>
|
||||
#include <TableFunctions/ITableFunction.h>
|
||||
#include "config.h"
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -14,6 +14,7 @@ class Context;
|
||||
class StorageS3Configuration;
|
||||
class StorageAzureConfiguration;
|
||||
class StorageHDFSConfiguration;
|
||||
class StorageLocalConfiguration;
|
||||
struct S3StorageSettings;
|
||||
struct AzureStorageSettings;
|
||||
struct HDFSStorageSettings;
|
||||
@ -90,6 +91,17 @@ struct HDFSDefinition
|
||||
static constexpr auto max_number_of_arguments = 4;
|
||||
};
|
||||
|
||||
/// Compile-time description used to instantiate TableFunctionObjectStorage for
/// local storage (see the TableFunctionLocal alias): the function name, the
/// storage type name, the accepted argument forms and the argument limit.
struct LocalDefinition
{
    static constexpr auto name = "local";
    static constexpr auto storage_type_name = "Local";
    /// One line per accepted positional-argument form.
    static constexpr auto signature = " - path\n"
                                      " - path, format\n"
                                      " - path, format, structure\n"
                                      " - path, format, structure, compression_method\n";
    /// Matches the longest form listed in `signature`.
    static constexpr auto max_number_of_arguments = 4;
};
|
||||
|
||||
template <typename Definition, typename Configuration>
|
||||
class TableFunctionObjectStorage : public ITableFunction
|
||||
{
|
||||
@ -169,4 +181,6 @@ using TableFunctionAzureBlob = TableFunctionObjectStorage<AzureDefinition, Stora
|
||||
#if USE_HDFS
|
||||
using TableFunctionHDFS = TableFunctionObjectStorage<HDFSDefinition, StorageHDFSConfiguration>;
|
||||
#endif
|
||||
|
||||
using TableFunctionLocal = TableFunctionObjectStorage<LocalDefinition, StorageLocalConfiguration>;
|
||||
}
|
||||
|
@ -4,24 +4,43 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
#if USE_AWS_S3
|
||||
#if USE_AVRO
|
||||
void registerTableFunctionIceberg(TableFunctionFactory & factory)
|
||||
{
|
||||
# if USE_AWS_S3
|
||||
factory.registerFunction<TableFunctionIceberg>(
|
||||
{
|
||||
.documentation =
|
||||
{
|
||||
.description=R"(The table function can be used to read the Iceberg table stored on object store.)",
|
||||
{.documentation
|
||||
= {.description = R"(The table function can be used to read the Iceberg table stored on S3 object store. Alias to icebergS3)",
|
||||
.examples{{"iceberg", "SELECT * FROM iceberg(url, access_key_id, secret_access_key)", ""}},
|
||||
.categories{"DataLake"}
|
||||
},
|
||||
.allow_readonly = false
|
||||
});
|
||||
.categories{"DataLake"}},
|
||||
.allow_readonly = false});
|
||||
factory.registerFunction<TableFunctionIcebergS3>(
|
||||
{.documentation
|
||||
= {.description = R"(The table function can be used to read the Iceberg table stored on S3 object store.)",
|
||||
.examples{{"icebergS3", "SELECT * FROM icebergS3(url, access_key_id, secret_access_key)", ""}},
|
||||
.categories{"DataLake"}},
|
||||
.allow_readonly = false});
|
||||
|
||||
# endif
|
||||
# if USE_AZURE_BLOB_STORAGE
|
||||
factory.registerFunction<TableFunctionIcebergAzure>(
|
||||
{.documentation
|
||||
= {.description = R"(The table function can be used to read the Iceberg table stored on Azure object store.)",
|
||||
.examples{{"icebergAzure", "SELECT * FROM icebergAzure(url, access_key_id, secret_access_key)", ""}},
|
||||
.categories{"DataLake"}},
|
||||
.allow_readonly = false});
|
||||
# endif
|
||||
factory.registerFunction<TableFunctionIcebergLocal>(
|
||||
{.documentation
|
||||
= {.description = R"(The table function can be used to read the Iceberg table stored locally.)",
|
||||
.examples{{"icebergLocal", "SELECT * FROM icebergLocal(filename)", ""}},
|
||||
.categories{"DataLake"}},
|
||||
.allow_readonly = false});
|
||||
}
|
||||
#endif
|
||||
|
||||
#if USE_PARQUET
|
||||
#if USE_AWS_S3
|
||||
# if USE_PARQUET
|
||||
void registerTableFunctionDeltaLake(TableFunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction<TableFunctionDeltaLake>(
|
||||
@ -55,11 +74,11 @@ void registerTableFunctionHudi(TableFunctionFactory & factory)
|
||||
void registerDataLakeTableFunctions(TableFunctionFactory & factory)
|
||||
{
|
||||
UNUSED(factory);
|
||||
#if USE_AWS_S3
|
||||
#if USE_AVRO
|
||||
registerTableFunctionIceberg(factory);
|
||||
#endif
|
||||
#if USE_PARQUET
|
||||
#if USE_AWS_S3
|
||||
# if USE_PARQUET
|
||||
registerTableFunctionDeltaLake(factory);
|
||||
#endif
|
||||
registerTableFunctionHudi(factory);
|
||||
|
@ -286,4 +286,7 @@ class Utils:
|
||||
|
||||
@staticmethod
|
||||
def is_job_triggered_manually():
|
||||
return "robot" not in Envs.GITHUB_ACTOR
|
||||
return (
|
||||
"robot" not in Envs.GITHUB_ACTOR
|
||||
and "clickhouse-ci" not in Envs.GITHUB_ACTOR
|
||||
)
|
||||
|
@ -2,30 +2,92 @@ from minio import Minio
|
||||
import glob
|
||||
import os
|
||||
import json
|
||||
import shutil
|
||||
|
||||
|
||||
def upload_directory(minio_client, bucket_name, local_path, s3_path):
|
||||
result_files = []
|
||||
for local_file in glob.glob(local_path + "/**"):
|
||||
if os.path.isfile(local_file):
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class CloudUploader:
|
||||
|
||||
def upload_directory(self, local_path, remote_blob_path, **kwargs):
|
||||
print(kwargs)
|
||||
result_files = []
|
||||
# print(f"Arguments: {local_path}, {s3_path}")
|
||||
# for local_file in glob.glob(local_path + "/**"):
|
||||
# print("Local file: {}", local_file)
|
||||
for local_file in glob.glob(local_path + "/**"):
|
||||
result_local_path = os.path.join(local_path, local_file)
|
||||
result_s3_path = os.path.join(s3_path, local_file)
|
||||
print(f"Putting file {result_local_path} to {result_s3_path}")
|
||||
minio_client.fput_object(
|
||||
bucket_name=bucket_name,
|
||||
object_name=result_s3_path,
|
||||
file_path=result_local_path,
|
||||
result_remote_blob_path = os.path.join(remote_blob_path, local_file)
|
||||
if os.path.isfile(local_file):
|
||||
self.upload_file(result_local_path, result_remote_blob_path, **kwargs)
|
||||
result_files.append(result_remote_blob_path)
|
||||
else:
|
||||
files = self.upload_directory(
|
||||
result_local_path, result_remote_blob_path, **kwargs
|
||||
)
|
||||
result_files.extend(files)
|
||||
return result_files
|
||||
|
||||
|
||||
class S3Uploader(CloudUploader):
    """Uploader that stores files in a bucket through a MinIO client's ``fput_object``."""

    def __init__(self, minio_client, bucket_name):
        """Remember the client and the bucket used when ``upload_file`` gets no ``bucket``."""
        self.minio_client = minio_client
        self.bucket_name = bucket_name

    def upload_file(self, local_path, remote_blob_path, bucket=None):
        """Upload ``local_path`` as object ``remote_blob_path``.

        ``bucket`` overrides the bucket given to the constructor; ``None`` means
        "use the constructor's bucket".
        """
        if bucket is None:
            bucket = self.bucket_name
        # Log only after the default is resolved, so the message names the
        # bucket that is actually written to instead of "None".
        print(f"Upload to bucket: {bucket}")
        self.minio_client.fput_object(
            bucket_name=bucket,
            object_name=remote_blob_path,
            file_path=local_path,
        )
|
||||
|
||||
|
||||
class LocalUploader(CloudUploader):
|
||||
|
||||
def __init__(self, clickhouse_node):
|
||||
self.clickhouse_node = clickhouse_node
|
||||
|
||||
def upload_file(self, local_path, remote_blob_path):
|
||||
dir_path = os.path.dirname(remote_blob_path)
|
||||
if dir_path != "":
|
||||
self.clickhouse_node.exec_in_container(
|
||||
[
|
||||
"bash",
|
||||
"-c",
|
||||
"mkdir -p {}".format(dir_path),
|
||||
]
|
||||
)
|
||||
result_files.append(result_s3_path)
|
||||
self.clickhouse_node.copy_file_to_container(local_path, remote_blob_path)
|
||||
|
||||
|
||||
class AzureUploader(CloudUploader):
|
||||
|
||||
def __init__(self, blob_service_client, container_name):
|
||||
self.blob_service_client = blob_service_client
|
||||
self.container_client = self.blob_service_client.get_container_client(
|
||||
container_name
|
||||
)
|
||||
|
||||
def upload_file(self, local_path, remote_blob_path, container_name=None):
|
||||
if container_name is None:
|
||||
container_client = self.container_client
|
||||
else:
|
||||
files = upload_directory(
|
||||
minio_client,
|
||||
bucket_name,
|
||||
os.path.join(local_path, local_file),
|
||||
os.path.join(s3_path, local_file),
|
||||
container_client = self.blob_service_client.get_container_client(
|
||||
container_name
|
||||
)
|
||||
result_files.extend(files)
|
||||
return result_files
|
||||
blob_client = container_client.get_blob_client(remote_blob_path)
|
||||
with open(local_path, "rb") as data:
|
||||
blob_client.upload_blob(data, overwrite=True)
|
||||
|
||||
|
||||
def upload_directory(minio_client, bucket, local_path, remote_path):
    """Module-level convenience wrapper around ``S3Uploader.upload_directory``.

    Builds a one-off ``S3Uploader`` for ``minio_client`` / ``bucket`` and returns
    whatever its ``upload_directory(local_path, remote_path)`` returns.
    """
    uploader = S3Uploader(minio_client=minio_client, bucket_name=bucket)
    return uploader.upload_directory(local_path, remote_path)
|
||||
|
||||
|
||||
def get_file_contents(minio_client, bucket, s3_path):
|
||||
|
@ -200,7 +200,6 @@ def test_distributed_replica_max_ignored_errors():
|
||||
"connect_timeout": 2,
|
||||
"receive_timeout": 2,
|
||||
"send_timeout": 2,
|
||||
"idle_connection_timeout": 2,
|
||||
"tcp_keep_alive_timeout": 2,
|
||||
"distributed_replica_max_ignored_errors": 0,
|
||||
"distributed_replica_error_half_life": 60,
|
||||
|
@ -6,11 +6,17 @@ from helpers.cluster import ClickHouseCluster
|
||||
cluster = ClickHouseCluster(__file__)
|
||||
|
||||
node1 = cluster.add_instance(
|
||||
"node1", user_configs=["config/config.xml"], with_zookeeper=True
|
||||
"node1",
|
||||
user_configs=["config/config.xml"],
|
||||
with_zookeeper=True,
|
||||
macros={"replica": "a", "shard": "shard1"},
|
||||
)
|
||||
|
||||
node2 = cluster.add_instance(
|
||||
"node2", user_configs=["config/config.xml"], with_zookeeper=True
|
||||
"node2",
|
||||
user_configs=["config/config.xml"],
|
||||
with_zookeeper=True,
|
||||
macros={"replica": "b", "shard": "shard1"},
|
||||
)
|
||||
|
||||
|
||||
@ -129,8 +135,8 @@ def test_single_node_normal(started_cluster):
|
||||
|
||||
|
||||
def test_replicated_table_ddl(started_cluster):
|
||||
node1.query("DROP TABLE IF EXISTS test_stat")
|
||||
node2.query("DROP TABLE IF EXISTS test_stat")
|
||||
node1.query("DROP TABLE IF EXISTS test_stat SYNC")
|
||||
node2.query("DROP TABLE IF EXISTS test_stat SYNC")
|
||||
|
||||
node1.query(
|
||||
"""
|
||||
@ -183,3 +189,19 @@ def test_replicated_table_ddl(started_cluster):
|
||||
)
|
||||
check_stat_file_on_disk(node2, "test_stat", "all_0_0_0_3", "a", True)
|
||||
check_stat_file_on_disk(node2, "test_stat", "all_0_0_0_3", "b", True)
|
||||
|
||||
|
||||
def test_replicated_db(started_cluster):
    """Statistics DDL on a table inside a Replicated database.

    Both replicas (re)create database ``test``; the table is created on node1,
    then node2 changes column ``b`` to Float64 and declares ``tdigest``
    statistics on it. The test passes when none of the queries raises.
    """
    replicas = (node1, node2)

    # Start from a clean state on every replica before recreating the database.
    for replica in replicas:
        replica.query("DROP DATABASE IF EXISTS test SYNC")

    for replica in replicas:
        replica.query(
            "CREATE DATABASE test ENGINE = Replicated('/test/shared_stats', '{shard}', '{replica}')"
        )

    node1.query(
        "CREATE TABLE test.test_stats (a Int64, b Int64) ENGINE = ReplicatedMergeTree() ORDER BY()"
    )

    # The ALTERs are issued on the replica that did not create the table.
    node2.query("ALTER TABLE test.test_stats MODIFY COLUMN b Float64")
    node2.query("ALTER TABLE test.test_stats MODIFY STATISTICS b TYPE tdigest")
|
||||
|
@ -29,6 +29,9 @@ from datetime import datetime
|
||||
from pyspark.sql.functions import monotonically_increasing_id, row_number
|
||||
from pyspark.sql.window import Window
|
||||
from minio.deleteobjects import DeleteObject
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
from deltalake.writer import write_deltalake
|
||||
|
||||
from helpers.s3_tools import (
|
||||
prepare_s3_bucket,
|
||||
@ -728,3 +731,96 @@ SELECT * FROM deltaLake('http://{started_cluster.minio_ip}:{started_cluster.mini
|
||||
)
|
||||
== 1
|
||||
)
|
||||
|
||||
|
||||
def test_complex_types(started_cluster):
    """Write a Delta table with struct, list and map columns and read it back via deltaLake().

    The table is written straight to MinIO with ``write_deltalake`` and then the
    ``id``, ``address`` (struct) and ``metadata`` (map) columns are selected
    through the ``deltaLake`` table function and compared with the inserted rows.
    """
    node = started_cluster.instances["node1"]

    schema = pa.schema(
        [
            ("id", pa.int32()),
            ("name", pa.string()),
            (
                "address",
                pa.struct(
                    [
                        ("street", pa.string()),
                        ("city", pa.string()),
                        ("state", pa.string()),
                    ]
                ),
            ),
            ("interests", pa.list_(pa.string())),
            (
                "metadata",
                pa.map_(
                    pa.string(), pa.string()
                ),  # Map with string keys and string values
            ),
        ]
    )

    # Create sample data: one array per schema field, three rows each.
    data = [
        pa.array([1, 2, 3], type=pa.int32()),
        pa.array(["John Doe", "Jane Smith", "Jake Johnson"], type=pa.string()),
        pa.array(
            [
                {"street": "123 Elm St", "city": "Springfield", "state": "IL"},
                {"street": "456 Maple St", "city": "Shelbyville", "state": "IL"},
                {"street": "789 Oak St", "city": "Ogdenville", "state": "IL"},
            ],
            type=schema.field("address").type,
        ),
        pa.array(
            [
                pa.array(["dancing", "coding", "hiking"]),
                pa.array(["dancing", "coding", "hiking"]),
                pa.array(["dancing", "coding", "hiking"]),
            ],
            type=schema.field("interests").type,
        ),
        pa.array(
            [
                {"key1": "value1", "key2": "value2"},
                {"key1": "value3", "key2": "value4"},
                {"key1": "value5", "key2": "value6"},
            ],
            type=schema.field("metadata").type,
        ),
    ]

    endpoint_url = f"http://{started_cluster.minio_ip}:{started_cluster.minio_port}"
    aws_access_key_id = "minio"
    aws_secret_access_key = "minio123"
    table_name = randomize_table_name("test_complex_types")

    storage_options = {
        "AWS_ENDPOINT_URL": endpoint_url,
        "AWS_ACCESS_KEY_ID": aws_access_key_id,
        "AWS_SECRET_ACCESS_KEY": aws_secret_access_key,
        "AWS_ALLOW_HTTP": "true",
        "AWS_S3_ALLOW_UNSAFE_RENAME": "true",
    }
    path = f"s3://root/{table_name}"
    table = pa.Table.from_arrays(data, schema=schema)

    write_deltalake(path, table, storage_options=storage_options)

    # Same table-function expression for all three reads; built once so the
    # URL and credentials cannot drift between the assertions.
    delta_source = f"deltaLake('{endpoint_url}/root/{table_name}' , 'minio', 'minio123')"

    assert "1\n2\n3\n" in node.query(f"SELECT id FROM {delta_source}")
    assert (
        "('123 Elm St','Springfield','IL')\n('456 Maple St','Shelbyville','IL')\n('789 Oak St','Ogdenville','IL')"
        in node.query(f"SELECT address FROM {delta_source}")
    )
    assert (
        "{'key1':'value1','key2':'value2'}\n{'key1':'value3','key2':'value4'}\n{'key1':'value5','key2':'value6'}"
        in node.query(f"SELECT metadata FROM {delta_source}")
    )
|
||||
|
@ -5,5 +5,11 @@
|
||||
<access_key_id>minio</access_key_id>
|
||||
<secret_access_key>minio123</secret_access_key>
|
||||
</s3>
|
||||
<azure>
|
||||
<account_name>devstoreaccount1</account_name>
|
||||
<account_key>Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==</account_key>
|
||||
</azure>
|
||||
<local>
|
||||
</local>
|
||||
</named_collections>
|
||||
</clickhouse>
|
||||
|
@ -28,12 +28,15 @@ from pyspark.sql.functions import monotonically_increasing_id, row_number
|
||||
from pyspark.sql.window import Window
|
||||
from pyspark.sql.readwriter import DataFrameWriter, DataFrameWriterV2
|
||||
from minio.deleteobjects import DeleteObject
|
||||
from azure.storage.blob import BlobServiceClient
|
||||
|
||||
from helpers.s3_tools import (
|
||||
prepare_s3_bucket,
|
||||
upload_directory,
|
||||
get_file_contents,
|
||||
list_s3_objects,
|
||||
S3Uploader,
|
||||
AzureUploader,
|
||||
LocalUploader,
|
||||
)
|
||||
|
||||
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||
@ -67,6 +70,7 @@ def started_cluster():
|
||||
main_configs=["configs/config.d/named_collections.xml"],
|
||||
user_configs=["configs/users.d/users.xml"],
|
||||
with_minio=True,
|
||||
with_azurite=True,
|
||||
stay_alive=True,
|
||||
)
|
||||
|
||||
@ -77,6 +81,25 @@ def started_cluster():
|
||||
logging.info("S3 bucket created")
|
||||
|
||||
cluster.spark_session = get_spark()
|
||||
cluster.default_s3_uploader = S3Uploader(
|
||||
cluster.minio_client, cluster.minio_bucket
|
||||
)
|
||||
|
||||
cluster.azure_container_name = "mycontainer"
|
||||
|
||||
cluster.blob_service_client = cluster.blob_service_client
|
||||
|
||||
container_client = cluster.blob_service_client.create_container(
|
||||
cluster.azure_container_name
|
||||
)
|
||||
|
||||
cluster.container_client = container_client
|
||||
|
||||
cluster.default_azure_uploader = AzureUploader(
|
||||
cluster.blob_service_client, cluster.azure_container_name
|
||||
)
|
||||
|
||||
cluster.default_local_uploader = LocalUploader(cluster.instances["node1"])
|
||||
|
||||
yield cluster
|
||||
|
||||
@ -142,12 +165,65 @@ def generate_data(spark, start, end):
|
||||
return df
|
||||
|
||||
|
||||
def create_iceberg_table(node, table_name, format="Parquet", bucket="root"):
|
||||
def get_creation_expression(
    storage_type,
    table_name,
    cluster,
    format="Parquet",
    table_function=False,
    **kwargs,
):
    """Build the SQL text that exposes an Iceberg table stored on ``storage_type``.

    storage_type: one of "s3", "azure", "local".
    table_name: table name; also the directory name under iceberg_data/default/.
    cluster: object providing ``minio_bucket``, ``azure_container_name`` and
        ``env_variables["AZURITE_STORAGE_ACCOUNT_URL"]`` (only the attributes of
        the selected storage type are read).
    format: data format name inserted verbatim into the expression.
    table_function: when true, return a table-function expression
        (icebergS3 / icebergAzure / icebergLocal); otherwise return
        "DROP TABLE IF EXISTS ...; CREATE TABLE ... ENGINE=..." statements.
    kwargs: ``bucket`` overrides the S3 bucket; other keys are ignored.

    Raises Exception for an unknown ``storage_type``.
    """
    if storage_type == "s3":
        # Only fall back to (and read) the cluster default when no bucket is given.
        bucket = kwargs["bucket"] if "bucket" in kwargs else cluster.minio_bucket
        if table_function:
            return f"icebergS3(s3, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'http://minio1:9001/{bucket}/')"
        return f"""
            DROP TABLE IF EXISTS {table_name};
            CREATE TABLE {table_name}
            ENGINE=IcebergS3(s3, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'http://minio1:9001/{bucket}/')"""

    if storage_type == "azure":
        if table_function:
            return f"""
                icebergAzure(azure, container = '{cluster.azure_container_name}', storage_account_url = '{cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"]}', blob_path = '/iceberg_data/default/{table_name}/', format={format})
            """
        # The container name is quoted here exactly as in the table-function
        # form above, so both variants pass it as a string literal.
        return f"""
            DROP TABLE IF EXISTS {table_name};
            CREATE TABLE {table_name}
            ENGINE=IcebergAzure(azure, container = '{cluster.azure_container_name}', storage_account_url = '{cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"]}', blob_path = '/iceberg_data/default/{table_name}/', format={format})"""

    if storage_type == "local":
        if table_function:
            return f"""
                icebergLocal(local, path = '/iceberg_data/default/{table_name}/', format={format})
            """
        return f"""
            DROP TABLE IF EXISTS {table_name};
            CREATE TABLE {table_name}
            ENGINE=IcebergLocal(local, path = '/iceberg_data/default/{table_name}/', format={format});"""

    raise Exception(f"Unknown iceberg storage type: {storage_type}")
|
||||
|
||||
|
||||
def get_uuid_str():
    """Return a fresh random UUID (uuid4) as text with every "-" replaced by "_"."""
    return str(uuid.uuid4()).replace("-", "_")
|
||||
|
||||
|
||||
def create_iceberg_table(
|
||||
storage_type,
|
||||
node,
|
||||
table_name,
|
||||
cluster,
|
||||
format="Parquet",
|
||||
**kwargs,
|
||||
):
|
||||
node.query(
|
||||
f"""
|
||||
DROP TABLE IF EXISTS {table_name};
|
||||
CREATE TABLE {table_name}
|
||||
ENGINE=Iceberg(s3, filename = 'iceberg_data/default/{table_name}/', format={format}, url = 'http://minio1:9001/{bucket}/')"""
|
||||
get_creation_expression(storage_type, table_name, cluster, format, **kwargs)
|
||||
)
|
||||
|
||||
|
||||
@ -170,40 +246,69 @@ def create_initial_data_file(
|
||||
return result_path
|
||||
|
||||
|
||||
def default_upload_directory(
    started_cluster, storage_type, local_path, remote_path, **kwargs
):
    """Upload ``local_path`` with the cluster's default uploader for ``storage_type``.

    storage_type: "local", "s3" or "azure"; selects ``default_local_uploader``,
        ``default_s3_uploader`` or ``default_azure_uploader`` on ``started_cluster``.
    kwargs: forwarded unchanged to the uploader's ``upload_directory``.

    Returns whatever the selected uploader's ``upload_directory`` returns.
    Raises Exception for an unknown ``storage_type``.
    """
    uploader_attribute = {
        "local": "default_local_uploader",
        "s3": "default_s3_uploader",
        "azure": "default_azure_uploader",
    }.get(storage_type)

    if uploader_attribute is None:
        raise Exception(f"Unknown iceberg storage type: {storage_type}")

    # Looked up by name so that only the selected uploader attribute is read.
    uploader = getattr(started_cluster, uploader_attribute)
    return uploader.upload_directory(local_path, remote_path, **kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("format_version", ["1", "2"])
|
||||
def test_single_iceberg_file(started_cluster, format_version):
|
||||
@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
|
||||
def test_single_iceberg_file(started_cluster, format_version, storage_type):
|
||||
instance = started_cluster.instances["node1"]
|
||||
spark = started_cluster.spark_session
|
||||
minio_client = started_cluster.minio_client
|
||||
bucket = started_cluster.minio_bucket
|
||||
TABLE_NAME = "test_single_iceberg_file_" + format_version
|
||||
|
||||
inserted_data = "SELECT number, toString(number) as string FROM numbers(100)"
|
||||
parquet_data_path = create_initial_data_file(
|
||||
started_cluster, instance, inserted_data, TABLE_NAME
|
||||
TABLE_NAME = (
|
||||
"test_single_iceberg_file_"
|
||||
+ format_version
|
||||
+ "_"
|
||||
+ storage_type
|
||||
+ "_"
|
||||
+ get_uuid_str()
|
||||
)
|
||||
|
||||
write_iceberg_from_file(
|
||||
spark, parquet_data_path, TABLE_NAME, format_version=format_version
|
||||
write_iceberg_from_df(spark, generate_data(spark, 0, 100), TABLE_NAME)
|
||||
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
storage_type,
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
)
|
||||
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""
|
||||
)
|
||||
create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)
|
||||
|
||||
create_iceberg_table(instance, TABLE_NAME)
|
||||
assert instance.query(f"SELECT * FROM {TABLE_NAME}") == instance.query(
|
||||
inserted_data
|
||||
"SELECT number, toString(number + 1) FROM numbers(100)"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("format_version", ["1", "2"])
|
||||
def test_partition_by(started_cluster, format_version):
|
||||
@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
|
||||
def test_partition_by(started_cluster, format_version, storage_type):
|
||||
instance = started_cluster.instances["node1"]
|
||||
spark = started_cluster.spark_session
|
||||
minio_client = started_cluster.minio_client
|
||||
bucket = started_cluster.minio_bucket
|
||||
TABLE_NAME = "test_partition_by_" + format_version
|
||||
TABLE_NAME = (
|
||||
"test_partition_by_"
|
||||
+ format_version
|
||||
+ "_"
|
||||
+ storage_type
|
||||
+ "_"
|
||||
+ get_uuid_str()
|
||||
)
|
||||
|
||||
write_iceberg_from_df(
|
||||
spark,
|
||||
@ -214,22 +319,33 @@ def test_partition_by(started_cluster, format_version):
|
||||
partition_by="a",
|
||||
)
|
||||
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
storage_type,
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
)
|
||||
assert len(files) == 14 # 10 partitiions + 4 metadata files
|
||||
|
||||
create_iceberg_table(instance, TABLE_NAME)
|
||||
create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)
|
||||
assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 10
|
||||
|
||||
|
||||
@pytest.mark.parametrize("format_version", ["1", "2"])
|
||||
def test_multiple_iceberg_files(started_cluster, format_version):
|
||||
@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
|
||||
def test_multiple_iceberg_files(started_cluster, format_version, storage_type):
|
||||
instance = started_cluster.instances["node1"]
|
||||
spark = started_cluster.spark_session
|
||||
minio_client = started_cluster.minio_client
|
||||
bucket = started_cluster.minio_bucket
|
||||
TABLE_NAME = "test_multiple_iceberg_files_" + format_version
|
||||
TABLE_NAME = (
|
||||
"test_multiple_iceberg_files_"
|
||||
+ format_version
|
||||
+ "_"
|
||||
+ storage_type
|
||||
+ "_"
|
||||
+ get_uuid_str()
|
||||
)
|
||||
|
||||
write_iceberg_from_df(
|
||||
spark,
|
||||
@ -239,9 +355,13 @@ def test_multiple_iceberg_files(started_cluster, format_version):
|
||||
format_version=format_version,
|
||||
)
|
||||
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}", ""
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
storage_type,
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
)
|
||||
|
||||
# ['/iceberg_data/default/test_multiple_iceberg_files/data/00000-1-35302d56-f1ed-494e-a85b-fbf85c05ab39-00001.parquet',
|
||||
# '/iceberg_data/default/test_multiple_iceberg_files/metadata/version-hint.text',
|
||||
# '/iceberg_data/default/test_multiple_iceberg_files/metadata/3127466b-299d-48ca-a367-6b9b1df1e78c-m0.avro',
|
||||
@ -249,7 +369,7 @@ def test_multiple_iceberg_files(started_cluster, format_version):
|
||||
# '/iceberg_data/default/test_multiple_iceberg_files/metadata/v1.metadata.json']
|
||||
assert len(files) == 5
|
||||
|
||||
create_iceberg_table(instance, TABLE_NAME)
|
||||
create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)
|
||||
assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100
|
||||
|
||||
write_iceberg_from_df(
|
||||
@ -259,8 +379,11 @@ def test_multiple_iceberg_files(started_cluster, format_version):
|
||||
mode="append",
|
||||
format_version=format_version,
|
||||
)
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}", ""
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
storage_type,
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
"",
|
||||
)
|
||||
assert len(files) == 9
|
||||
|
||||
@ -271,12 +394,13 @@ def test_multiple_iceberg_files(started_cluster, format_version):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("format_version", ["1", "2"])
|
||||
def test_types(started_cluster, format_version):
|
||||
@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
|
||||
def test_types(started_cluster, format_version, storage_type):
|
||||
instance = started_cluster.instances["node1"]
|
||||
spark = started_cluster.spark_session
|
||||
minio_client = started_cluster.minio_client
|
||||
bucket = started_cluster.minio_bucket
|
||||
TABLE_NAME = "test_types_" + format_version
|
||||
TABLE_NAME = (
|
||||
"test_types_" + format_version + "_" + storage_type + "_" + get_uuid_str()
|
||||
)
|
||||
|
||||
data = [
|
||||
(
|
||||
@ -302,22 +426,29 @@ def test_types(started_cluster, format_version):
|
||||
spark, df, TABLE_NAME, mode="overwrite", format_version=format_version
|
||||
)
|
||||
|
||||
upload_directory(minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}", "")
|
||||
default_upload_directory(
|
||||
started_cluster,
|
||||
storage_type,
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
)
|
||||
|
||||
create_iceberg_table(instance, TABLE_NAME)
|
||||
create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)
|
||||
assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 1
|
||||
assert (
|
||||
instance.query(f"SELECT a, b, c, d, e FROM {TABLE_NAME}").strip()
|
||||
== "123\tstring\t2000-01-01\t['str1','str2']\ttrue"
|
||||
)
|
||||
|
||||
table_function = f"iceberg(s3, filename='iceberg_data/default/{TABLE_NAME}/')"
|
||||
table_function_expr = get_creation_expression(
|
||||
storage_type, TABLE_NAME, started_cluster, table_function=True
|
||||
)
|
||||
assert (
|
||||
instance.query(f"SELECT a, b, c, d, e FROM {table_function}").strip()
|
||||
instance.query(f"SELECT a, b, c, d, e FROM {table_function_expr}").strip()
|
||||
== "123\tstring\t2000-01-01\t['str1','str2']\ttrue"
|
||||
)
|
||||
|
||||
assert instance.query(f"DESCRIBE {table_function} FORMAT TSV") == TSV(
|
||||
assert instance.query(f"DESCRIBE {table_function_expr} FORMAT TSV") == TSV(
|
||||
[
|
||||
["a", "Nullable(Int32)"],
|
||||
["b", "Nullable(String)"],
|
||||
@ -329,12 +460,20 @@ def test_types(started_cluster, format_version):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("format_version", ["1", "2"])
|
||||
def test_delete_files(started_cluster, format_version):
|
||||
@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
|
||||
def test_delete_files(started_cluster, format_version, storage_type):
|
||||
instance = started_cluster.instances["node1"]
|
||||
spark = started_cluster.spark_session
|
||||
minio_client = started_cluster.minio_client
|
||||
bucket = started_cluster.minio_bucket
|
||||
TABLE_NAME = "test_delete_files_" + format_version
|
||||
TABLE_NAME = (
|
||||
"test_delete_files_"
|
||||
+ format_version
|
||||
+ "_"
|
||||
+ storage_type
|
||||
+ "_"
|
||||
+ get_uuid_str()
|
||||
)
|
||||
|
||||
write_iceberg_from_df(
|
||||
spark,
|
||||
@ -344,17 +483,22 @@ def test_delete_files(started_cluster, format_version):
|
||||
format_version=format_version,
|
||||
)
|
||||
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
storage_type,
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
)
|
||||
|
||||
create_iceberg_table(instance, TABLE_NAME)
|
||||
create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)
|
||||
|
||||
assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100
|
||||
|
||||
spark.sql(f"DELETE FROM {TABLE_NAME} WHERE a >= 0")
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
storage_type,
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
"",
|
||||
)
|
||||
|
||||
assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 0
|
||||
@ -368,27 +512,41 @@ def test_delete_files(started_cluster, format_version):
|
||||
format_version=format_version,
|
||||
)
|
||||
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
storage_type,
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
"",
|
||||
)
|
||||
|
||||
assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100
|
||||
|
||||
spark.sql(f"DELETE FROM {TABLE_NAME} WHERE a >= 150")
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
storage_type,
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
"",
|
||||
)
|
||||
|
||||
assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 50
|
||||
|
||||
|
||||
@pytest.mark.parametrize("format_version", ["1", "2"])
|
||||
def test_evolved_schema(started_cluster, format_version):
|
||||
@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
|
||||
def test_evolved_schema(started_cluster, format_version, storage_type):
|
||||
instance = started_cluster.instances["node1"]
|
||||
spark = started_cluster.spark_session
|
||||
minio_client = started_cluster.minio_client
|
||||
bucket = started_cluster.minio_bucket
|
||||
TABLE_NAME = "test_evolved_schema_" + format_version
|
||||
TABLE_NAME = (
|
||||
"test_evolved_schema_"
|
||||
+ format_version
|
||||
+ "_"
|
||||
+ storage_type
|
||||
+ "_"
|
||||
+ get_uuid_str()
|
||||
)
|
||||
|
||||
write_iceberg_from_df(
|
||||
spark,
|
||||
@ -398,19 +556,25 @@ def test_evolved_schema(started_cluster, format_version):
|
||||
format_version=format_version,
|
||||
)
|
||||
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
storage_type,
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
)
|
||||
|
||||
create_iceberg_table(instance, TABLE_NAME)
|
||||
create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)
|
||||
|
||||
assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100
|
||||
|
||||
expected_data = instance.query(f"SELECT * FROM {TABLE_NAME} order by a, b")
|
||||
|
||||
spark.sql(f"ALTER TABLE {TABLE_NAME} ADD COLUMNS (x bigint)")
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
storage_type,
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
"",
|
||||
)
|
||||
|
||||
error = instance.query_and_get_error(f"SELECT * FROM {TABLE_NAME}")
|
||||
@ -422,12 +586,13 @@ def test_evolved_schema(started_cluster, format_version):
|
||||
assert data == expected_data
|
||||
|
||||
|
||||
def test_row_based_deletes(started_cluster):
|
||||
@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
|
||||
def test_row_based_deletes(started_cluster, storage_type):
|
||||
instance = started_cluster.instances["node1"]
|
||||
spark = started_cluster.spark_session
|
||||
minio_client = started_cluster.minio_client
|
||||
bucket = started_cluster.minio_bucket
|
||||
TABLE_NAME = "test_row_based_deletes"
|
||||
TABLE_NAME = "test_row_based_deletes_" + storage_type + "_" + get_uuid_str()
|
||||
|
||||
spark.sql(
|
||||
f"CREATE TABLE {TABLE_NAME} (id bigint, data string) USING iceberg TBLPROPERTIES ('format-version' = '2', 'write.update.mode'='merge-on-read', 'write.delete.mode'='merge-on-read', 'write.merge.mode'='merge-on-read')"
|
||||
@ -436,17 +601,23 @@ def test_row_based_deletes(started_cluster):
|
||||
f"INSERT INTO {TABLE_NAME} select id, char(id + ascii('a')) from range(100)"
|
||||
)
|
||||
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
storage_type,
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
)
|
||||
|
||||
create_iceberg_table(instance, TABLE_NAME)
|
||||
create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)
|
||||
|
||||
assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100
|
||||
|
||||
spark.sql(f"DELETE FROM {TABLE_NAME} WHERE id < 10")
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
storage_type,
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
"",
|
||||
)
|
||||
|
||||
error = instance.query_and_get_error(f"SELECT * FROM {TABLE_NAME}")
|
||||
@ -454,13 +625,21 @@ def test_row_based_deletes(started_cluster):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("format_version", ["1", "2"])
|
||||
def test_schema_inference(started_cluster, format_version):
|
||||
@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
|
||||
def test_schema_inference(started_cluster, format_version, storage_type):
|
||||
instance = started_cluster.instances["node1"]
|
||||
spark = started_cluster.spark_session
|
||||
minio_client = started_cluster.minio_client
|
||||
bucket = started_cluster.minio_bucket
|
||||
for format in ["Parquet", "ORC", "Avro"]:
|
||||
TABLE_NAME = "test_schema_inference_" + format + "_" + format_version
|
||||
TABLE_NAME = (
|
||||
"test_schema_inference_"
|
||||
+ format
|
||||
+ "_"
|
||||
+ format_version
|
||||
+ "_"
|
||||
+ storage_type
|
||||
+ "_"
|
||||
+ get_uuid_str()
|
||||
)
|
||||
|
||||
# Types time, timestamptz, fixed are not supported in Spark.
|
||||
spark.sql(
|
||||
@ -470,12 +649,16 @@ def test_schema_inference(started_cluster, format_version):
|
||||
spark.sql(
|
||||
f"insert into {TABLE_NAME} select 42, 4242, 42.42, 4242.4242, decimal(42.42), decimal(42.42), decimal(42.42), date('2020-01-01'), timestamp('2020-01-01 20:00:00'), 'hello', binary('hello'), array(1,2,3), map('key', 'value'), struct(42, 'hello'), array(struct(map('key', array(map('key', 42))), struct(42, 'hello')))"
|
||||
)
|
||||
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
storage_type,
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
)
|
||||
|
||||
create_iceberg_table(instance, TABLE_NAME, format)
|
||||
create_iceberg_table(
|
||||
storage_type, instance, TABLE_NAME, started_cluster, format=format
|
||||
)
|
||||
|
||||
res = instance.query(
|
||||
f"DESC {TABLE_NAME} FORMAT TSVRaw", settings={"print_pretty_type_names": 0}
|
||||
@ -510,12 +693,18 @@ def test_schema_inference(started_cluster, format_version):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("format_version", ["1", "2"])
|
||||
def test_metadata_file_selection(started_cluster, format_version):
|
||||
@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
|
||||
def test_metadata_file_selection(started_cluster, format_version, storage_type):
|
||||
instance = started_cluster.instances["node1"]
|
||||
spark = started_cluster.spark_session
|
||||
minio_client = started_cluster.minio_client
|
||||
bucket = started_cluster.minio_bucket
|
||||
TABLE_NAME = "test_metadata_selection_" + format_version
|
||||
TABLE_NAME = (
|
||||
"test_metadata_selection_"
|
||||
+ format_version
|
||||
+ "_"
|
||||
+ storage_type
|
||||
+ "_"
|
||||
+ get_uuid_str()
|
||||
)
|
||||
|
||||
spark.sql(
|
||||
f"CREATE TABLE {TABLE_NAME} (id bigint, data string) USING iceberg TBLPROPERTIES ('format-version' = '2', 'write.update.mode'='merge-on-read', 'write.delete.mode'='merge-on-read', 'write.merge.mode'='merge-on-read')"
|
||||
@ -526,22 +715,31 @@ def test_metadata_file_selection(started_cluster, format_version):
|
||||
f"INSERT INTO {TABLE_NAME} select id, char(id + ascii('a')) from range(10)"
|
||||
)
|
||||
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
storage_type,
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
)
|
||||
|
||||
create_iceberg_table(instance, TABLE_NAME)
|
||||
create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)
|
||||
|
||||
assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 500
|
||||
|
||||
|
||||
@pytest.mark.parametrize("format_version", ["1", "2"])
|
||||
def test_metadata_file_format_with_uuid(started_cluster, format_version):
|
||||
@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"])
|
||||
def test_metadata_file_format_with_uuid(started_cluster, format_version, storage_type):
|
||||
instance = started_cluster.instances["node1"]
|
||||
spark = started_cluster.spark_session
|
||||
minio_client = started_cluster.minio_client
|
||||
bucket = started_cluster.minio_bucket
|
||||
TABLE_NAME = "test_metadata_selection_with_uuid_" + format_version
|
||||
TABLE_NAME = (
|
||||
"test_metadata_selection_with_uuid_"
|
||||
+ format_version
|
||||
+ "_"
|
||||
+ storage_type
|
||||
+ "_"
|
||||
+ get_uuid_str()
|
||||
)
|
||||
|
||||
spark.sql(
|
||||
f"CREATE TABLE {TABLE_NAME} (id bigint, data string) USING iceberg TBLPROPERTIES ('format-version' = '2', 'write.update.mode'='merge-on-read', 'write.delete.mode'='merge-on-read', 'write.merge.mode'='merge-on-read')"
|
||||
@ -555,40 +753,48 @@ def test_metadata_file_format_with_uuid(started_cluster, format_version):
|
||||
for i in range(50):
|
||||
os.rename(
|
||||
f"/iceberg_data/default/{TABLE_NAME}/metadata/v{i + 1}.metadata.json",
|
||||
f"/iceberg_data/default/{TABLE_NAME}/metadata/{str(i).zfill(5)}-{uuid.uuid4()}.metadata.json",
|
||||
f"/iceberg_data/default/{TABLE_NAME}/metadata/{str(i).zfill(5)}-{get_uuid_str()}.metadata.json",
|
||||
)
|
||||
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
storage_type,
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
)
|
||||
|
||||
create_iceberg_table(instance, TABLE_NAME)
|
||||
create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster)
|
||||
|
||||
assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 500
|
||||
|
||||
|
||||
def test_restart_broken(started_cluster):
|
||||
def test_restart_broken_s3(started_cluster):
|
||||
instance = started_cluster.instances["node1"]
|
||||
spark = started_cluster.spark_session
|
||||
TABLE_NAME = "test_restart_broken_table_function_s3" + "_" + get_uuid_str()
|
||||
|
||||
minio_client = started_cluster.minio_client
|
||||
bucket = "broken2"
|
||||
TABLE_NAME = "test_restart_broken_table_function"
|
||||
|
||||
if not minio_client.bucket_exists(bucket):
|
||||
minio_client.make_bucket(bucket)
|
||||
|
||||
parquet_data_path = create_initial_data_file(
|
||||
started_cluster,
|
||||
instance,
|
||||
"SELECT number, toString(number) FROM numbers(100)",
|
||||
write_iceberg_from_df(
|
||||
spark,
|
||||
generate_data(spark, 0, 100),
|
||||
TABLE_NAME,
|
||||
mode="overwrite",
|
||||
format_version="1",
|
||||
)
|
||||
|
||||
write_iceberg_from_file(spark, parquet_data_path, TABLE_NAME, format_version="1")
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
"s3",
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
bucket=bucket,
|
||||
)
|
||||
create_iceberg_table(instance, TABLE_NAME, bucket=bucket)
|
||||
create_iceberg_table("s3", instance, TABLE_NAME, started_cluster, bucket=bucket)
|
||||
assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100
|
||||
|
||||
s3_objects = list_s3_objects(minio_client, bucket, prefix="")
|
||||
@ -613,8 +819,12 @@ def test_restart_broken(started_cluster):
|
||||
|
||||
minio_client.make_bucket(bucket)
|
||||
|
||||
files = upload_directory(
|
||||
minio_client, bucket, f"/iceberg_data/default/{TABLE_NAME}/", ""
|
||||
files = default_upload_directory(
|
||||
started_cluster,
|
||||
"s3",
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
f"/iceberg_data/default/{TABLE_NAME}/",
|
||||
bucket=bucket,
|
||||
)
|
||||
|
||||
assert int(instance.query(f"SELECT count() FROM {TABLE_NAME}")) == 100
|
||||
|
@ -6,8 +6,8 @@ CREATE DATABASE {CLICKHOUSE_DATABASE:Identifier};
|
||||
CREATE TABLE {CLICKHOUSE_DATABASE:Identifier}.A (A UInt8) ENGINE = TinyLog;
|
||||
CREATE TABLE {CLICKHOUSE_DATABASE:Identifier}.B (A UInt8) ENGINE = TinyLog;
|
||||
|
||||
SHOW TABLES from {CLICKHOUSE_DATABASE:Identifier};
|
||||
SHOW TABLES in system where engine like '%System%' and name in ('numbers', 'one');
|
||||
SHOW TABLES FROM {CLICKHOUSE_DATABASE:Identifier};
|
||||
SHOW TABLES IN system WHERE engine LIKE '%System%' AND name IN ('numbers', 'one') AND database = 'system';
|
||||
|
||||
SELECT name, toUInt32(metadata_modification_time) > 0, engine_full, create_table_query FROM system.tables WHERE database = currentDatabase() ORDER BY name FORMAT TSVRaw;
|
||||
|
||||
@ -16,7 +16,7 @@ SELECT name FROM system.tables WHERE is_temporary = 1 AND name = 'test_temporary
|
||||
|
||||
CREATE TABLE {CLICKHOUSE_DATABASE:Identifier}.test_log(id UInt64) ENGINE = Log;
|
||||
CREATE MATERIALIZED VIEW {CLICKHOUSE_DATABASE:Identifier}.test_materialized ENGINE = Log AS SELECT * FROM {CLICKHOUSE_DATABASE:Identifier}.test_log;
|
||||
SELECT dependencies_database, dependencies_table FROM system.tables WHERE name = 'test_log' and database=currentDatabase();
|
||||
SELECT dependencies_database, dependencies_table FROM system.tables WHERE name = 'test_log' AND database=currentDatabase();
|
||||
|
||||
DROP DATABASE {CLICKHOUSE_DATABASE:Identifier};
|
||||
|
||||
|
@ -1,5 +1,4 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-parallel
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
@ -42,6 +41,7 @@ function check()
|
||||
|
||||
while [ "$query_result" != "2.2" ]
|
||||
do
|
||||
sleep 0.2
|
||||
query_result=$($CLICKHOUSE_CLIENT --query "SELECT dictGetFloat64('${CLICKHOUSE_DATABASE}.dict_with_zero_min_lifetime', 'value', toUInt64(2))")
|
||||
done
|
||||
}
|
||||
|
@ -1,5 +1,4 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-random-settings, no-parallel
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
|
@ -1,5 +1,4 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-random-settings, no-parallel
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
|
@ -1,5 +1,4 @@
|
||||
#!/usr/bin/env bash
|
||||
# Tags: no-random-settings, no-parallel
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user