diff --git a/docs/en/getting-started/example-datasets/images/stackoverflow.png b/docs/en/getting-started/example-datasets/images/stackoverflow.png
new file mode 100644
index 00000000000..f31acdc8cc3
Binary files /dev/null and b/docs/en/getting-started/example-datasets/images/stackoverflow.png differ
diff --git a/docs/en/getting-started/example-datasets/stackoverflow.md b/docs/en/getting-started/example-datasets/stackoverflow.md
new file mode 100644
index 00000000000..e982a3c3dfc
--- /dev/null
+++ b/docs/en/getting-started/example-datasets/stackoverflow.md
@@ -0,0 +1,394 @@
+---
+slug: /en/getting-started/example-datasets/stackoverflow
+sidebar_label: Stack Overflow
+sidebar_position: 1
+description: Analyzing Stack Overflow data with ClickHouse
+---
+
+# Analyzing Stack Overflow data with ClickHouse
+
+This dataset contains every `Post`, `User`, `Vote`, `Comment`, `Badge`, `PostHistory`, and `PostLink` created on Stack Overflow.
+
+Users can either download pre-prepared Parquet versions of the data, containing every post up to April 2024, or download the latest data in XML format and load it themselves. Stack Overflow provides updates to this data periodically - historically every three months.
+
+The following diagram shows the schema for the available tables assuming Parquet format.
+
+![Stack Overflow schema](./images/stackoverflow.png)
+
+A description of the schema of this data can be found [here](https://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede).
+
+## Pre-prepared data
+
+We provide a copy of this data in Parquet format, up to date as of April 2024. While small for ClickHouse with respect to the number of rows (60 million posts), this dataset contains significant volumes of text and large String columns.
+
+```sql
+CREATE DATABASE stackoverflow
+```
+
+The following timings are for a 96 GiB, 24 vCPU ClickHouse Cloud cluster located in `eu-west-2`. The dataset is located in `eu-west-3`.
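+
+If you'd like to inspect the Parquet files before loading them, the `s3` table function can be queried directly. The following is an optional sanity check rather than a required step; it prints the inferred schema of a single file:
+
+```sql
+DESCRIBE s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/2020.parquet')
+SETTINGS describe_compact_output = 1
+```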
+
+### Posts
+
+```sql
+CREATE TABLE stackoverflow.posts
+(
+ `Id` Int32 CODEC(Delta(4), ZSTD(1)),
+ `PostTypeId` Enum8('Question' = 1, 'Answer' = 2, 'Wiki' = 3, 'TagWikiExcerpt' = 4, 'TagWiki' = 5, 'ModeratorNomination' = 6, 'WikiPlaceholder' = 7, 'PrivilegeWiki' = 8),
+ `AcceptedAnswerId` UInt32,
+ `CreationDate` DateTime64(3, 'UTC'),
+ `Score` Int32,
+ `ViewCount` UInt32 CODEC(Delta(4), ZSTD(1)),
+ `Body` String,
+ `OwnerUserId` Int32,
+ `OwnerDisplayName` String,
+ `LastEditorUserId` Int32,
+ `LastEditorDisplayName` String,
+ `LastEditDate` DateTime64(3, 'UTC') CODEC(Delta(8), ZSTD(1)),
+ `LastActivityDate` DateTime64(3, 'UTC'),
+ `Title` String,
+ `Tags` String,
+ `AnswerCount` UInt16 CODEC(Delta(2), ZSTD(1)),
+ `CommentCount` UInt8,
+ `FavoriteCount` UInt8,
+ `ContentLicense` LowCardinality(String),
+ `ParentId` String,
+ `CommunityOwnedDate` DateTime64(3, 'UTC'),
+ `ClosedDate` DateTime64(3, 'UTC')
+)
+ENGINE = MergeTree
+PARTITION BY toYear(CreationDate)
+ORDER BY (PostTypeId, toDate(CreationDate), CreationDate)
+
+INSERT INTO stackoverflow.posts SELECT * FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/*.parquet')
+
+0 rows in set. Elapsed: 265.466 sec. Processed 59.82 million rows, 38.07 GB (225.34 thousand rows/s., 143.42 MB/s.)
+```
+
+Posts are also available by year, e.g. [https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/2020.parquet](https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/2020.parquet)
+
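+The per-year files are useful if you only want a subset of the data. For example, a sketch for loading just the 2020 posts into the table created above:
+
+```sql
+INSERT INTO stackoverflow.posts
+SELECT * FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posts/2020.parquet')
+```
+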
+
+### Votes
+
+```sql
+CREATE TABLE stackoverflow.votes
+(
+ `Id` UInt32,
+ `PostId` Int32,
+ `VoteTypeId` UInt8,
+ `CreationDate` DateTime64(3, 'UTC'),
+ `UserId` Int32,
+ `BountyAmount` UInt8
+)
+ENGINE = MergeTree
+ORDER BY (VoteTypeId, CreationDate, PostId, UserId)
+
+INSERT INTO stackoverflow.votes SELECT * FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/votes/*.parquet')
+
+0 rows in set. Elapsed: 21.605 sec. Processed 238.98 million rows, 2.13 GB (11.06 million rows/s., 98.46 MB/s.)
+```
+
+Votes are also available by year, e.g. [https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/votes/2020.parquet](https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/votes/2020.parquet)
+
+
+### Comments
+
+```sql
+CREATE TABLE stackoverflow.comments
+(
+ `Id` UInt32,
+ `PostId` UInt32,
+ `Score` UInt16,
+ `Text` String,
+ `CreationDate` DateTime64(3, 'UTC'),
+ `UserId` Int32,
+ `UserDisplayName` LowCardinality(String)
+)
+ENGINE = MergeTree
+ORDER BY CreationDate
+
+INSERT INTO stackoverflow.comments SELECT * FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/comments/*.parquet')
+
+0 rows in set. Elapsed: 56.593 sec. Processed 90.38 million rows, 11.14 GB (1.60 million rows/s., 196.78 MB/s.)
+```
+
+Comments are also available by year, e.g. [https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/comments/2020.parquet](https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/comments/2020.parquet)
+
+### Users
+
+```sql
+CREATE TABLE stackoverflow.users
+(
+ `Id` Int32,
+ `Reputation` LowCardinality(String),
+ `CreationDate` DateTime64(3, 'UTC') CODEC(Delta(8), ZSTD(1)),
+ `DisplayName` String,
+ `LastAccessDate` DateTime64(3, 'UTC'),
+ `AboutMe` String,
+ `Views` UInt32,
+ `UpVotes` UInt32,
+ `DownVotes` UInt32,
+ `WebsiteUrl` String,
+ `Location` LowCardinality(String),
+ `AccountId` Int32
+)
+ENGINE = MergeTree
+ORDER BY (Id, CreationDate)
+
+INSERT INTO stackoverflow.users SELECT * FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/users.parquet')
+
+0 rows in set. Elapsed: 10.988 sec. Processed 22.48 million rows, 1.36 GB (2.05 million rows/s., 124.10 MB/s.)
+```
+
+### Badges
+
+```sql
+CREATE TABLE stackoverflow.badges
+(
+ `Id` UInt32,
+ `UserId` Int32,
+ `Name` LowCardinality(String),
+ `Date` DateTime64(3, 'UTC'),
+ `Class` Enum8('Gold' = 1, 'Silver' = 2, 'Bronze' = 3),
+ `TagBased` Bool
+)
+ENGINE = MergeTree
+ORDER BY UserId
+
+INSERT INTO stackoverflow.badges SELECT * FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/badges.parquet')
+
+0 rows in set. Elapsed: 6.635 sec. Processed 51.29 million rows, 797.05 MB (7.73 million rows/s., 120.13 MB/s.)
+```
+
+### PostLinks
+
+```sql
+CREATE TABLE stackoverflow.postlinks
+(
+ `Id` UInt64,
+ `CreationDate` DateTime64(3, 'UTC'),
+ `PostId` Int32,
+ `RelatedPostId` Int32,
+ `LinkTypeId` Enum8('Linked' = 1, 'Duplicate' = 3)
+)
+ENGINE = MergeTree
+ORDER BY (PostId, RelatedPostId)
+
+INSERT INTO stackoverflow.postlinks SELECT * FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/postlinks.parquet')
+
+0 rows in set. Elapsed: 1.534 sec. Processed 6.55 million rows, 129.70 MB (4.27 million rows/s., 84.57 MB/s.)
+```
+
+### PostHistory
+
+```sql
+CREATE TABLE stackoverflow.posthistory
+(
+ `Id` UInt64,
+ `PostHistoryTypeId` UInt8,
+ `PostId` Int32,
+ `RevisionGUID` String,
+ `CreationDate` DateTime64(3, 'UTC'),
+ `UserId` Int32,
+ `Text` String,
+ `ContentLicense` LowCardinality(String),
+ `Comment` String,
+ `UserDisplayName` String
+)
+ENGINE = MergeTree
+ORDER BY (CreationDate, PostId)
+
+INSERT INTO stackoverflow.posthistory SELECT * FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/stackoverflow/parquet/posthistory/*.parquet')
+
+0 rows in set. Elapsed: 422.795 sec. Processed 160.79 million rows, 67.08 GB (380.30 thousand rows/s., 158.67 MB/s.)
+```
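+
+Once all of the above tables are loaded, the `system.parts` table gives a feel for how much disk the dataset occupies. The following is a quick check, assuming the `stackoverflow` database created above:
+
+```sql
+SELECT
+    `table`,
+    formatReadableSize(sum(data_compressed_bytes)) AS compressed,
+    formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed
+FROM system.parts
+WHERE database = 'stackoverflow' AND active
+GROUP BY `table`
+ORDER BY sum(data_compressed_bytes) DESC
+```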
+
+## Original dataset
+
+The original dataset is available in compressed (7zip) XML format at [https://archive.org/download/stackexchange](https://archive.org/download/stackexchange) - files with prefix `stackoverflow.com*`.
+
+### Download
+
+```bash
+wget https://archive.org/download/stackexchange/stackoverflow.com-Badges.7z
+wget https://archive.org/download/stackexchange/stackoverflow.com-Comments.7z
+wget https://archive.org/download/stackexchange/stackoverflow.com-PostHistory.7z
+wget https://archive.org/download/stackexchange/stackoverflow.com-PostLinks.7z
+wget https://archive.org/download/stackexchange/stackoverflow.com-Posts.7z
+wget https://archive.org/download/stackexchange/stackoverflow.com-Users.7z
+wget https://archive.org/download/stackexchange/stackoverflow.com-Votes.7z
+```
+
+These files are up to 35GB in size and can take around 30 minutes to download, depending on your internet connection - the download server throttles at around 20MB/sec.
+
+### Convert to JSON
+
+At the time of writing, ClickHouse does not have native support for XML as an input format. To load the data into ClickHouse, we therefore first convert it to NDJSON.
+
+To convert XML to JSON we recommend the [`xq`](https://github.com/kislyuk/yq) Linux tool, a simple `jq` wrapper for XML documents.
+
+Install xq and jq:
+
+```bash
+sudo apt install jq
+pip install yq
+```
+
+The following steps apply to any of the above files. We use the `stackoverflow.com-Posts.7z` file as an example. Modify as required.
+
+Extract the file using [p7zip](https://p7zip.sourceforge.net/). This will produce a single XML file - in this case `Posts.xml`.
+
+> Files are compressed approximately 4.5x. At 22GB compressed, the posts file requires around 97GB uncompressed.
+
+```bash
+p7zip -d stackoverflow.com-Posts.7z
+```
+
+The following splits the XML file into sub-files, each containing 10000 rows.
+
+```bash
+mkdir posts
+cd posts
+# the following splits the input XML file into sub-files of 10000 rows,
+# wrapping each chunk in <rows></rows> tags so it remains a valid XML document
+tail +3 ../Posts.xml | head -n -1 | split -l 10000 --filter='{ printf "<rows>\n"; cat - ; printf "</rows>\n"; } > $FILE' -
+```
+
+After running the above, users will have a set of files, each with 10000 lines. This keeps the memory overhead of the next command manageable (the XML to JSON conversion is done in memory).
+
+```bash
+find . -maxdepth 1 -type f -exec xq -c '.rows.row[]' {} \; | sed -e 's:"@:":g' > posts.json
+```
+
+The above command will produce a single `posts.json` file.
+
+Load the data into ClickHouse with the following command. Note that the schema is specified for the `posts.json` file. This will need to be adjusted per data type to align with the target table.
+
+```bash
+clickhouse local --query "SELECT * FROM file('posts.json', JSONEachRow, 'Id Int32, PostTypeId UInt8, AcceptedAnswerId UInt32, CreationDate DateTime64(3, \'UTC\'), Score Int32, ViewCount UInt32, Body String, OwnerUserId Int32, OwnerDisplayName String, LastEditorUserId Int32, LastEditorDisplayName String, LastEditDate DateTime64(3, \'UTC\'), LastActivityDate DateTime64(3, \'UTC\'), Title String, Tags String, AnswerCount UInt16, CommentCount UInt8, FavoriteCount UInt8, ContentLicense String, ParentId String, CommunityOwnedDate DateTime64(3, \'UTC\'), ClosedDate DateTime64(3, \'UTC\')') FORMAT Native" | clickhouse client --host <host> --secure --password <password> --query "INSERT INTO stackoverflow.posts_v2 FORMAT Native"
+```
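+
+As a quick sanity check - assuming the insert above targeted a `stackoverflow.posts_v2` table with the same schema as `stackoverflow.posts` - you can compare the row counts of the XML-derived and Parquet-derived tables:
+
+```sql
+SELECT
+    (SELECT count() FROM stackoverflow.posts) AS parquet_posts,
+    (SELECT count() FROM stackoverflow.posts_v2) AS xml_posts
+```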
+
+## Example queries
+
+A few simple questions to get you started.
+
+### Most popular tags on Stack Overflow
+
+```sql
+
+SELECT
+ arrayJoin(arrayFilter(t -> (t != ''), splitByChar('|', Tags))) AS Tags,
+ count() AS c
+FROM stackoverflow.posts
+GROUP BY Tags
+ORDER BY c DESC
+LIMIT 10
+
+┌─Tags───────┬───────c─┐
+│ javascript │ 2527130 │
+│ python │ 2189638 │
+│ java │ 1916156 │
+│ c# │ 1614236 │
+│ php │ 1463901 │
+│ android │ 1416442 │
+│ html │ 1186567 │
+│ jquery │ 1034621 │
+│ c++ │ 806202 │
+│ css │ 803755 │
+└────────────┴─────────┘
+
+10 rows in set. Elapsed: 1.013 sec. Processed 59.82 million rows, 1.21 GB (59.07 million rows/s., 1.19 GB/s.)
+Peak memory usage: 224.03 MiB.
+```
+
+### User with the most answers (active accounts)
+
+Only accounts with a `UserId` are considered.
+
+```sql
+SELECT
+ any(OwnerUserId) UserId,
+ OwnerDisplayName,
+ count() AS c
+FROM stackoverflow.posts WHERE OwnerDisplayName != '' AND PostTypeId='Answer' AND OwnerUserId != 0
+GROUP BY OwnerDisplayName
+ORDER BY c DESC
+LIMIT 5
+
+┌─UserId─┬─OwnerDisplayName─┬────c─┐
+│ 22656 │ Jon Skeet │ 2727 │
+│ 23354 │ Marc Gravell │ 2150 │
+│ 12950 │ tvanfosson │ 1530 │
+│ 3043 │ Joel Coehoorn │ 1438 │
+│ 10661 │ S.Lott │ 1087 │
+└────────┴──────────────────┴──────┘
+
+5 rows in set. Elapsed: 0.154 sec. Processed 35.83 million rows, 193.39 MB (232.33 million rows/s., 1.25 GB/s.)
+Peak memory usage: 206.45 MiB.
+```
+
+### ClickHouse related posts with the most views
+
+```sql
+SELECT
+ Id,
+ Title,
+ ViewCount,
+ AnswerCount
+FROM stackoverflow.posts
+WHERE Title ILIKE '%ClickHouse%'
+ORDER BY ViewCount DESC
+LIMIT 10
+
+┌───────Id─┬─Title────────────────────────────────────────────────────────────────────────────┬─ViewCount─┬─AnswerCount─┐
+│ 52355143 │ Is it possible to delete old records from clickhouse table? │ 41462 │ 3 │
+│ 37954203 │ Clickhouse Data Import │ 38735 │ 3 │
+│ 37901642 │ Updating data in Clickhouse │ 36236 │ 6 │
+│ 58422110 │ Pandas: How to insert dataframe into Clickhouse │ 29731 │ 4 │
+│ 63621318 │ DBeaver - Clickhouse - SQL Error [159] .. Read timed out │ 27350 │ 1 │
+│ 47591813 │ How to filter clickhouse table by array column contents? │ 27078 │ 2 │
+│ 58728436 │ How to search the string in query with case insensitive on Clickhouse database? │ 26567 │ 3 │
+│ 65316905 │ Clickhouse: DB::Exception: Memory limit (for query) exceeded │ 24899 │ 2 │
+│ 49944865 │ How to add a column in clickhouse │ 24424 │ 1 │
+│ 59712399 │ How to cast date Strings to DateTime format with extended parsing in ClickHouse? │ 22620 │ 1 │
+└──────────┴──────────────────────────────────────────────────────────────────────────────────┴───────────┴─────────────┘
+
+10 rows in set. Elapsed: 0.472 sec. Processed 59.82 million rows, 1.91 GB (126.63 million rows/s., 4.03 GB/s.)
+Peak memory usage: 240.01 MiB.
+```
+
+### Most controversial posts
+
+```sql
+SELECT
+ Id,
+ Title,
+ UpVotes,
+ DownVotes,
+ abs(UpVotes - DownVotes) AS Controversial_ratio
+FROM stackoverflow.posts
+INNER JOIN
+(
+ SELECT
+ PostId,
+ countIf(VoteTypeId = 2) AS UpVotes,
+ countIf(VoteTypeId = 3) AS DownVotes
+ FROM stackoverflow.votes
+ GROUP BY PostId
+ HAVING (UpVotes > 10) AND (DownVotes > 10)
+) AS votes ON posts.Id = votes.PostId
+WHERE Title != ''
+ORDER BY Controversial_ratio ASC
+LIMIT 3
+
+┌───────Id─┬─Title─────────────────────────────────────────────┬─UpVotes─┬─DownVotes─┬─Controversial_ratio─┐
+│ 583177 │ VB.NET Infinite For Loop │ 12 │ 12 │ 0 │
+│ 9756797 │ Read console input as enumerable - one statement? │ 16 │ 16 │ 0 │
+│ 13329132 │ What's the point of ARGV in Ruby? │ 22 │ 22 │ 0 │
+└──────────┴───────────────────────────────────────────────────┴─────────┴───────────┴─────────────────────┘
+
+3 rows in set. Elapsed: 4.779 sec. Processed 298.80 million rows, 3.16 GB (62.52 million rows/s., 661.05 MB/s.)
+Peak memory usage: 6.05 GiB.
+```
+
+## Attribution
+
+We thank Stack Overflow for providing this data under the `cc-by-sa 4.0` license, acknowledging their efforts and the original source of the data at [https://archive.org/details/stackexchange](https://archive.org/details/stackexchange).
diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp
index 503cb0fb97d..670dc378b97 100644
--- a/programs/local/LocalServer.cpp
+++ b/programs/local/LocalServer.cpp
@@ -150,7 +150,6 @@ void LocalServer::initialize(Poco::Util::Application & self)
getClientConfiguration().getUInt("max_io_thread_pool_free_size", 0),
getClientConfiguration().getUInt("io_thread_pool_queue_size", 10000));
-
const size_t active_parts_loading_threads = getClientConfiguration().getUInt("max_active_parts_loading_thread_pool_size", 64);
getActivePartsLoadingThreadPool().initialize(
active_parts_loading_threads,
diff --git a/programs/main.cpp b/programs/main.cpp
index c270388f17f..61e2bc18ed7 100644
--- a/programs/main.cpp
+++ b/programs/main.cpp
@@ -13,6 +13,7 @@
#include
+#include "config.h"
#include "config_tools.h"
#include
@@ -439,6 +440,14 @@ extern "C"
}
#endif
+/// Prevent messages from jemalloc in the release build.
+/// Some of these messages are non-actionable for users, such as:
+/// : Number of CPUs detected is not deterministic. Per-CPU arena disabled.
+#if USE_JEMALLOC && defined(NDEBUG) && !defined(SANITIZER)
+extern "C" void (*malloc_message)(void *, const char *s);
+__attribute__((constructor(0))) void init_je_malloc_message() { malloc_message = [](void *, const char *){}; }
+#endif
+
/// This allows to implement assert to forbid initialization of a class in static constructors.
/// Usage:
///
diff --git a/src/Client/LocalConnection.cpp b/src/Client/LocalConnection.cpp
index 3b2c14ee4f9..072184e0a66 100644
--- a/src/Client/LocalConnection.cpp
+++ b/src/Client/LocalConnection.cpp
@@ -358,22 +358,18 @@ bool LocalConnection::poll(size_t)
if (!state->is_finished)
{
- if (send_progress && (state->after_send_progress.elapsedMicroseconds() >= query_context->getSettingsRef().interactive_delay))
- {
- state->after_send_progress.restart();
- next_packet_type = Protocol::Server::Progress;
+ if (needSendProgressOrMetrics())
return true;
- }
-
- if (send_profile_events && (state->after_send_profile_events.elapsedMicroseconds() >= query_context->getSettingsRef().interactive_delay))
- {
- sendProfileEvents();
- return true;
- }
try
{
- pollImpl();
+ while (pollImpl())
+ {
+ LOG_DEBUG(&Poco::Logger::get("LocalConnection"), "Executor timeout encountered, will retry");
+
+ if (needSendProgressOrMetrics())
+ return true;
+ }
}
catch (const Exception & e)
{
@@ -468,12 +464,34 @@ bool LocalConnection::poll(size_t)
return false;
}
+bool LocalConnection::needSendProgressOrMetrics()
+{
+ if (send_progress && (state->after_send_progress.elapsedMicroseconds() >= query_context->getSettingsRef().interactive_delay))
+ {
+ state->after_send_progress.restart();
+ next_packet_type = Protocol::Server::Progress;
+ return true;
+ }
+
+ if (send_profile_events && (state->after_send_profile_events.elapsedMicroseconds() >= query_context->getSettingsRef().interactive_delay))
+ {
+ sendProfileEvents();
+ return true;
+ }
+
+ return false;
+}
+
bool LocalConnection::pollImpl()
{
Block block;
auto next_read = pullBlock(block);
- if (block && !state->io.null_format)
+ if (!block && next_read)
+ {
+ return true;
+ }
+ else if (block && !state->io.null_format)
{
state->block.emplace(block);
}
@@ -482,7 +500,7 @@ bool LocalConnection::pollImpl()
state->is_finished = true;
}
- return true;
+ return false;
}
Packet LocalConnection::receivePacket()
diff --git a/src/Client/LocalConnection.h b/src/Client/LocalConnection.h
index 899d134cce5..fb6fa1b55eb 100644
--- a/src/Client/LocalConnection.h
+++ b/src/Client/LocalConnection.h
@@ -151,8 +151,11 @@ private:
void sendProfileEvents();
+ /// Returns true on executor timeout, meaning a retryable error.
bool pollImpl();
+ bool needSendProgressOrMetrics();
+
ContextMutablePtr query_context;
Session session;
diff --git a/src/Interpreters/Cache/FileSegment.cpp b/src/Interpreters/Cache/FileSegment.cpp
index 61a356fa3c3..838ca0b491e 100644
--- a/src/Interpreters/Cache/FileSegment.cpp
+++ b/src/Interpreters/Cache/FileSegment.cpp
@@ -187,13 +187,6 @@ size_t FileSegment::getDownloadedSize() const
return downloaded_size;
}
-void FileSegment::setDownloadedSize(size_t delta)
-{
- auto lk = lock();
- downloaded_size += delta;
- assert(downloaded_size == std::filesystem::file_size(getPath()));
-}
-
bool FileSegment::isDownloaded() const
{
auto lk = lock();
@@ -311,6 +304,11 @@ FileSegment::RemoteFileReaderPtr FileSegment::getRemoteFileReader()
return remote_file_reader;
}
+FileSegment::LocalCacheWriterPtr FileSegment::getLocalCacheWriter()
+{
+ return cache_writer;
+}
+
void FileSegment::resetRemoteFileReader()
{
auto lk = lock();
@@ -340,33 +338,31 @@ void FileSegment::setRemoteFileReader(RemoteFileReaderPtr remote_file_reader_)
remote_file_reader = remote_file_reader_;
}
-void FileSegment::write(char * from, size_t size, size_t offset)
+void FileSegment::write(char * from, size_t size, size_t offset_in_file)
{
ProfileEventTimeIncrement watch(ProfileEvents::FileSegmentWriteMicroseconds);
-
- if (!size)
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Writing zero size is not allowed");
-
+ auto file_segment_path = getPath();
{
- auto lk = lock();
- assertIsDownloaderUnlocked("write", lk);
- assertNotDetachedUnlocked(lk);
- }
+ if (!size)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Writing zero size is not allowed");
- const auto file_segment_path = getPath();
+ {
+ auto lk = lock();
+ assertIsDownloaderUnlocked("write", lk);
+ assertNotDetachedUnlocked(lk);
+ }
- {
if (download_state != State::DOWNLOADING)
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Expected DOWNLOADING state, got {}", stateToString(download_state));
const size_t first_non_downloaded_offset = getCurrentWriteOffset();
- if (offset != first_non_downloaded_offset)
+ if (offset_in_file != first_non_downloaded_offset)
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Attempt to write {} bytes to offset: {}, but current write offset is {}",
- size, offset, first_non_downloaded_offset);
+ size, offset_in_file, first_non_downloaded_offset);
const size_t current_downloaded_size = getDownloadedSize();
chassert(reserved_size >= current_downloaded_size);
@@ -396,10 +392,10 @@ void FileSegment::write(char * from, size_t size, size_t offset)
#endif
if (!cache_writer)
-        cache_writer = std::make_unique<WriteBufferFromFile>(file_segment_path, /* buf_size */0);
+        cache_writer = std::make_unique<WriteBufferFromFile>(getPath(), /* buf_size */0);
/// Size is equal to offset as offset for write buffer points to data end.
- cache_writer->set(from, size, /* offset */size);
+ cache_writer->set(from, /* size */size, /* offset */size);
/// Reset the buffer when finished.
SCOPE_EXIT({ cache_writer->set(nullptr, 0); });
/// Flush the buffer.
@@ -435,7 +431,6 @@ void FileSegment::write(char * from, size_t size, size_t offset)
}
throw;
-
}
catch (Exception & e)
{
@@ -445,7 +440,7 @@ void FileSegment::write(char * from, size_t size, size_t offset)
throw;
}
- chassert(getCurrentWriteOffset() == offset + size);
+ chassert(getCurrentWriteOffset() == offset_in_file + size);
}
FileSegment::State FileSegment::wait(size_t offset)
@@ -828,7 +823,7 @@ bool FileSegment::assertCorrectnessUnlocked(const FileSegmentGuard::Lock & lock)
};
const auto file_path = getPath();
- if (segment_kind != FileSegmentKind::Temporary)
+
{
std::lock_guard lk(write_mutex);
if (downloaded_size == 0)
diff --git a/src/Interpreters/Cache/FileSegment.h b/src/Interpreters/Cache/FileSegment.h
index f28482a1ce4..d6b37b60dc1 100644
--- a/src/Interpreters/Cache/FileSegment.h
+++ b/src/Interpreters/Cache/FileSegment.h
@@ -48,7 +48,7 @@ friend class FileCache; /// Because of reserved_size in tryReserve().
public:
using Key = FileCacheKey;
     using RemoteFileReaderPtr = std::shared_ptr<ReadBufferFromFileBase>;
-    using LocalCacheWriterPtr = std::unique_ptr<WriteBufferFromFile>;
+    using LocalCacheWriterPtr = std::shared_ptr<WriteBufferFromFile>;
using Downloader = std::string;
using DownloaderId = std::string;
using Priority = IFileCachePriority;
@@ -204,7 +204,7 @@ public:
bool reserve(size_t size_to_reserve, size_t lock_wait_timeout_milliseconds, FileCacheReserveStat * reserve_stat = nullptr);
/// Write data into reserved space.
- void write(char * from, size_t size, size_t offset);
+ void write(char * from, size_t size, size_t offset_in_file);
// Invariant: if state() != DOWNLOADING and remote file reader is present, the reader's
// available() == 0, and getFileOffsetOfBufferEnd() == our getCurrentWriteOffset().
@@ -212,6 +212,7 @@ public:
// The reader typically requires its internal_buffer to be assigned from the outside before
// calling next().
RemoteFileReaderPtr getRemoteFileReader();
+ LocalCacheWriterPtr getLocalCacheWriter();
RemoteFileReaderPtr extractRemoteFileReader();
@@ -219,8 +220,6 @@ public:
void setRemoteFileReader(RemoteFileReaderPtr remote_file_reader_);
- void setDownloadedSize(size_t delta);
-
void setDownloadFailed();
private:
diff --git a/src/Interpreters/Cache/Metadata.cpp b/src/Interpreters/Cache/Metadata.cpp
index 5ed4ccdbeca..1d23278a255 100644
--- a/src/Interpreters/Cache/Metadata.cpp
+++ b/src/Interpreters/Cache/Metadata.cpp
@@ -944,14 +944,7 @@ KeyMetadata::iterator LockedKey::removeFileSegmentImpl(
try
{
const auto path = key_metadata->getFileSegmentPath(*file_segment);
- if (file_segment->segment_kind == FileSegmentKind::Temporary)
- {
- /// FIXME: For temporary file segment the requirement is not as strong because
- /// the implementation of "temporary data in cache" creates files in advance.
- if (fs::exists(path))
- fs::remove(path);
- }
- else if (file_segment->downloaded_size == 0)
+ if (file_segment->downloaded_size == 0)
{
chassert(!fs::exists(path));
}
diff --git a/src/Interpreters/Cache/WriteBufferToFileSegment.cpp b/src/Interpreters/Cache/WriteBufferToFileSegment.cpp
index a593ebfdab2..e654d091561 100644
--- a/src/Interpreters/Cache/WriteBufferToFileSegment.cpp
+++ b/src/Interpreters/Cache/WriteBufferToFileSegment.cpp
@@ -4,6 +4,7 @@
#include
#include
#include
+#include
#include
@@ -33,21 +34,20 @@ namespace
}
WriteBufferToFileSegment::WriteBufferToFileSegment(FileSegment * file_segment_)
-    : WriteBufferFromFileDecorator(std::make_unique<WriteBufferFromFile>(file_segment_->getPath()))
+ : WriteBufferFromFileBase(DBMS_DEFAULT_BUFFER_SIZE, nullptr, 0)
, file_segment(file_segment_)
, reserve_space_lock_wait_timeout_milliseconds(getCacheLockWaitTimeout())
{
}
WriteBufferToFileSegment::WriteBufferToFileSegment(FileSegmentsHolderPtr segment_holder_)
- : WriteBufferFromFileDecorator(
- segment_holder_->size() == 1
-        ? std::make_unique<WriteBufferFromFile>(segment_holder_->front().getPath())
- : throw Exception(ErrorCodes::LOGICAL_ERROR, "WriteBufferToFileSegment can be created only from single segment"))
+ : WriteBufferFromFileBase(DBMS_DEFAULT_BUFFER_SIZE, nullptr, 0)
, file_segment(&segment_holder_->front())
, segment_holder(std::move(segment_holder_))
, reserve_space_lock_wait_timeout_milliseconds(getCacheLockWaitTimeout())
{
+ if (segment_holder->size() != 1)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "WriteBufferToFileSegment can be created only from single segment");
}
/// If it throws an exception, the file segment will be incomplete, so you should not use it in the future.
@@ -82,9 +82,6 @@ void WriteBufferToFileSegment::nextImpl()
reserve_stat_msg += fmt::format("{} hold {}, can release {}; ",
toString(kind), ReadableSize(stat.non_releasable_size), ReadableSize(stat.releasable_size));
- if (std::filesystem::exists(file_segment->getPath()))
- std::filesystem::remove(file_segment->getPath());
-
throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Failed to reserve {} bytes for {}: {}(segment info: {})",
bytes_to_write,
file_segment->getKind() == FileSegmentKind::Temporary ? "temporary file" : "the file in cache",
@@ -95,17 +92,37 @@ void WriteBufferToFileSegment::nextImpl()
try
{
- SwapHelper swap(*this, *impl);
/// Write data to the underlying buffer.
- impl->next();
+ file_segment->write(working_buffer.begin(), bytes_to_write, written_bytes);
+ written_bytes += bytes_to_write;
}
catch (...)
{
LOG_WARNING(getLogger("WriteBufferToFileSegment"), "Failed to write to the underlying buffer ({})", file_segment->getInfoForLog());
throw;
}
+}
- file_segment->setDownloadedSize(bytes_to_write);
+void WriteBufferToFileSegment::finalizeImpl()
+{
+ next();
+ auto cache_writer = file_segment->getLocalCacheWriter();
+ if (cache_writer)
+ {
+ SwapHelper swap(*this, *cache_writer);
+ cache_writer->finalize();
+ }
+}
+
+void WriteBufferToFileSegment::sync()
+{
+ next();
+ auto cache_writer = file_segment->getLocalCacheWriter();
+ if (cache_writer)
+ {
+ SwapHelper swap(*this, *cache_writer);
+ cache_writer->sync();
+ }
}
std::unique_ptr WriteBufferToFileSegment::getReadBufferImpl()
@@ -114,7 +131,10 @@ std::unique_ptr WriteBufferToFileSegment::getReadBufferImpl()
* because in case destructor called without `getReadBufferImpl` called, data won't be read.
*/
finalize();
-    return std::make_unique<ReadBufferFromFile>(file_segment->getPath());
+    if (file_segment->getDownloadedSize() > 0)
+        return std::make_unique<ReadBufferFromFile>(file_segment->getPath());
+ else
+ return std::make_unique();
}
}
diff --git a/src/Interpreters/Cache/WriteBufferToFileSegment.h b/src/Interpreters/Cache/WriteBufferToFileSegment.h
index c4b0491f8c0..4719dd4be89 100644
--- a/src/Interpreters/Cache/WriteBufferToFileSegment.h
+++ b/src/Interpreters/Cache/WriteBufferToFileSegment.h
@@ -9,7 +9,7 @@ namespace DB
class FileSegment;
-class WriteBufferToFileSegment : public WriteBufferFromFileDecorator, public IReadableWriteBuffer
+class WriteBufferToFileSegment : public WriteBufferFromFileBase, public IReadableWriteBuffer
{
public:
explicit WriteBufferToFileSegment(FileSegment * file_segment_);
@@ -17,6 +17,13 @@ public:
void nextImpl() override;
+ std::string getFileName() const override { return file_segment->getPath(); }
+
+ void sync() override;
+
+protected:
+ void finalizeImpl() override;
+
private:
     std::unique_ptr<ReadBuffer> getReadBufferImpl() override;
@@ -29,6 +36,7 @@ private:
FileSegmentsHolderPtr segment_holder;
const size_t reserve_space_lock_wait_timeout_milliseconds;
+ size_t written_bytes = 0;
};
diff --git a/src/Interpreters/TemporaryDataOnDisk.cpp b/src/Interpreters/TemporaryDataOnDisk.cpp
index a74b5bba2b9..7f0fb8cd6ca 100644
--- a/src/Interpreters/TemporaryDataOnDisk.cpp
+++ b/src/Interpreters/TemporaryDataOnDisk.cpp
@@ -3,6 +3,8 @@
#include
#include
+#include
+#include
#include
#include
#include
@@ -224,25 +226,37 @@ struct TemporaryFileStream::OutputWriter
bool finalized = false;
};
-TemporaryFileStream::Reader::Reader(const String & path, const Block & header_, size_t size)
- : in_file_buf(path, size ? std::min(DBMS_DEFAULT_BUFFER_SIZE, size) : DBMS_DEFAULT_BUFFER_SIZE)
- , in_compressed_buf(in_file_buf)
- , in_reader(in_compressed_buf, header_, DBMS_TCP_PROTOCOL_VERSION)
+TemporaryFileStream::Reader::Reader(const String & path_, const Block & header_, size_t size_)
+ : path(path_)
+ , size(size_ ? std::min(size_, DBMS_DEFAULT_BUFFER_SIZE) : DBMS_DEFAULT_BUFFER_SIZE)
+ , header(header_)
{
LOG_TEST(getLogger("TemporaryFileStream"), "Reading {} from {}", header_.dumpStructure(), path);
}
-TemporaryFileStream::Reader::Reader(const String & path, size_t size)
- : in_file_buf(path, size ? std::min(DBMS_DEFAULT_BUFFER_SIZE, size) : DBMS_DEFAULT_BUFFER_SIZE)
- , in_compressed_buf(in_file_buf)
- , in_reader(in_compressed_buf, DBMS_TCP_PROTOCOL_VERSION)
+TemporaryFileStream::Reader::Reader(const String & path_, size_t size_)
+ : path(path_)
+ , size(size_ ? std::min(size_, DBMS_DEFAULT_BUFFER_SIZE) : DBMS_DEFAULT_BUFFER_SIZE)
{
LOG_TEST(getLogger("TemporaryFileStream"), "Reading from {}", path);
}
Block TemporaryFileStream::Reader::read()
{
- return in_reader.read();
+ if (!in_reader)
+ {
+ if (fs::exists(path))
+            in_file_buf = std::make_unique<ReadBufferFromFile>(path, size);
+ else
+ in_file_buf = std::make_unique();
+
+        in_compressed_buf = std::make_unique<CompressedReadBuffer>(*in_file_buf);
+        if (header.has_value())
+            in_reader = std::make_unique<NativeReader>(*in_compressed_buf, header.value(), DBMS_TCP_PROTOCOL_VERSION);
+        else
+            in_reader = std::make_unique<NativeReader>(*in_compressed_buf, DBMS_TCP_PROTOCOL_VERSION);
+ }
+ return in_reader->read();
}
TemporaryFileStream::TemporaryFileStream(TemporaryFileOnDiskHolder file_, const Block & header_, TemporaryDataOnDisk * parent_)
diff --git a/src/Interpreters/TemporaryDataOnDisk.h b/src/Interpreters/TemporaryDataOnDisk.h
index 488eed70da9..d541c93e031 100644
--- a/src/Interpreters/TemporaryDataOnDisk.h
+++ b/src/Interpreters/TemporaryDataOnDisk.h
@@ -151,9 +151,13 @@ public:
Block read();
- ReadBufferFromFile in_file_buf;
- CompressedReadBuffer in_compressed_buf;
- NativeReader in_reader;
+ const std::string path;
+ const size_t size;
+    const std::optional<Block> header;
+
+ std::unique_ptr in_file_buf;
+    std::unique_ptr<CompressedReadBuffer> in_compressed_buf;
+    std::unique_ptr<NativeReader> in_reader;
};
struct Stat
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index f9cc65871fe..419e5dbbd86 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -1759,11 +1759,14 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks, std::optional runner(getActivePartsLoadingThreadPool().get(), "ActiveParts");
+ bool all_disks_are_readonly = true;
for (size_t i = 0; i < disks.size(); ++i)
{
const auto & disk_ptr = disks[i];
if (disk_ptr->isBroken())
continue;
+ if (!disk_ptr->isReadOnly())
+ all_disks_are_readonly = false;
auto & disk_parts = parts_to_load_by_disk[i];
auto & unexpected_disk_parts = unexpected_parts_to_load_by_disk[i];
@@ -1916,7 +1919,6 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks, std::optionalrenameToDetached("broken-on-start"); /// detached parts must not have '_' in prefixes
@@ -1961,7 +1963,8 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks, std::optional /dev/null"
+ test: >
+ ldapsearch -x -H ldap://localhost:$$LDAP_PORT_NUMBER -D $$LDAP_ADMIN_DN -w $$LDAP_ADMIN_PASSWORD -b $$LDAP_ROOT
+ | grep -c -E "member: cn=j(ohn|ane)doe"
+ | grep 2 >> /dev/null
interval: 10s
retries: 10
timeout: 2s
diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py
index 41c162217d2..544b06cca1b 100644
--- a/tests/integration/helpers/cluster.py
+++ b/tests/integration/helpers/cluster.py
@@ -2640,7 +2640,9 @@ class ClickHouseCluster:
[
"bash",
"-c",
- f"/opt/bitnami/openldap/bin/ldapsearch -x -H ldap://{self.ldap_host}:{self.ldap_port} -D cn=admin,dc=example,dc=org -w clickhouse -b dc=example,dc=org",
+ f"/opt/bitnami/openldap/bin/ldapsearch -x -H ldap://{self.ldap_host}:{self.ldap_port} -D cn=admin,dc=example,dc=org -w clickhouse -b dc=example,dc=org"
+ f'| grep -c -E "member: cn=j(ohn|ane)doe"'
+ f"| grep 2 >> /dev/null",
],
user="root",
)
diff --git a/tests/queries/0_stateless/00731_long_merge_tree_select_opened_files.sh b/tests/queries/0_stateless/00731_long_merge_tree_select_opened_files.sh
index 1bb4dbd34de..af746c43da9 100755
--- a/tests/queries/0_stateless/00731_long_merge_tree_select_opened_files.sh
+++ b/tests/queries/0_stateless/00731_long_merge_tree_select_opened_files.sh
@@ -1,5 +1,5 @@
#!/usr/bin/env bash
-# Tags: long, no-s3-storage
+# Tags: long, no-s3-storage, no-tsan
# no-s3 because read FileOpen metric
set -e
@@ -31,6 +31,6 @@ $CLICKHOUSE_CLIENT $settings -q "$touching_many_parts_query" &> /dev/null
$CLICKHOUSE_CLIENT $settings -q "SYSTEM FLUSH LOGS"
-$CLICKHOUSE_CLIENT $settings -q "SELECT ProfileEvents['FileOpen'] as opened_files FROM system.query_log WHERE query='$touching_many_parts_query' and current_database = currentDatabase() ORDER BY event_time DESC, opened_files DESC LIMIT 1;"
+$CLICKHOUSE_CLIENT $settings -q "SELECT ProfileEvents['FileOpen'] as opened_files FROM system.query_log WHERE query = '$touching_many_parts_query' AND current_database = currentDatabase() AND event_date >= yesterday() ORDER BY event_time DESC, opened_files DESC LIMIT 1;"
$CLICKHOUSE_CLIENT $settings -q "DROP TABLE IF EXISTS merge_tree_table;"
diff --git a/tests/queries/0_stateless/01103_check_cpu_instructions_at_startup.reference b/tests/queries/0_stateless/01103_check_cpu_instructions_at_startup.reference
index 8984d35930a..03ed07cf1a4 100644
--- a/tests/queries/0_stateless/01103_check_cpu_instructions_at_startup.reference
+++ b/tests/queries/0_stateless/01103_check_cpu_instructions_at_startup.reference
@@ -2,6 +2,4 @@ Instruction check fail. The CPU does not support SSSE3 instruction set.
Instruction check fail. The CPU does not support SSE4.1 instruction set.
Instruction check fail. The CPU does not support SSE4.2 instruction set.
Instruction check fail. The CPU does not support POPCNT instruction set.
-: MADV_DONTNEED does not work (memset will be used instead)
-: (This is the expected behaviour if you are running under QEMU)
1
diff --git a/tests/queries/0_stateless/01103_check_cpu_instructions_at_startup.sh b/tests/queries/0_stateless/01103_check_cpu_instructions_at_startup.sh
index 01047aeb9ab..c37f1f95374 100755
--- a/tests/queries/0_stateless/01103_check_cpu_instructions_at_startup.sh
+++ b/tests/queries/0_stateless/01103_check_cpu_instructions_at_startup.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
# Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug, no-fasttest, no-cpu-aarch64
-# Tag no-fasttest: avoid dependency on qemu -- invonvenient when running locally
+# Tag no-fasttest: avoid dependency on qemu -- inconvenient when running locally
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
diff --git a/tests/queries/0_stateless/01502_jemalloc_percpu_arena.reference b/tests/queries/0_stateless/01502_jemalloc_percpu_arena.reference
index fe093e39a56..5accb577786 100644
--- a/tests/queries/0_stateless/01502_jemalloc_percpu_arena.reference
+++ b/tests/queries/0_stateless/01502_jemalloc_percpu_arena.reference
@@ -1,5 +1,3 @@
-: Number of CPUs detected is not deterministic. Per-CPU arena disabled.
1
-: Number of CPUs detected is not deterministic. Per-CPU arena disabled.
100000000
1
diff --git a/tests/queries/0_stateless/01502_jemalloc_percpu_arena.sh b/tests/queries/0_stateless/01502_jemalloc_percpu_arena.sh
index b3ea6eca3f4..c1bd1e0e1fa 100755
--- a/tests/queries/0_stateless/01502_jemalloc_percpu_arena.sh
+++ b/tests/queries/0_stateless/01502_jemalloc_percpu_arena.sh
@@ -1,5 +1,5 @@
#!/usr/bin/env bash
-# Tags: no-tsan, no-asan, no-msan, no-ubsan, no-fasttest
+# Tags: no-tsan, no-asan, no-msan, no-ubsan, no-fasttest, no-debug
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# NOTE: jemalloc is disabled under sanitizers
diff --git a/tests/queries/0_stateless/01505_pipeline_executor_UAF.sh b/tests/queries/0_stateless/01505_pipeline_executor_UAF.sh
index c2750ad31b2..35c2b796570 100755
--- a/tests/queries/0_stateless/01505_pipeline_executor_UAF.sh
+++ b/tests/queries/0_stateless/01505_pipeline_executor_UAF.sh
@@ -14,7 +14,7 @@ for _ in {1..10}; do
${CLICKHOUSE_LOCAL} -q 'select * from numbers_mt(100000000) settings max_threads=100 FORMAT Null'
# Binding to specific CPU is not required, but this makes the test more reliable.
taskset --cpu-list 0 ${CLICKHOUSE_LOCAL} -q 'select * from numbers_mt(100000000) settings max_threads=100 FORMAT Null' 2>&1 | {
- # build with santiziers does not have jemalloc
+    # build with sanitizers does not have jemalloc
# and for jemalloc we have separate test
# 01502_jemalloc_percpu_arena
grep -v ': Number of CPUs detected is not deterministic. Per-CPU arena disabled.'
diff --git a/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sh b/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sh
index 45e65b18e07..b1d1c483396 100755
--- a/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sh
+++ b/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sh
@@ -1,5 +1,5 @@
#!/usr/bin/env bash
-# Tags: no-ordinary-database, use-rocksdb
+# Tags: no-ordinary-database, use-rocksdb, no-random-settings
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
@@ -45,4 +45,3 @@ ${CLICKHOUSE_CLIENT} --query "INSERT INTO rocksdb_worm SELECT number, number+1 F
${CLICKHOUSE_CLIENT} --query "INSERT INTO rocksdb_worm SELECT number, number+1 FROM numbers_mt(1000000)" &
wait
${CLICKHOUSE_CLIENT} --query "SELECT count() FROM rocksdb_worm;"
-
diff --git a/tests/queries/0_stateless/02963_test_flexible_disk_configuration.sql b/tests/queries/0_stateless/02963_test_flexible_disk_configuration.sql
index 8f67cd7e030..7ebef866360 100644
--- a/tests/queries/0_stateless/02963_test_flexible_disk_configuration.sql
+++ b/tests/queries/0_stateless/02963_test_flexible_disk_configuration.sql
@@ -22,7 +22,7 @@ create table test (a Int32) engine = MergeTree() order by tuple()
settings disk=disk(name='test2',
type = object_storage,
object_storage_type = s3,
- metadata_storage_type = local,
+ metadata_type = local,
endpoint = 'http://localhost:11111/test/common/',
access_key_id = clickhouse,
secret_access_key = clickhouse);
@@ -32,7 +32,7 @@ create table test (a Int32) engine = MergeTree() order by tuple()
settings disk=disk(name='test3',
type = object_storage,
object_storage_type = s3,
- metadata_storage_type = local,
+ metadata_type = local,
metadata_keep_free_space_bytes = 1024,
endpoint = 'http://localhost:11111/test/common/',
access_key_id = clickhouse,
@@ -43,7 +43,7 @@ create table test (a Int32) engine = MergeTree() order by tuple()
settings disk=disk(name='test4',
type = object_storage,
object_storage_type = s3,
- metadata_storage_type = local,
+ metadata_type = local,
metadata_keep_free_space_bytes = 0,
endpoint = 'http://localhost:11111/test/common/',
access_key_id = clickhouse,
diff --git a/utils/backup/backup b/utils/backup/backup
new file mode 100755
index 00000000000..6aa9c179033
--- /dev/null
+++ b/utils/backup/backup
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+user="default"
+path="."
+
+usage() {
+ echo
+ echo "A trivial script to upload your files into ClickHouse."
+ echo "You might want to use something like Dropbox instead, but..."
+ echo
+    echo "Usage: $0 --host <hostname> [--user <username>] --password <password> <path>"
+ exit 1
+}
+
+while [[ "$#" -gt 0 ]]; do
+ case "$1" in
+ --host)
+ host="$2"
+ shift 2
+ ;;
+ --user)
+ user="$2"
+ shift 2
+ ;;
+ --password)
+ password="$2"
+ shift 2
+ ;;
+ --help)
+ usage
+ ;;
+ *)
+ path="$1"
+ shift 1
+ ;;
+ esac
+done
+
+if [ -z "$host" ] || [ -z "$password" ]; then
+ echo "Error: --host and --password are mandatory."
+ usage
+fi
+
+clickhouse-client --host "$host" --user "$user" --password "$password" --secure --query "CREATE TABLE IF NOT EXISTS default.files (time DEFAULT now(), path String, content String CODEC(ZSTD(6))) ENGINE = MergeTree ORDER BY (path, time)" &&
+find "$path" -type f | clickhouse-local --input-format LineAsString \
+ --max-block-size 1 --min-insert-block-size-rows 0 --min-insert-block-size-bytes '100M' --max-insert-threads 1 \
+ --query "INSERT INTO FUNCTION remoteSecure('$host', default.files, '$user', '$password') (path, content) SELECT line, file(line) FROM table" --progress
diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt
index 5c3991ed293..229eccefa48 100644
--- a/utils/check-style/aspell-ignore/en/aspell-dict.txt
+++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt
@@ -575,6 +575,7 @@ MySQLDump
MySQLThreads
NATS
NCHAR
+NDJSON
NEKUDOTAYIM
NEWDATE
NEWDECIMAL
@@ -717,6 +718,8 @@ PlantUML
PointDistKm
PointDistM
PointDistRads
+PostHistory
+PostLink
PostgreSQLConnection
PostgreSQLThreads
Postgres
@@ -2516,6 +2519,7 @@ sqlite
sqrt
src
srcReplicas
+stackoverflow
stacktrace
stacktraces
startsWith
@@ -2854,6 +2858,7 @@ userver
utils
uuid
uuidv
+vCPU
varPop
varPopStable
varSamp