Merge branch 'master' of github.com:ClickHouse/ClickHouse

This commit is contained in:
alexX512 2023-08-29 13:37:58 +00:00
commit 2670af53df
16 changed files with 229 additions and 83 deletions

.gitmodules

@ -13,7 +13,6 @@
[submodule "contrib/zlib-ng"]
path = contrib/zlib-ng
url = https://github.com/ClickHouse/zlib-ng
branch = clickhouse-2.0.x
[submodule "contrib/googletest"]
path = contrib/googletest
url = https://github.com/google/googletest
@ -47,7 +46,6 @@
[submodule "contrib/arrow"]
path = contrib/arrow
url = https://github.com/ClickHouse/arrow
branch = blessed/release-6.0.1
[submodule "contrib/thrift"]
path = contrib/thrift
url = https://github.com/apache/thrift
@ -93,7 +91,6 @@
[submodule "contrib/grpc"]
path = contrib/grpc
url = https://github.com/ClickHouse/grpc
branch = v1.33.2
[submodule "contrib/aws"]
path = contrib/aws
url = https://github.com/ClickHouse/aws-sdk-cpp
@ -140,11 +137,9 @@
[submodule "contrib/cassandra"]
path = contrib/cassandra
url = https://github.com/ClickHouse/cpp-driver
branch = clickhouse
[submodule "contrib/libuv"]
path = contrib/libuv
url = https://github.com/ClickHouse/libuv
branch = clickhouse
[submodule "contrib/fmtlib"]
path = contrib/fmtlib
url = https://github.com/fmtlib/fmt
@ -157,11 +152,9 @@
[submodule "contrib/cyrus-sasl"]
path = contrib/cyrus-sasl
url = https://github.com/ClickHouse/cyrus-sasl
branch = cyrus-sasl-2.1
[submodule "contrib/croaring"]
path = contrib/croaring
url = https://github.com/RoaringBitmap/CRoaring
branch = v0.2.66
[submodule "contrib/miniselect"]
path = contrib/miniselect
url = https://github.com/danlark1/miniselect
@ -174,7 +167,6 @@
[submodule "contrib/abseil-cpp"]
path = contrib/abseil-cpp
url = https://github.com/abseil/abseil-cpp
branch = lts_2021_11_02
[submodule "contrib/dragonbox"]
path = contrib/dragonbox
url = https://github.com/ClickHouse/dragonbox
@ -187,7 +179,6 @@
[submodule "contrib/boringssl"]
path = contrib/boringssl
url = https://github.com/ClickHouse/boringssl
branch = unknown_branch_from_artur
[submodule "contrib/NuRaft"]
path = contrib/NuRaft
url = https://github.com/ClickHouse/NuRaft
@ -248,7 +239,6 @@
[submodule "contrib/annoy"]
path = contrib/annoy
url = https://github.com/ClickHouse/annoy
branch = ClickHouse-master
[submodule "contrib/qpl"]
path = contrib/qpl
url = https://github.com/intel/qpl
@ -282,7 +272,6 @@
[submodule "contrib/openssl"]
path = contrib/openssl
url = https://github.com/openssl/openssl
branch = openssl-3.0
[submodule "contrib/google-benchmark"]
path = contrib/google-benchmark
url = https://github.com/google/benchmark

contrib/libpqxx

@ -1 +1 @@
Subproject commit bdd6540fb95ff56c813691ceb5da5a3266cf235d
Subproject commit 791d68fd89902835133c50435e380ec7a73271b7


@ -1,23 +1,21 @@
# Laion-400M dataset
The dataset contains 400 million images with English text. For more information follow this [link](https://laion.ai/blog/laion-400-open-dataset/). Laion provides even larger datasets (e.g. [5 billion](https://laion.ai/blog/laion-5b/)). Working with them will be similar.
The [Laion-400M dataset](https://laion.ai/blog/laion-400-open-dataset/) contains 400 million images with English image captions. Laion nowadays also provides [an even larger dataset](https://laion.ai/blog/laion-5b/), but working with it is similar.
The dataset has prepared embeddings for texts and images. This will be used to demonstrate [Approximate nearest neighbor search indexes](../../engines/table-engines/mergetree-family/annindexes.md).
The dataset contains the image URL, embeddings for both the image and the image caption, a similarity score between the image and the image caption, as well as metadata, e.g. the image width/height, the licence and an NSFW flag. We can use the dataset to demonstrate [approximate nearest neighbor search](../../engines/table-engines/mergetree-family/annindexes.md) in ClickHouse.
## Prepare data
## Data preparation
Embeddings are stored in `.npy` files, so we have to read them with python and merge with other data.
Download data and process it with simple `download.sh` script:
The embeddings and the metadata are stored in separate files in the raw data. A data preparation step downloads the data, merges the files,
converts them to CSV and imports them into ClickHouse. You can use the following `download.sh` script for that:
```bash
wget --tries=100 https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/img_emb/img_emb_${1}.npy
wget --tries=100 https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/metadata/metadata_${1}.parquet
wget --tries=100 https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/text_emb/text_emb_${1}.npy
python3 process.py ${1}
wget --tries=100 https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/img_emb/img_emb_${1}.npy # download image embedding
wget --tries=100 https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/text_emb/text_emb_${1}.npy # download text embedding
wget --tries=100 https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/metadata/metadata_${1}.parquet # download metadata
python3 process.py ${1} # merge files and convert to CSV
```
Where `process.py`:
Script `process.py` is defined as follows:
```python
import pandas as pd
@ -35,11 +33,11 @@ im_emb = np.load(npy_file)
text_emb = np.load(text_npy)
data = pd.read_parquet(metadata_file)
# combine them
# combine files
data = pd.concat([data, pd.DataFrame({"image_embedding" : [*im_emb]}), pd.DataFrame({"text_embedding" : [*text_emb]})], axis=1, copy=False)
# you can save more columns
data = data[['url', 'caption', 'similarity', "image_embedding", "text_embedding"]]
# columns to be imported into ClickHouse
data = data[['url', 'caption', 'NSFW', 'similarity', "image_embedding", "text_embedding"]]
# transform np.arrays to lists
data['image_embedding'] = data['image_embedding'].apply(lambda x: list(x))
@ -48,30 +46,32 @@ data['text_embedding'] = data['text_embedding'].apply(lambda x: list(x))
# this small hack is needed because captions sometimes contain all kinds of quotes
data['caption'] = data['caption'].apply(lambda x: x.replace("'", " ").replace('"', " "))
# save data to file
# export data as CSV file
data.to_csv(str_i + '.csv', header=False)
# previous files can be removed
# remove raw data files
os.system(f"rm {npy_file} {metadata_file} {text_npy}")
```
You can download data with
To start the data preparation pipeline, run:
```bash
seq 0 409 | xargs -P100 -I{} bash -c './download.sh {}'
```
The dataset is divided into 409 files. If you want to work only with a certain part of the dataset, just change the limits.
The dataset is split into 410 files, each containing ca. 1 million rows. If you would like to work with a smaller subset of the data, simply adjust the limits, e.g. `seq 0 9 | ...`.
## Create table for laion
## Create table
Without indexes table can be created by
To create a table without indexes, run:
```sql
CREATE TABLE laion_dataset
CREATE TABLE laion
(
`id` Int64,
`url` String,
`caption` String,
`NSFW` String,
`similarity` Float32,
`image_embedding` Array(Float32),
`text_embedding` Array(Float32)
@ -81,23 +81,23 @@ ORDER BY id
SETTINGS index_granularity = 8192
```
Fill table with data:
To import the CSV files into ClickHouse:
```sql
INSERT INTO laion_dataset FROM INFILE '{path_to_csv_files}/*.csv'
INSERT INTO laion FROM INFILE '{path_to_csv_files}/*.csv'
```
## Check data in table without indexes
## Run a brute-force ANN search (without ANN index)
Let's check the work of the following query on the part of the dataset (8 million records):
To run a brute-force approximate nearest neighbor search, execute:
```sql
select url, caption from test_laion where similarity > 0.2 order by L2Distance(image_embedding, {target:Array(Float32)}) limit 30
SELECT url, caption FROM laion WHERE similarity > 0.2 ORDER BY L2Distance(image_embedding, {target:Array(Float32)}) LIMIT 30
```
Since the embeddings for images and texts may not match, let's also require a certain threshold of matching accuracy to get images that are more likely to satisfy our queries. The client parameter `target`, which is an array of 512 elements. See later in this article for a convenient way of obtaining such vectors. I used a random picture of a cat from the Internet as a target vector.
The filter on `similarity` makes sure that the images correspond to the image captions in the query results. `target` is a client parameter, an array of 512 elements. A convenient way to obtain such arrays will be presented at the end of the article. For now, we can use the embedding of a random cat picture as `target`.
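As an illustration of how such a parameter can be passed, here is a sketch using `clickhouse-client`; the vector is shortened to three placeholder values, whereas a real embedding has 512 elements:

```bash
# Sketch only: the three values are placeholders, a real target vector has 512 floats
# (e.g. produced by the UDFs shown at the end of this article).
clickhouse-client --param_target="[0.017, -0.023, 0.031]" --query "
    SELECT url, caption
    FROM laion
    WHERE similarity > 0.2
    ORDER BY L2Distance(image_embedding, {target:Array(Float32)})
    LIMIT 30"
```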
**The result**
**Result**
```
┌─url───────────────────────────────────────────────────────────────────────────────────────────────────────────┬─caption────────────────────────────────────────────────────────────────┐
@ -114,32 +114,32 @@ Since the embeddings for images and texts may not match, let's also require a ce
8 rows in set. Elapsed: 6.432 sec. Processed 19.65 million rows, 43.96 GB (3.06 million rows/s., 6.84 GB/s.)
```
## Add indexes
## Run an ANN search with an ANN index
Create a new table or follow instructions from [alter documentation](../../sql-reference/statements/alter/skipping-index.md).
Either create a new table or use [ALTER TABLE ADD INDEX](../../sql-reference/statements/alter/skipping-index.md) to add an ANN index:
```sql
CREATE TABLE laion_dataset
CREATE TABLE laion
(
`id` Int64,
`url` String,
`caption` String,
`NSFW` String,
`similarity` Float32,
`image_embedding` Array(Float32),
`text_embedding` Array(Float32),
INDEX annoy_image image_embedding TYPE annoy(1000) GRANULARITY 1000,
INDEX annoy_text text_embedding TYPE annoy(1000) GRANULARITY 1000
INDEX annoy_image image_embedding TYPE annoy(1000),
INDEX annoy_text text_embedding TYPE annoy(1000)
)
ENGINE = MergeTree
ORDER BY id
SETTINGS index_granularity = 8192
```
When created, the index will be built by L2Distance. You can read more about the parameters in the [annoy documentation](../../engines/table-engines/mergetree-family/annindexes.md#annoy-annoy). It makes sense to build indexes for a large number of granules. If you need good speed, then GRANULARITY should be several times larger than the expected number of results in the search.
Now let's check again with the same query:
By default, Annoy indexes use the L2 distance as their metric. Further tuning knobs for index creation and search are described in the Annoy index [documentation](../../engines/table-engines/mergetree-family/annindexes.md). Let's now check again with the same query:
```sql
select url, caption from test_indexes_laion where similarity > 0.2 order by L2Distance(image_embedding, {target:Array(Float32)}) limit 8
SELECT url, caption FROM laion WHERE similarity > 0.2 ORDER BY L2Distance(image_embedding, {target:Array(Float32)}) LIMIT 8
```
**Result**
@ -159,15 +159,18 @@ select url, caption from test_indexes_laion where similarity > 0.2 order by L2Di
8 rows in set. Elapsed: 0.641 sec. Processed 22.06 thousand rows, 49.36 MB (91.53 thousand rows/s., 204.81 MB/s.)
```
The speed has increased significantly. But now, the results sometimes differ from what you are looking for. This is due to the approximation of the search and the quality of the constructed embedding. Note that the example was given for picture embeddings, but there are also text embeddings in the dataset, which can also be used for searching.
The speed increased significantly at the cost of less accurate results. This is because the ANN index only provides approximate search results. Note that the example searched for similar image embeddings, yet it is also possible to search for matching image caption embeddings.
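For example, to search the caption embeddings instead, only the column inside the distance function changes; a sketch with the same `target` parameter as before:

```sql
SELECT url, caption
FROM laion
WHERE similarity > 0.2
ORDER BY L2Distance(text_embedding, {target:Array(Float32)})
LIMIT 8
```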
## Scripts for embeddings
## Creating embeddings with UDFs
Usually, we do not want to get embeddings from existing data, but to get them for new data and look for similar ones in old data. We can use [UDF](../../sql-reference/functions/index.md#sql-user-defined-functions) for this purpose. They will allow you to set the `target` vector without leaving the client. All of the following scripts will be written for the `ViT-B/32` model, as it was used for this dataset. You can use any model, but it is necessary to build embeddings in the dataset and for new objects using the same model.
One usually wants to create embeddings for new images or new image captions and search for similar image / image caption pairs in the data. We can use a [UDF](../../sql-reference/functions/index.md#sql-user-defined-functions) to create the `target` vector without leaving the client. It is important to use the same model for the embeddings stored in the dataset and for the new embeddings created for searches. The following scripts utilize the `ViT-B/32` model, which also underlies the dataset.
### Text embeddings
First, store the following Python script in the `user_scripts/` directory of your ClickHouse data path and make it executable (`chmod +x encode_text.py`).
`encode_text.py`:
```python
#!/usr/bin/python3
import clip
@ -182,10 +185,12 @@ if __name__ == '__main__':
inputs = clip.tokenize(text)
with torch.no_grad():
text_features = model.encode_text(inputs)[0].tolist()
print(text_features)
sys.stdout.flush()
```
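A possible way to carry out the previous step, assuming a default package installation where the ClickHouse data path is `/var/lib/clickhouse/` (adjust the path for your setup; the `clip` and `torch` Python packages must be available to the user running the ClickHouse server):

```bash
# Assumed default data path; adjust for your installation.
sudo cp encode_text.py /var/lib/clickhouse/user_scripts/
sudo chmod +x /var/lib/clickhouse/user_scripts/encode_text.py
```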
`encode_text_function.xml`:
Then create `encode_text_function.xml` in a location referenced by `<user_defined_executable_functions_config>/path/to/*_function.xml</user_defined_executable_functions_config>` in your ClickHouse server configuration file.
```xml
<functions>
<function>
@ -203,19 +208,19 @@ if __name__ == '__main__':
</functions>
```
Now we can simply use:
You can now simply use:
```sql
SELECT encode_text('cat');
```
The first use will be slow because the model needs to be loaded. But repeated queries will be fast. Then we copy the results to ``set param_target=...`` and can easily write queries
The first run will be slow because it loads the model, but repeated runs will be fast. We can then copy the output into `SET param_target=...` and easily write queries.
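A sketch of that workflow, with the vector shortened to three placeholder values (a real `encode_text()` output has 512 elements); since `ViT-B/32` embeds texts and images into the same space, the text embedding can be compared against the stored image embeddings:

```sql
-- Placeholder values: paste the full 512-element output of encode_text('cat') here.
SET param_target = [0.017, -0.023, 0.031];

SELECT url, caption
FROM laion
WHERE similarity > 0.2
ORDER BY L2Distance(image_embedding, {target:Array(Float32)})
LIMIT 8
```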
### Image embeddings
For pictures, the process is similar, but you send the path instead of the picture (if necessary, you can implement a download picture with processing, but it will take longer)
Image embeddings can be created similarly, but we provide the Python script with the path to a local image instead of the image caption text.
`encode_image.py`
`encode_picture.py`
```python
#!/usr/bin/python3
import clip
@ -231,29 +236,31 @@ if __name__ == '__main__':
image = preprocess(Image.open(text.strip())).unsqueeze(0).to(device)
with torch.no_grad():
image_features = model.encode_image(image)[0].tolist()
print(image_features)
print(image_features)
sys.stdout.flush()
```
`encode_picture_function.xml`
`encode_image_function.xml`
```xml
<functions>
<function>
<type>executable_pool</type>
<name>encode_picture</name>
<name>encode_image</name>
<return_type>Array(Float32)</return_type>
<argument>
<type>String</type>
<name>path</name>
</argument>
<format>TabSeparated</format>
<command>encode_picture.py</command>
<command>encode_image.py</command>
<command_read_timeout>1000000</command_read_timeout>
</function>
</functions>
```
The query:
Then run this query:
```sql
SELECT encode_picture('some/path/to/your/picture');
SELECT encode_image('/path/to/your/image');
```
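Putting the pieces together, a sketch of an end-to-end search for images similar to a local picture (the path is a placeholder); for repeated searches it may be preferable to compute the embedding once and pass it via `param_target` as shown above:

```sql
-- The UDF returns the 512-element embedding of the local image,
-- which is then compared against the stored image embeddings.
WITH encode_image('/path/to/your/image') AS target
SELECT url, caption
FROM laion
WHERE similarity > 0.2
ORDER BY L2Distance(image_embedding, target)
LIMIT 8
```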


@ -47,6 +47,8 @@ public:
void tryUpdateConnection();
bool isConnected() const { return connection != nullptr && connection->is_open(); }
const ConnectionInfo & getConnectionInfo() { return connection_info; }
String getInfoForLog() const { return connection_info.host_port; }


@ -28,10 +28,25 @@ public:
ConnectionHolder(const ConnectionHolder & other) = delete;
void setBroken() { is_broken = true; }
~ConnectionHolder()
{
if (auto_close)
{
connection.reset();
}
else if (is_broken)
{
try
{
connection->getRef().reset();
}
catch (...)
{
connection.reset();
}
}
pool->returnObject(std::move(connection));
}
@ -49,6 +64,7 @@ private:
PoolPtr pool;
ConnectionPtr connection;
bool auto_close;
bool is_broken = false;
};
using ConnectionHolderPtr = std::unique_ptr<ConnectionHolder>;


@ -86,4 +86,10 @@ std::shared_ptr<ReadBuffer> WriteBufferToFileSegment::getReadBufferImpl()
return std::make_shared<ReadBufferFromFile>(file_segment->getPathInLocalCache());
}
WriteBufferToFileSegment::~WriteBufferToFileSegment()
{
/// To be sure that the file exists before the destructor of segment_holder is called
WriteBufferFromFileDecorator::finalize();
}
}


@ -16,6 +16,7 @@ public:
explicit WriteBufferToFileSegment(FileSegmentsHolderPtr segment_holder);
void nextImpl() override;
~WriteBufferToFileSegment() override;
private:


@ -59,7 +59,6 @@ PostgreSQLSource<T>::PostgreSQLSource(
init(sample_block);
}
template<typename T>
void PostgreSQLSource<T>::init(const Block & sample_block)
{
@ -82,7 +81,8 @@ void PostgreSQLSource<T>::onStart()
{
try
{
tx = std::make_shared<T>(connection_holder->get());
auto & conn = connection_holder->get();
tx = std::make_shared<T>(conn);
}
catch (const pqxx::broken_connection &)
{
@ -180,6 +180,27 @@ void PostgreSQLSource<T>::onFinish()
if (tx && auto_commit)
tx->commit();
is_completed = true;
}
template<typename T>
PostgreSQLSource<T>::~PostgreSQLSource()
{
if (!is_completed)
{
try
{
stream.reset();
tx.reset();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
connection_holder->setBroken();
}
}
template


@ -28,6 +28,8 @@ public:
String getName() const override { return "PostgreSQL"; }
~PostgreSQLSource() override;
protected:
PostgreSQLSource(
std::shared_ptr<T> tx_,
@ -54,6 +56,7 @@ private:
ExternalResultDescription description;
bool started = false;
bool is_completed = false;
postgres::ConnectionHolderPtr connection_holder;


@ -119,7 +119,11 @@ MergeTreeSequentialSource::MergeTreeSequentialSource(
addTotalRowsApprox(data_part->rows_count);
/// Add columns because we don't want to read empty blocks
injectRequiredColumns(LoadedMergeTreeDataPartInfoForReader(data_part, alter_conversions), storage_snapshot, /*with_subcolumns=*/ false, columns_to_read);
injectRequiredColumns(
LoadedMergeTreeDataPartInfoForReader(data_part, alter_conversions),
storage_snapshot,
storage.supportsSubcolumns(),
columns_to_read);
NamesAndTypesList columns_for_reader;
if (take_column_types_from_storage)
@ -127,6 +131,8 @@ MergeTreeSequentialSource::MergeTreeSequentialSource(
auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical)
.withExtendedObjects()
.withSystemColumns();
if (storage.supportsSubcolumns())
options.withSubcolumns();
columns_for_reader = storage_snapshot->getColumnsByNames(options, columns_to_read);
}
else


@ -89,6 +89,8 @@ public:
bool supportsDynamicSubcolumns() const override { return true; }
bool supportsSubcolumns() const override { return true; }
bool mayBenefitFromIndexForIn(
const ASTPtr & left_in_operand, ContextPtr query_context, const StorageMetadataPtr & metadata_snapshot) const override
{
@ -110,6 +112,12 @@ public:
return storage.getPartitionIDFromQuery(ast, context);
}
StorageSnapshotPtr getStorageSnapshotForQuery(
const StorageMetadataPtr & metadata_snapshot, const ASTPtr & /*query*/, ContextPtr query_context) const override
{
return storage.getStorageSnapshot(metadata_snapshot, query_context);
}
bool materializeTTLRecalculateOnly() const
{
if (parts.empty())


@ -618,6 +618,9 @@ class SettingsRandomizer:
"America/Mazatlan",
"America/Hermosillo",
"Mexico/BajaSur",
# These timezones had DST transitions on some unusual dates (e.g. 2000-01-15 12:00:00).
"Africa/Khartoum",
"Africa/Juba",
# server default that is randomized across all timezones
# NOTE: due to lots of trickery we cannot use empty timezone here, but this should be the same.
get_localzone(),


@ -9,7 +9,6 @@ import os
import random
import re
import shutil
import string
import subprocess
import time
import shlex
@ -431,12 +430,19 @@ class ClickhouseIntegrationTestsRunner:
def _get_all_tests(self, repo_path):
image_cmd = self._get_runner_image_cmd(repo_path)
runner_opts = self._get_runner_opts()
out_file = "all_tests.txt"
out_file_full = os.path.join(self.result_path, "runner_get_all_tests.log")
cmd = (
f"cd {repo_path}/tests/integration && "
f"timeout --signal=KILL 1h ./runner {runner_opts} {image_cmd} -- --setup-plan "
f"| tee '{out_file_full}'"
"cd {repo_path}/tests/integration && "
"timeout --signal=KILL 1h ./runner {runner_opts} {image_cmd} -- --setup-plan "
"| tee '{out_file_full}' | grep -F '::' | sed -r 's/ \(fixtures used:.*//g; s/^ *//g; s/ *$//g' "
"| grep -v -F 'SKIPPED' | sort --unique > {out_file}".format(
repo_path=repo_path,
runner_opts=self._get_runner_opts(),
image_cmd=image_cmd,
out_file=out_file,
out_file_full=out_file_full,
)
)
logging.info("Getting all tests with cmd '%s'", cmd)
@ -444,19 +450,34 @@ class ClickhouseIntegrationTestsRunner:
cmd, shell=True
)
all_tests = set()
with open(out_file_full, "r", encoding="utf-8") as all_tests_fd:
for line in all_tests_fd:
if (
line[0] in string.whitespace # test names at the start of lines
or "::test" not in line # test names contain '::test'
or "SKIPPED" in line # pytest.mark.skip/-if
):
continue
all_tests.add(line.strip())
all_tests_file_path = "{repo_path}/tests/integration/{out_file}".format(
repo_path=repo_path, out_file=out_file
)
if (
not os.path.isfile(all_tests_file_path)
or os.path.getsize(all_tests_file_path) == 0
):
if os.path.isfile(out_file_full):
# log runner output
logging.info("runner output:")
with open(out_file_full, "r") as all_tests_full_file:
for line in all_tests_full_file:
line = line.rstrip()
if line:
logging.info("runner output: %s", line)
else:
logging.info("runner output '%s' is empty", out_file_full)
assert all_tests
raise Exception(
"There is something wrong with getting all tests list: file '{}' is empty or does not exist.".format(
all_tests_file_path
)
)
all_tests = []
with open(all_tests_file_path, "r") as all_tests_file:
for line in all_tests_file:
all_tests.append(line.strip())
return list(sorted(all_tests))
def _get_parallel_tests_skip_list(self, repo_path):


@ -1,3 +1,5 @@
SET session_timezone = 'Etc/UTC';
SELECT toDateTime('2017-10-30 08:18:19') + INTERVAL 1 DAY + INTERVAL 1 MONTH - INTERVAL 1 YEAR;
SELECT toDateTime('2017-10-30 08:18:19') + INTERVAL 1 HOUR + INTERVAL 1000 MINUTE + INTERVAL 10 SECOND;
SELECT toDateTime('2017-10-30 08:18:19') + INTERVAL 1 DAY + INTERVAL number MONTH FROM system.numbers LIMIT 20;


@ -0,0 +1,10 @@
6 1
5 2
4 3
3 4
4 ttt
5 ttt
6 ttt
{"a":"1","obj":{"k1":1,"k2":0,"k3":0}}
{"a":"3","obj":{"k1":0,"k2":0,"k3":1}}
{"a":"1","obj":{"k1":1,"k2":0,"k3":0}}


@ -0,0 +1,51 @@
-- Tags: no-replicated-database
-- It won't work if there are missing subcolumns in different shards
DROP TABLE IF EXISTS t_mutations_subcolumns;
SET allow_experimental_object_type = 1;
CREATE TABLE t_mutations_subcolumns (id UInt64, n String, obj JSON)
ENGINE = MergeTree ORDER BY id;
INSERT INTO t_mutations_subcolumns VALUES (1, 'aaa', '{"k1": {"k2": "foo"}, "k3": 5}');
INSERT INTO t_mutations_subcolumns VALUES (2, 'bbb', '{"k1": {"k2": "fee"}, "k3": 4}');
INSERT INTO t_mutations_subcolumns VALUES (3, 'ccc', '{"k1": {"k2": "foo", "k4": "baz"}, "k3": 4}');
INSERT INTO t_mutations_subcolumns VALUES (4, 'ddd', '{"k1": {"k2": "foo"}, "k3": 4}');
INSERT INTO t_mutations_subcolumns VALUES (5, 'eee', '{"k1": {"k2": "foo"}, "k3": 4}');
INSERT INTO t_mutations_subcolumns VALUES (6, 'fff', '{"k1": {"k2": "foo"}, "k3": 4}');
OPTIMIZE TABLE t_mutations_subcolumns FINAL;
SELECT count(), min(id) FROM t_mutations_subcolumns;
SET mutations_sync = 2;
ALTER TABLE t_mutations_subcolumns DELETE WHERE obj.k3 = 5;
SELECT count(), min(id) FROM t_mutations_subcolumns;
DELETE FROM t_mutations_subcolumns WHERE obj.k1.k2 = 'fee';
SELECT count(), min(id) FROM t_mutations_subcolumns;
ALTER TABLE t_mutations_subcolumns DELETE WHERE obj.k1 = ('foo', 'baz');
SELECT count(), min(id) FROM t_mutations_subcolumns;
ALTER TABLE t_mutations_subcolumns UPDATE n = 'ttt' WHERE obj.k1.k2 = 'foo';
SELECT id, n FROM t_mutations_subcolumns;
DROP TABLE IF EXISTS t_mutations_subcolumns;
CREATE TABLE t_mutations_subcolumns (a UInt64, obj JSON)
ENGINE = MergeTree ORDER BY a PARTITION BY a;
INSERT INTO t_mutations_subcolumns VALUES (1, '{"k1": 1}');
INSERT INTO t_mutations_subcolumns VALUES (2, '{"k2": 1}');
INSERT INTO t_mutations_subcolumns VALUES (3, '{"k3": 1}');
ALTER TABLE t_mutations_subcolumns DELETE WHERE obj.k2 = 1;
SELECT * FROM t_mutations_subcolumns ORDER BY a FORMAT JSONEachRow;
ALTER TABLE t_mutations_subcolumns DELETE WHERE obj.k1 = 0;
SELECT * FROM t_mutations_subcolumns ORDER BY a FORMAT JSONEachRow;
DROP TABLE t_mutations_subcolumns;