Commit 2670af53df: Merge branch 'master' of github.com:ClickHouse/ClickHouse
.gitmodules (vendored): 11 lines changed
@@ -13,7 +13,6 @@
 [submodule "contrib/zlib-ng"]
 path = contrib/zlib-ng
 url = https://github.com/ClickHouse/zlib-ng
-branch = clickhouse-2.0.x
 [submodule "contrib/googletest"]
 path = contrib/googletest
 url = https://github.com/google/googletest
@@ -47,7 +46,6 @@
 [submodule "contrib/arrow"]
 path = contrib/arrow
 url = https://github.com/ClickHouse/arrow
-branch = blessed/release-6.0.1
 [submodule "contrib/thrift"]
 path = contrib/thrift
 url = https://github.com/apache/thrift
@@ -93,7 +91,6 @@
 [submodule "contrib/grpc"]
 path = contrib/grpc
 url = https://github.com/ClickHouse/grpc
-branch = v1.33.2
 [submodule "contrib/aws"]
 path = contrib/aws
 url = https://github.com/ClickHouse/aws-sdk-cpp
@@ -140,11 +137,9 @@
 [submodule "contrib/cassandra"]
 path = contrib/cassandra
 url = https://github.com/ClickHouse/cpp-driver
-branch = clickhouse
 [submodule "contrib/libuv"]
 path = contrib/libuv
 url = https://github.com/ClickHouse/libuv
-branch = clickhouse
 [submodule "contrib/fmtlib"]
 path = contrib/fmtlib
 url = https://github.com/fmtlib/fmt
@@ -157,11 +152,9 @@
 [submodule "contrib/cyrus-sasl"]
 path = contrib/cyrus-sasl
 url = https://github.com/ClickHouse/cyrus-sasl
-branch = cyrus-sasl-2.1
 [submodule "contrib/croaring"]
 path = contrib/croaring
 url = https://github.com/RoaringBitmap/CRoaring
-branch = v0.2.66
 [submodule "contrib/miniselect"]
 path = contrib/miniselect
 url = https://github.com/danlark1/miniselect
@@ -174,7 +167,6 @@
 [submodule "contrib/abseil-cpp"]
 path = contrib/abseil-cpp
 url = https://github.com/abseil/abseil-cpp
-branch = lts_2021_11_02
 [submodule "contrib/dragonbox"]
 path = contrib/dragonbox
 url = https://github.com/ClickHouse/dragonbox
@@ -187,7 +179,6 @@
 [submodule "contrib/boringssl"]
 path = contrib/boringssl
 url = https://github.com/ClickHouse/boringssl
-branch = unknown_branch_from_artur
 [submodule "contrib/NuRaft"]
 path = contrib/NuRaft
 url = https://github.com/ClickHouse/NuRaft
@@ -248,7 +239,6 @@
 [submodule "contrib/annoy"]
 path = contrib/annoy
 url = https://github.com/ClickHouse/annoy
-branch = ClickHouse-master
 [submodule "contrib/qpl"]
 path = contrib/qpl
 url = https://github.com/intel/qpl
@@ -282,7 +272,6 @@
 [submodule "contrib/openssl"]
 path = contrib/openssl
 url = https://github.com/openssl/openssl
-branch = openssl-3.0
 [submodule "contrib/google-benchmark"]
 path = contrib/google-benchmark
 url = https://github.com/google/benchmark
contrib/libpqxx (vendored): 2 changes

@@ -1 +1 @@
-Subproject commit bdd6540fb95ff56c813691ceb5da5a3266cf235d
+Subproject commit 791d68fd89902835133c50435e380ec7a73271b7
@@ -1,23 +1,21 @@
 # Laion-400M dataset
 
-The dataset contains 400 million images with English text. For more information follow this [link](https://laion.ai/blog/laion-400-open-dataset/). Laion provides even larger datasets (e.g. [5 billion](https://laion.ai/blog/laion-5b/)). Working with them will be similar.
+The [Laion-400M dataset](https://laion.ai/blog/laion-400-open-dataset/) contains 400 million images with English image captions. Laion nowadays provides [an even larger dataset](https://laion.ai/blog/laion-5b/), but working with it is similar.
 
-The dataset has prepared embeddings for texts and images. This will be used to demonstrate [Approximate nearest neighbor search indexes](../../engines/table-engines/mergetree-family/annindexes.md).
+The dataset contains the image URL, embeddings for both the image and the image caption, a similarity score between the image and the image caption, as well as metadata, e.g. the image width/height, the licence and an NSFW flag. We can use the dataset to demonstrate [approximate nearest neighbor search](../../engines/table-engines/mergetree-family/annindexes.md) in ClickHouse.
 
-## Prepare data
+## Data preparation
 
-Embeddings are stored in `.npy` files, so we have to read them with python and merge with other data.
-
-Download data and process it with simple `download.sh` script:
+The embeddings and the metadata are stored in separate files in the raw data. A data preparation step downloads the data, merges the files,
+converts them to CSV and imports them into ClickHouse. You can use the following `download.sh` script for that:
 
 ```bash
-wget --tries=100 https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/img_emb/img_emb_${1}.npy
-wget --tries=100 https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/metadata/metadata_${1}.parquet
-wget --tries=100 https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/text_emb/text_emb_${1}.npy
-python3 process.py ${1}
+wget --tries=100 https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/img_emb/img_emb_${1}.npy        # download image embeddings
+wget --tries=100 https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/text_emb/text_emb_${1}.npy      # download text embeddings
+wget --tries=100 https://deploy.laion.ai/8f83b608504d46bb81708ec86e912220/embeddings/metadata/metadata_${1}.parquet  # download metadata
+python3 process.py ${1}                                                                                              # merge files and convert to CSV
 ```
 
-Where `process.py`:
+The script `process.py` is defined as follows:
 
 ```python
 import pandas as pd
@@ -35,11 +33,11 @@ im_emb = np.load(npy_file)
 text_emb = np.load(text_npy)
 data = pd.read_parquet(metadata_file)
 
-# combine them
+# combine files
 data = pd.concat([data, pd.DataFrame({"image_embedding" : [*im_emb]}), pd.DataFrame({"text_embedding" : [*text_emb]})], axis=1, copy=False)
 
-# you can save more columns
-data = data[['url', 'caption', 'similarity', "image_embedding", "text_embedding"]]
+# columns to be imported into ClickHouse
+data = data[['url', 'caption', 'NSFW', 'similarity', "image_embedding", "text_embedding"]]
 
 # transform np.arrays to lists
 data['image_embedding'] = data['image_embedding'].apply(lambda x: list(x))
@@ -48,30 +46,32 @@ data['text_embedding'] = data['text_embedding'].apply(lambda x: list(x))
 # this small hack is needed because the caption sometimes contains all kinds of quotes
 data['caption'] = data['caption'].apply(lambda x: x.replace("'", " ").replace('"', " "))
 
-# save data to file
+# export data as a CSV file
 data.to_csv(str_i + '.csv', header=False)
 
-# previous files can be removed
+# remove the raw data files
 os.system(f"rm {npy_file} {metadata_file} {text_npy}")
 ```
 
-You can download data with
+To start the data preparation pipeline, run:
 
 ```bash
 seq 0 409 | xargs -P100 -I{} bash -c './download.sh {}'
 ```
 
-The dataset is divided into 409 files. If you want to work only with a certain part of the dataset, just change the limits.
+The dataset is split into 410 files, each containing ca. 1 million rows. To work with a smaller subset of the data, simply adjust the limits, e.g. `seq 0 9 | ...`.
 
-## Create table for laion
+## Create table
 
-Without indexes table can be created by
+To create a table without indexes, run:
 
 ```sql
-CREATE TABLE laion_dataset
+CREATE TABLE laion
 (
     `id` Int64,
     `url` String,
     `caption` String,
     `NSFW` String,
     `similarity` Float32,
     `image_embedding` Array(Float32),
     `text_embedding` Array(Float32)
@@ -81,23 +81,23 @@ ORDER BY id
 SETTINGS index_granularity = 8192
 ```
 
-Fill table with data:
+To import the CSV files into ClickHouse:
 
 ```sql
-INSERT INTO laion_dataset FROM INFILE '{path_to_csv_files}/*.csv'
+INSERT INTO laion FROM INFILE '{path_to_csv_files}/*.csv'
 ```
 
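After the import, a quick sanity check along these lines confirms that the data arrived as expected (a minimal sketch, assuming the `laion` table created above):

```sql
-- count the imported rows
SELECT count() FROM laion;

-- inspect one row, including the embedding sizes (should be 512)
SELECT id, url, caption, similarity, length(image_embedding), length(text_embedding)
FROM laion
LIMIT 1
FORMAT Vertical;
```
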
-## Check data in table without indexes
+## Run a brute-force ANN search (without an ANN index)
 
-Let's check the work of the following query on the part of the dataset (8 million records):
+To run a brute-force approximate nearest neighbor search, run:
 
 ```sql
-select url, caption from test_laion where similarity > 0.2 order by L2Distance(image_embedding, {target:Array(Float32)}) limit 30
+SELECT url, caption FROM laion WHERE similarity > 0.2 ORDER BY L2Distance(image_embedding, {target:Array(Float32)}) LIMIT 30
 ```
 
-Since the embeddings for images and texts may not match, let's also require a certain threshold of matching accuracy to get images that are more likely to satisfy our queries. The client parameter `target`, which is an array of 512 elements. See later in this article for a convenient way of obtaining such vectors. I used a random picture of a cat from the Internet as a target vector.
+The filter on `similarity` makes sure that the images correspond to the image captions in the query results. `target` is an array of 512 elements and a client parameter. A convenient way to obtain such arrays will be presented at the end of the article. For now, we can use the embedding of a random cat picture as `target`.
 
-**The result**
+**Result**
 
 ```
 ┌─url───────────────────────────────────────────────────────────────────────────────────────────────────────────┬─caption────────────────────────────────────────────────────────────────┐
@@ -114,32 +114,32 @@ Since the embeddings for images and texts may not match, let's also require a ce
 8 rows in set. Elapsed: 6.432 sec. Processed 19.65 million rows, 43.96 GB (3.06 million rows/s., 6.84 GB/s.)
 ```
 
-## Add indexes
+## Run an ANN search with an ANN index
 
-Create a new table or follow instructions from [alter documentation](../../sql-reference/statements/alter/skipping-index.md).
+Either create a new table or use [ALTER TABLE ADD INDEX](../../sql-reference/statements/alter/skipping-index.md) to add an ANN index:
 
 ```sql
-CREATE TABLE laion_dataset
+CREATE TABLE laion
 (
     `id` Int64,
     `url` String,
     `caption` String,
     `NSFW` String,
     `similarity` Float32,
     `image_embedding` Array(Float32),
     `text_embedding` Array(Float32),
-    INDEX annoy_image image_embedding TYPE annoy(1000) GRANULARITY 1000,
-    INDEX annoy_text text_embedding TYPE annoy(1000) GRANULARITY 1000
+    INDEX annoy_image image_embedding TYPE annoy(1000),
+    INDEX annoy_text text_embedding TYPE annoy(1000)
 )
 ENGINE = MergeTree
 ORDER BY id
 SETTINGS index_granularity = 8192
 ```
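If the table already exists, the ANN index can also be added afterwards and built for the data that is already inserted. A minimal sketch, assuming the `laion` table from above; the `allow_experimental_annoy_index` setting is an assumption and may depend on your ClickHouse version:

```sql
-- the Annoy index is experimental and may need to be enabled explicitly
SET allow_experimental_annoy_index = 1;

-- add the index to the existing table and build it for already inserted data
ALTER TABLE laion ADD INDEX annoy_image image_embedding TYPE annoy(1000);
ALTER TABLE laion MATERIALIZE INDEX annoy_image;
```
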
 
-When created, the index will be built by L2Distance. You can read more about the parameters in the [annoy documentation](../../engines/table-engines/mergetree-family/annindexes.md#annoy-annoy). It makes sense to build indexes for a large number of granules. If you need good speed, then GRANULARITY should be several times larger than the expected number of results in the search.
-Now let's check again with the same query:
+By default, Annoy indexes use the L2 distance as the metric. Further tuning knobs for index creation and search are described in the Annoy index [documentation](../../engines/table-engines/mergetree-family/annindexes.md). Let's now run the same query again:
 
 ```sql
-select url, caption from test_indexes_laion where similarity > 0.2 order by L2Distance(image_embedding, {target:Array(Float32)}) limit 8
+SELECT url, caption FROM laion WHERE similarity > 0.2 ORDER BY L2Distance(image_embedding, {target:Array(Float32)}) LIMIT 8
 ```
 
 **Result**
@@ -159,15 +159,18 @@ select url, caption from test_indexes_laion where similarity > 0.2 order by L2Di
 8 rows in set. Elapsed: 0.641 sec. Processed 22.06 thousand rows, 49.36 MB (91.53 thousand rows/s., 204.81 MB/s.)
 ```
 
-The speed has increased significantly. But now, the results sometimes differ from what you are looking for. This is due to the approximation of the search and the quality of the constructed embedding. Note that the example was given for picture embeddings, but there are also text embeddings in the dataset, which can also be used for searching.
+The speed increased significantly, at the cost of less accurate results. This is because the ANN index only provides approximate search results. Note that the example searched for similar image embeddings, but it is also possible to search for similar image caption (text) embeddings.
 
-## Scripts for embeddings
+## Creating embeddings with UDFs
 
-Usually, we do not want to get embeddings from existing data, but to get them for new data and look for similar ones in old data. We can use [UDF](../../sql-reference/functions/index.md#sql-user-defined-functions) for this purpose. They will allow you to set the `target` vector without leaving the client. All of the following scripts will be written for the `ViT-B/32` model, as it was used for this dataset. You can use any model, but it is necessary to build embeddings in the dataset and for new objects using the same model.
+One usually wants to create embeddings for new images or new image captions and search for similar image / image caption pairs in the data. We can use [UDF](../../sql-reference/functions/index.md#sql-user-defined-functions) to create the `target` vector without leaving the client. It is important to use the same model to create the stored data and the new embeddings for searches. The following scripts use the `ViT-B/32` model which also underlies the dataset.
 
 ### Text embeddings
 
+First, store the following Python script in the `user_scripts/` directory of your ClickHouse data path and make it executable (`chmod +x encode_text.py`).
+
 `encode_text.py`:
 
 ```python
 #!/usr/bin/python3
 import clip
@@ -182,10 +185,12 @@ if __name__ == '__main__':
         inputs = clip.tokenize(text)
         with torch.no_grad():
             text_features = model.encode_text(inputs)[0].tolist()
             print(text_features)
             sys.stdout.flush()
 ```
 
-`encode_text_function.xml`:
+Then create `encode_text_function.xml` in a location referenced by `<user_defined_executable_functions_config>/path/to/*_function.xml</user_defined_executable_functions_config>` in your ClickHouse server configuration file.
 
 ```xml
 <functions>
     <function>
@@ -203,19 +208,19 @@
 </functions>
 ```
 
-Now we can simply use:
+You can now simply use:
 
 ```sql
 SELECT encode_text('cat');
 ```
 
-The first use will be slow because the model needs to be loaded. But repeated queries will be fast. Then we copy the results to ``set param_target=...`` and can easily write queries
+The first run will be slow because it loads the model, but repeated runs will be fast. We can then copy the output into `SET param_target = ...` and easily write queries.
 
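For reference, the text UDF can also be combined with the search query directly, without copying the vector by hand. This is a minimal sketch, assuming the `laion` table and the `encode_text` function defined above; note that such a query may fall back to a brute-force scan because the reference vector is not a constant parameter:

```sql
WITH encode_text('a sketch of a cat') AS target
SELECT url, caption
FROM laion
ORDER BY L2Distance(text_embedding, target)
LIMIT 10
```
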
 ### Image embeddings
 
-For pictures, the process is similar, but you send the path instead of the picture (if necessary, you can implement a download picture with processing, but it will take longer)
+Image embeddings can be created similarly, but we provide the Python script with the path to a local image instead of the image caption text.
 
-`encode_picture.py`
+`encode_image.py`
 
 ```python
 #!/usr/bin/python3
 import clip
@@ -231,29 +236,31 @@
         image = preprocess(Image.open(text.strip())).unsqueeze(0).to(device)
         with torch.no_grad():
             image_features = model.encode_image(image)[0].tolist()
-        print(image_features)
+            print(image_features)
             sys.stdout.flush()
 ```
 
-`encode_picture_function.xml`
+`encode_image_function.xml`
 
 ```xml
 <functions>
     <function>
         <type>executable_pool</type>
-        <name>encode_picture</name>
+        <name>encode_image</name>
         <return_type>Array(Float32)</return_type>
         <argument>
             <type>String</type>
             <name>path</name>
         </argument>
         <format>TabSeparated</format>
-        <command>encode_picture.py</command>
+        <command>encode_image.py</command>
         <command_read_timeout>1000000</command_read_timeout>
     </function>
 </functions>
 ```
 
-The query:
+Then run this query:
 
 ```sql
-SELECT encode_picture('some/path/to/your/picture');
+SELECT encode_image('/path/to/your/image');
 ```
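In the same spirit, the image UDF can supply the `target` vector for the brute-force query shown earlier. A sketch, assuming the `laion` table and the `encode_image` function defined above; the image path is a placeholder:

```sql
SELECT url, caption
FROM laion
WHERE similarity > 0.2
ORDER BY L2Distance(image_embedding, (SELECT encode_image('/path/to/your/image')))
LIMIT 8
```
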
@@ -47,6 +47,8 @@ public:
 
     void tryUpdateConnection();
 
+    bool isConnected() const { return connection != nullptr && connection->is_open(); }
+
     const ConnectionInfo & getConnectionInfo() { return connection_info; }
 
     String getInfoForLog() const { return connection_info.host_port; }
@@ -28,10 +28,25 @@ public:
 
     ConnectionHolder(const ConnectionHolder & other) = delete;
 
+    void setBroken() { is_broken = true; }
+
     ~ConnectionHolder()
     {
         if (auto_close)
         {
             connection.reset();
         }
+        else if (is_broken)
+        {
+            try
+            {
+                connection->getRef().reset();
+            }
+            catch (...)
+            {
+                connection.reset();
+            }
+        }
         pool->returnObject(std::move(connection));
     }
 
@@ -49,6 +64,7 @@ private:
     PoolPtr pool;
     ConnectionPtr connection;
     bool auto_close;
+    bool is_broken = false;
 };
 
 using ConnectionHolderPtr = std::unique_ptr<ConnectionHolder>;
@@ -86,4 +86,10 @@ std::shared_ptr<ReadBuffer> WriteBufferToFileSegment::getReadBufferImpl()
     return std::make_shared<ReadBufferFromFile>(file_segment->getPathInLocalCache());
 }
 
+WriteBufferToFileSegment::~WriteBufferToFileSegment()
+{
+    /// To be sure that file exists before destructor of segment_holder is called
+    WriteBufferFromFileDecorator::finalize();
+}
+
 }
@@ -16,6 +16,7 @@ public:
     explicit WriteBufferToFileSegment(FileSegmentsHolderPtr segment_holder);
 
     void nextImpl() override;
+    ~WriteBufferToFileSegment() override;
 
 private:
 
@@ -59,7 +59,6 @@ PostgreSQLSource<T>::PostgreSQLSource(
     init(sample_block);
 }
 
-
 template<typename T>
 void PostgreSQLSource<T>::init(const Block & sample_block)
 {
@@ -82,7 +81,8 @@ void PostgreSQLSource<T>::onStart()
 {
     try
     {
-        tx = std::make_shared<T>(connection_holder->get());
+        auto & conn = connection_holder->get();
+        tx = std::make_shared<T>(conn);
     }
     catch (const pqxx::broken_connection &)
     {
@@ -180,6 +180,27 @@ void PostgreSQLSource<T>::onFinish()
 
     if (tx && auto_commit)
         tx->commit();
 
+    is_completed = true;
 }
 
+template<typename T>
+PostgreSQLSource<T>::~PostgreSQLSource()
+{
+    if (!is_completed)
+    {
+        try
+        {
+            stream.reset();
+            tx.reset();
+        }
+        catch (...)
+        {
+            tryLogCurrentException(__PRETTY_FUNCTION__);
+        }
+
+        connection_holder->setBroken();
+    }
+}
+
 template
@@ -28,6 +28,8 @@ public:
 
     String getName() const override { return "PostgreSQL"; }
 
+    ~PostgreSQLSource() override;
+
 protected:
     PostgreSQLSource(
         std::shared_ptr<T> tx_,
@@ -54,6 +56,7 @@ private:
     ExternalResultDescription description;
 
     bool started = false;
+    bool is_completed = false;
 
     postgres::ConnectionHolderPtr connection_holder;
 
@@ -119,7 +119,11 @@ MergeTreeSequentialSource::MergeTreeSequentialSource(
     addTotalRowsApprox(data_part->rows_count);
 
     /// Add columns because we don't want to read empty blocks
-    injectRequiredColumns(LoadedMergeTreeDataPartInfoForReader(data_part, alter_conversions), storage_snapshot, /*with_subcolumns=*/ false, columns_to_read);
+    injectRequiredColumns(
+        LoadedMergeTreeDataPartInfoForReader(data_part, alter_conversions),
+        storage_snapshot,
+        storage.supportsSubcolumns(),
+        columns_to_read);
 
     NamesAndTypesList columns_for_reader;
     if (take_column_types_from_storage)
@@ -127,6 +131,8 @@ MergeTreeSequentialSource::MergeTreeSequentialSource(
         auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical)
             .withExtendedObjects()
             .withSystemColumns();
+        if (storage.supportsSubcolumns())
+            options.withSubcolumns();
         columns_for_reader = storage_snapshot->getColumnsByNames(options, columns_to_read);
     }
     else
@@ -89,6 +89,8 @@ public:
 
     bool supportsDynamicSubcolumns() const override { return true; }
 
+    bool supportsSubcolumns() const override { return true; }
+
     bool mayBenefitFromIndexForIn(
         const ASTPtr & left_in_operand, ContextPtr query_context, const StorageMetadataPtr & metadata_snapshot) const override
     {
@@ -110,6 +112,12 @@ public:
         return storage.getPartitionIDFromQuery(ast, context);
     }
 
+    StorageSnapshotPtr getStorageSnapshotForQuery(
+        const StorageMetadataPtr & metadata_snapshot, const ASTPtr & /*query*/, ContextPtr query_context) const override
+    {
+        return storage.getStorageSnapshot(metadata_snapshot, query_context);
+    }
+
     bool materializeTTLRecalculateOnly() const
     {
         if (parts.empty())
@@ -618,6 +618,9 @@ class SettingsRandomizer:
         "America/Mazatlan",
         "America/Hermosillo",
         "Mexico/BajaSur",
+        # These timezones had DST transitions on some unusual dates (e.g. 2000-01-15 12:00:00).
+        "Africa/Khartoum",
+        "Africa/Juba",
         # server default that is randomized across all timezones
         # NOTE: due to lots of trickery we cannot use empty timezone here, but this should be the same.
         get_localzone(),
@@ -9,7 +9,6 @@ import os
 import random
 import re
 import shutil
-import string
 import subprocess
 import time
 import shlex
@@ -431,12 +430,19 @@ class ClickhouseIntegrationTestsRunner:
 
     def _get_all_tests(self, repo_path):
         image_cmd = self._get_runner_image_cmd(repo_path)
         runner_opts = self._get_runner_opts()
         out_file = "all_tests.txt"
         out_file_full = os.path.join(self.result_path, "runner_get_all_tests.log")
         cmd = (
-            f"cd {repo_path}/tests/integration && "
-            f"timeout --signal=KILL 1h ./runner {runner_opts} {image_cmd} -- --setup-plan "
-            f"| tee '{out_file_full}'"
+            "cd {repo_path}/tests/integration && "
+            "timeout --signal=KILL 1h ./runner {runner_opts} {image_cmd} -- --setup-plan "
+            "| tee '{out_file_full}' | grep -F '::' | sed -r 's/ \(fixtures used:.*//g; s/^ *//g; s/ *$//g' "
+            "| grep -v -F 'SKIPPED' | sort --unique > {out_file}".format(
+                repo_path=repo_path,
+                runner_opts=self._get_runner_opts(),
+                image_cmd=image_cmd,
+                out_file=out_file,
+                out_file_full=out_file_full,
+            )
         )
 
         logging.info("Getting all tests with cmd '%s'", cmd)
@@ -444,19 +450,34 @@ class ClickhouseIntegrationTestsRunner:
             cmd, shell=True
         )
 
-        all_tests = set()
-        with open(out_file_full, "r", encoding="utf-8") as all_tests_fd:
-            for line in all_tests_fd:
-                if (
-                    line[0] in string.whitespace  # test names at the start of lines
-                    or "::test" not in line  # test names contain '::test'
-                    or "SKIPPED" in line  # pytest.mark.skip/-if
-                ):
-                    continue
-                all_tests.add(line.strip())
+        all_tests_file_path = "{repo_path}/tests/integration/{out_file}".format(
+            repo_path=repo_path, out_file=out_file
+        )
+        if (
+            not os.path.isfile(all_tests_file_path)
+            or os.path.getsize(all_tests_file_path) == 0
+        ):
+            if os.path.isfile(out_file_full):
+                # log runner output
+                logging.info("runner output:")
+                with open(out_file_full, "r") as all_tests_full_file:
+                    for line in all_tests_full_file:
+                        line = line.rstrip()
+                        if line:
+                            logging.info("runner output: %s", line)
+            else:
+                logging.info("runner output '%s' is empty", out_file_full)
 
-        assert all_tests
+            raise Exception(
+                "There is something wrong with getting all tests list: file '{}' is empty or does not exist.".format(
+                    all_tests_file_path
+                )
+            )
+
+        all_tests = []
+        with open(all_tests_file_path, "r") as all_tests_file:
+            for line in all_tests_file:
+                all_tests.append(line.strip())
         return list(sorted(all_tests))
 
     def _get_parallel_tests_skip_list(self, repo_path):
@@ -1,3 +1,5 @@
+SET session_timezone = 'Etc/UTC';
+
 SELECT toDateTime('2017-10-30 08:18:19') + INTERVAL 1 DAY + INTERVAL 1 MONTH - INTERVAL 1 YEAR;
 SELECT toDateTime('2017-10-30 08:18:19') + INTERVAL 1 HOUR + INTERVAL 1000 MINUTE + INTERVAL 10 SECOND;
 SELECT toDateTime('2017-10-30 08:18:19') + INTERVAL 1 DAY + INTERVAL number MONTH FROM system.numbers LIMIT 20;
@@ -0,0 +1,10 @@
+6 1
+5 2
+4 3
+3 4
+4 ttt
+5 ttt
+6 ttt
+{"a":"1","obj":{"k1":1,"k2":0,"k3":0}}
+{"a":"3","obj":{"k1":0,"k2":0,"k3":1}}
+{"a":"1","obj":{"k1":1,"k2":0,"k3":0}}
tests/queries/0_stateless/02864_mutations_subcolumns.sql (new file): 51 lines
|
||||
-- Tags: no-replicated-database
|
||||
-- It won't work in case there are misssing subcolumns in different shards
|
||||
|
||||
DROP TABLE IF EXISTS t_mutations_subcolumns;
|
||||
|
||||
SET allow_experimental_object_type = 1;
|
||||
|
||||
CREATE TABLE t_mutations_subcolumns (id UInt64, n String, obj JSON)
|
||||
ENGINE = MergeTree ORDER BY id;
|
||||
|
||||
INSERT INTO t_mutations_subcolumns VALUES (1, 'aaa', '{"k1": {"k2": "foo"}, "k3": 5}');
|
||||
INSERT INTO t_mutations_subcolumns VALUES (2, 'bbb', '{"k1": {"k2": "fee"}, "k3": 4}');
|
||||
INSERT INTO t_mutations_subcolumns VALUES (3, 'ccc', '{"k1": {"k2": "foo", "k4": "baz"}, "k3": 4}');
|
||||
INSERT INTO t_mutations_subcolumns VALUES (4, 'ddd', '{"k1": {"k2": "foo"}, "k3": 4}');
|
||||
INSERT INTO t_mutations_subcolumns VALUES (5, 'eee', '{"k1": {"k2": "foo"}, "k3": 4}');
|
||||
INSERT INTO t_mutations_subcolumns VALUES (6, 'fff', '{"k1": {"k2": "foo"}, "k3": 4}');
|
||||
|
||||
OPTIMIZE TABLE t_mutations_subcolumns FINAL;
|
||||
|
||||
SELECT count(), min(id) FROM t_mutations_subcolumns;
|
||||
|
||||
SET mutations_sync = 2;
|
||||
|
||||
ALTER TABLE t_mutations_subcolumns DELETE WHERE obj.k3 = 5;
|
||||
SELECT count(), min(id) FROM t_mutations_subcolumns;
|
||||
|
||||
DELETE FROM t_mutations_subcolumns WHERE obj.k1.k2 = 'fee';
|
||||
SELECT count(), min(id) FROM t_mutations_subcolumns;
|
||||
|
||||
ALTER TABLE t_mutations_subcolumns DELETE WHERE obj.k1 = ('foo', 'baz');
|
||||
SELECT count(), min(id) FROM t_mutations_subcolumns;
|
||||
|
||||
ALTER TABLE t_mutations_subcolumns UPDATE n = 'ttt' WHERE obj.k1.k2 = 'foo';
|
||||
SELECT id, n FROM t_mutations_subcolumns;
|
||||
|
||||
DROP TABLE IF EXISTS t_mutations_subcolumns;
|
||||
|
||||
CREATE TABLE t_mutations_subcolumns (a UInt64, obj JSON)
|
||||
ENGINE = MergeTree ORDER BY a PARTITION BY a;
|
||||
|
||||
INSERT INTO t_mutations_subcolumns VALUES (1, '{"k1": 1}');
|
||||
INSERT INTO t_mutations_subcolumns VALUES (2, '{"k2": 1}');
|
||||
INSERT INTO t_mutations_subcolumns VALUES (3, '{"k3": 1}');
|
||||
|
||||
ALTER TABLE t_mutations_subcolumns DELETE WHERE obj.k2 = 1;
|
||||
SELECT * FROM t_mutations_subcolumns ORDER BY a FORMAT JSONEachRow;
|
||||
|
||||
ALTER TABLE t_mutations_subcolumns DELETE WHERE obj.k1 = 0;
|
||||
SELECT * FROM t_mutations_subcolumns ORDER BY a FORMAT JSONEachRow;
|
||||
|
||||
DROP TABLE t_mutations_subcolumns;
|