mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 08:40:50 +00:00
Merge branch 'master' into fast-count-from-files
This commit is contained in:
commit
7e362a2110
@ -219,6 +219,10 @@ LIMIT N
|
||||
SETTINGS annoy_index_search_k_nodes=100;
|
||||
```
|
||||
|
||||
:::note
|
||||
The Annoy index currently does not work with per-table, non-default `index_granularity` settings (see
|
||||
[here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml.
|
||||
:::
|
||||
## USearch {#usearch}
|
||||
|
||||
This type of ANN index is based on the [USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW
|
||||
@ -274,4 +278,4 @@ USearch currently supports two distance functions:
|
||||
([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)).
|
||||
|
||||
For normalized data, `L2Distance` is usually a better choice; otherwise, `cosineDistance` is recommended to compensate for scale. If no
|
||||
distance function was specified during index creation, `L2Distance` is used as default.
|
||||
distance function was specified during index creation, `L2Distance` is used as default.
|
@ -217,6 +217,14 @@ Type: UInt32
|
||||
Default: 1024
|
||||
|
||||
|
||||
## index_mark_cache_policy
|
||||
|
||||
Index mark cache policy name.
|
||||
|
||||
Type: String
|
||||
|
||||
Default: SLRU
|
||||
|
||||
## index_mark_cache_size
|
||||
|
||||
Size of cache for index marks. Zero means disabled.
|
||||
@ -229,6 +237,21 @@ Type: UInt64
|
||||
|
||||
Default: 0
|
||||
|
||||
## index_mark_cache_size_ratio
|
||||
|
||||
The size of the protected queue in the index mark cache relative to the cache's total size.
|
||||
|
||||
Type: Double
|
||||
|
||||
Default: 0.5
|
||||
|
||||
## index_uncompressed_cache_policy
|
||||
|
||||
Index uncompressed cache policy name.
|
||||
|
||||
Type: String
|
||||
|
||||
Default: SLRU
|
||||
|
||||
## index_uncompressed_cache_size
|
||||
|
||||
@ -242,6 +265,13 @@ Type: UInt64
|
||||
|
||||
Default: 0
|
||||
|
||||
## index_uncompressed_cache_size_ratio
|
||||
|
||||
The size of the protected queue in the index uncompressed cache relative to the cache's total size.
|
||||
|
||||
Type: Double
|
||||
|
||||
Default: 0.5
|
||||
|
||||
## io_thread_pool_queue_size
|
||||
|
||||
@ -271,6 +301,14 @@ Type: UInt64
|
||||
|
||||
Default: 5368709120
|
||||
|
||||
## mark_cache_size_ratio
|
||||
|
||||
The size of the protected queue in the mark cache relative to the cache's total size.
|
||||
|
||||
Type: Double
|
||||
|
||||
Default: 0.5
|
||||
|
||||
## max_backup_bandwidth_for_server
|
||||
|
||||
The maximum read speed in bytes per second for all backups on server. Zero means unlimited.
|
||||
@ -629,6 +667,14 @@ Type: UInt64
|
||||
|
||||
Default: 0
|
||||
|
||||
## uncompressed_cache_size_ratio
|
||||
|
||||
The size of the protected queue in the uncompressed cache relative to the cache's total size.
|
||||
|
||||
Type: Double
|
||||
|
||||
Default: 0.5
|
||||
|
||||
## builtin_dictionaries_reload_interval {#builtin-dictionaries-reload-interval}
|
||||
|
||||
The interval in seconds before reloading built-in dictionaries.
|
||||
|
@ -657,21 +657,23 @@ void LocalServer::processConfig()
|
||||
/// There is no need for concurrent queries, override max_concurrent_queries.
|
||||
global_context->getProcessList().setMaxSize(0);
|
||||
|
||||
const size_t memory_amount = getMemoryAmount();
|
||||
const size_t physical_server_memory = getMemoryAmount();
|
||||
const double cache_size_to_ram_max_ratio = config().getDouble("cache_size_to_ram_max_ratio", 0.5);
|
||||
const size_t max_cache_size = static_cast<size_t>(memory_amount * cache_size_to_ram_max_ratio);
|
||||
const size_t max_cache_size = static_cast<size_t>(physical_server_memory * cache_size_to_ram_max_ratio);
|
||||
|
||||
String uncompressed_cache_policy = config().getString("uncompressed_cache_policy", DEFAULT_UNCOMPRESSED_CACHE_POLICY);
|
||||
size_t uncompressed_cache_size = config().getUInt64("uncompressed_cache_size", DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE);
|
||||
double uncompressed_cache_size_ratio = config().getDouble("uncompressed_cache_size_ratio", DEFAULT_UNCOMPRESSED_CACHE_SIZE_RATIO);
|
||||
if (uncompressed_cache_size > max_cache_size)
|
||||
{
|
||||
uncompressed_cache_size = max_cache_size;
|
||||
LOG_INFO(log, "Lowered uncompressed cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(uncompressed_cache_size));
|
||||
}
|
||||
global_context->setUncompressedCache(uncompressed_cache_policy, uncompressed_cache_size);
|
||||
global_context->setUncompressedCache(uncompressed_cache_policy, uncompressed_cache_size, uncompressed_cache_size_ratio);
|
||||
|
||||
String mark_cache_policy = config().getString("mark_cache_policy", DEFAULT_MARK_CACHE_POLICY);
|
||||
size_t mark_cache_size = config().getUInt64("mark_cache_size", DEFAULT_MARK_CACHE_MAX_SIZE);
|
||||
double mark_cache_size_ratio = config().getDouble("mark_cache_size_ratio", DEFAULT_MARK_CACHE_SIZE_RATIO);
|
||||
if (!mark_cache_size)
|
||||
LOG_ERROR(log, "Too low mark cache size will lead to severe performance degradation.");
|
||||
if (mark_cache_size > max_cache_size)
|
||||
@ -679,23 +681,27 @@ void LocalServer::processConfig()
|
||||
mark_cache_size = max_cache_size;
|
||||
LOG_INFO(log, "Lowered mark cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(mark_cache_size));
|
||||
}
|
||||
global_context->setMarkCache(mark_cache_policy, mark_cache_size);
|
||||
global_context->setMarkCache(mark_cache_policy, mark_cache_size, mark_cache_size_ratio);
|
||||
|
||||
String index_uncompressed_cache_policy = config().getString("index_uncompressed_cache_policy", DEFAULT_INDEX_UNCOMPRESSED_CACHE_POLICY);
|
||||
size_t index_uncompressed_cache_size = config().getUInt64("index_uncompressed_cache_size", DEFAULT_INDEX_UNCOMPRESSED_CACHE_MAX_SIZE);
|
||||
double index_uncompressed_cache_size_ratio = config().getDouble("index_uncompressed_cache_size_ratio", DEFAULT_INDEX_UNCOMPRESSED_CACHE_SIZE_RATIO);
|
||||
if (index_uncompressed_cache_size > max_cache_size)
|
||||
{
|
||||
index_uncompressed_cache_size = max_cache_size;
|
||||
/// Report the clamped index uncompressed cache size itself, not the size of the (data) uncompressed cache.
LOG_INFO(log, "Lowered index uncompressed cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(index_uncompressed_cache_size));
|
||||
}
|
||||
global_context->setIndexUncompressedCache(index_uncompressed_cache_size);
|
||||
global_context->setIndexUncompressedCache(index_uncompressed_cache_policy, index_uncompressed_cache_size, index_uncompressed_cache_size_ratio);
|
||||
|
||||
String index_mark_cache_policy = config().getString("index_mark_cache_policy", DEFAULT_INDEX_MARK_CACHE_POLICY);
|
||||
size_t index_mark_cache_size = config().getUInt64("index_mark_cache_size", DEFAULT_INDEX_MARK_CACHE_MAX_SIZE);
|
||||
double index_mark_cache_size_ratio = config().getDouble("index_mark_cache_size_ratio", DEFAULT_INDEX_MARK_CACHE_SIZE_RATIO);
|
||||
if (index_mark_cache_size > max_cache_size)
|
||||
{
|
||||
index_mark_cache_size = max_cache_size;
|
||||
/// Report the clamped index mark cache size itself, not the size of the (data) uncompressed cache.
LOG_INFO(log, "Lowered index mark cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(index_mark_cache_size));
|
||||
}
|
||||
global_context->setIndexMarkCache(index_mark_cache_size);
|
||||
global_context->setIndexMarkCache(index_mark_cache_policy, index_mark_cache_size, index_mark_cache_size_ratio);
|
||||
|
||||
size_t mmap_cache_size = config().getUInt64("mmap_cache_size", DEFAULT_MMAP_CACHE_MAX_SIZE);
|
||||
if (mmap_cache_size > max_cache_size)
|
||||
|
@ -1111,37 +1111,43 @@ try
|
||||
|
||||
String uncompressed_cache_policy = server_settings.uncompressed_cache_policy;
|
||||
size_t uncompressed_cache_size = server_settings.uncompressed_cache_size;
|
||||
double uncompressed_cache_size_ratio = server_settings.uncompressed_cache_size_ratio;
|
||||
if (uncompressed_cache_size > max_cache_size)
|
||||
{
|
||||
uncompressed_cache_size = max_cache_size;
|
||||
LOG_INFO(log, "Lowered uncompressed cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(uncompressed_cache_size));
|
||||
}
|
||||
global_context->setUncompressedCache(uncompressed_cache_policy, uncompressed_cache_size);
|
||||
global_context->setUncompressedCache(uncompressed_cache_policy, uncompressed_cache_size, uncompressed_cache_size_ratio);
|
||||
|
||||
String mark_cache_policy = server_settings.mark_cache_policy;
|
||||
size_t mark_cache_size = server_settings.mark_cache_size;
|
||||
double mark_cache_size_ratio = server_settings.mark_cache_size_ratio;
|
||||
if (mark_cache_size > max_cache_size)
|
||||
{
|
||||
mark_cache_size = max_cache_size;
|
||||
LOG_INFO(log, "Lowered mark cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(mark_cache_size));
|
||||
}
|
||||
global_context->setMarkCache(mark_cache_policy, mark_cache_size);
|
||||
global_context->setMarkCache(mark_cache_policy, mark_cache_size, mark_cache_size_ratio);
|
||||
|
||||
String index_uncompressed_cache_policy = server_settings.index_uncompressed_cache_policy;
|
||||
size_t index_uncompressed_cache_size = server_settings.index_uncompressed_cache_size;
|
||||
double index_uncompressed_cache_size_ratio = server_settings.index_uncompressed_cache_size_ratio;
|
||||
if (index_uncompressed_cache_size > max_cache_size)
|
||||
{
|
||||
index_uncompressed_cache_size = max_cache_size;
|
||||
/// Report the clamped index uncompressed cache size itself, not the size of the (data) uncompressed cache.
LOG_INFO(log, "Lowered index uncompressed cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(index_uncompressed_cache_size));
|
||||
}
|
||||
global_context->setIndexUncompressedCache(index_uncompressed_cache_size);
|
||||
global_context->setIndexUncompressedCache(index_uncompressed_cache_policy, index_uncompressed_cache_size, index_uncompressed_cache_size_ratio);
|
||||
|
||||
String index_mark_cache_policy = server_settings.index_mark_cache_policy;
|
||||
size_t index_mark_cache_size = server_settings.index_mark_cache_size;
|
||||
double index_mark_cache_size_ratio = server_settings.index_mark_cache_size_ratio;
|
||||
if (index_mark_cache_size > max_cache_size)
|
||||
{
|
||||
index_mark_cache_size = max_cache_size;
|
||||
/// Report the clamped index mark cache size itself, not the size of the (data) uncompressed cache.
LOG_INFO(log, "Lowered index mark cache size to {} because the system has limited RAM", formatReadableSizeWithBinarySuffix(index_mark_cache_size));
|
||||
}
|
||||
global_context->setIndexMarkCache(index_mark_cache_size);
|
||||
global_context->setIndexMarkCache(index_mark_cache_policy, index_mark_cache_size, index_mark_cache_size_ratio);
|
||||
|
||||
size_t mmap_cache_size = server_settings.mmap_cache_size;
|
||||
if (mmap_cache_size > max_cache_size)
|
||||
|
@ -40,14 +40,17 @@ public:
|
||||
using MappedPtr = typename CachePolicy::MappedPtr;
|
||||
using KeyMapped = typename CachePolicy::KeyMapped;
|
||||
|
||||
/// Use this ctor if you don't care about the internal cache policy.
|
||||
explicit CacheBase(size_t max_size_in_bytes, size_t max_count = 0, double size_ratio = 0.5)
|
||||
static constexpr auto NO_MAX_COUNT = 0uz;
|
||||
static constexpr auto DEFAULT_SIZE_RATIO = 0.5l;
|
||||
|
||||
/// Use this ctor if you only care about the cache size but not internals like the cache policy.
|
||||
explicit CacheBase(size_t max_size_in_bytes, size_t max_count = NO_MAX_COUNT, double size_ratio = DEFAULT_SIZE_RATIO)
|
||||
: CacheBase("SLRU", max_size_in_bytes, max_count, size_ratio)
|
||||
{
|
||||
}
|
||||
|
||||
/// Use this ctor if you want the user to configure the cache policy via some setting. Supports only general-purpose policies LRU and SLRU.
|
||||
explicit CacheBase(std::string_view cache_policy_name, size_t max_size_in_bytes, size_t max_count = 0, double size_ratio = 0.5)
|
||||
/// Use this ctor if the user should be able to configure the cache policy and cache sizes via settings. Supports only general-purpose policies LRU and SLRU.
|
||||
explicit CacheBase(std::string_view cache_policy_name, size_t max_size_in_bytes, size_t max_count, double size_ratio)
|
||||
{
|
||||
auto on_weight_loss_function = [&](size_t weight_loss) { onRemoveOverflowWeightLoss(weight_loss); };
|
||||
|
||||
@ -79,7 +82,7 @@ public:
|
||||
MappedPtr get(const Key & key)
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
auto res = cache_policy->get(key, lock);
|
||||
auto res = cache_policy->get(key);
|
||||
if (res)
|
||||
++hits;
|
||||
else
|
||||
@ -90,7 +93,7 @@ public:
|
||||
std::optional<KeyMapped> getWithKey(const Key & key)
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
auto res = cache_policy->getWithKey(key, lock);
|
||||
auto res = cache_policy->getWithKey(key);
|
||||
if (res.has_value())
|
||||
++hits;
|
||||
else
|
||||
@ -101,7 +104,7 @@ public:
|
||||
void set(const Key & key, const MappedPtr & mapped)
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
cache_policy->set(key, mapped, lock);
|
||||
cache_policy->set(key, mapped);
|
||||
}
|
||||
|
||||
/// If the value for the key is in the cache, returns it. If it is not, calls load_func() to
|
||||
@ -118,7 +121,7 @@ public:
|
||||
InsertTokenHolder token_holder;
|
||||
{
|
||||
std::lock_guard cache_lock(mutex);
|
||||
auto val = cache_policy->get(key, cache_lock);
|
||||
auto val = cache_policy->get(key);
|
||||
if (val)
|
||||
{
|
||||
++hits;
|
||||
@ -156,7 +159,7 @@ public:
|
||||
auto token_it = insert_tokens.find(key);
|
||||
if (token_it != insert_tokens.end() && token_it->second.get() == token)
|
||||
{
|
||||
cache_policy->set(key, token->value, cache_lock);
|
||||
cache_policy->set(key, token->value);
|
||||
result = true;
|
||||
}
|
||||
|
||||
@ -185,49 +188,49 @@ public:
|
||||
insert_tokens.clear();
|
||||
hits = 0;
|
||||
misses = 0;
|
||||
cache_policy->clear(lock);
|
||||
cache_policy->clear();
|
||||
}
|
||||
|
||||
void remove(const Key & key)
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
cache_policy->remove(key, lock);
|
||||
cache_policy->remove(key);
|
||||
}
|
||||
|
||||
size_t weight() const
|
||||
size_t sizeInBytes() const
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
return cache_policy->weight(lock);
|
||||
return cache_policy->sizeInBytes();
|
||||
}
|
||||
|
||||
size_t count() const
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
return cache_policy->count(lock);
|
||||
return cache_policy->count();
|
||||
}
|
||||
|
||||
size_t maxSize() const
|
||||
size_t maxSizeInBytes() const
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
return cache_policy->maxSize(lock);
|
||||
return cache_policy->maxSizeInBytes();
|
||||
}
|
||||
|
||||
void setMaxCount(size_t max_count)
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
cache_policy->setMaxCount(max_count, lock);
|
||||
cache_policy->setMaxCount(max_count);
|
||||
}
|
||||
|
||||
void setMaxSize(size_t max_size_in_bytes)
|
||||
void setMaxSizeInBytes(size_t max_size_in_bytes)
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
cache_policy->setMaxSize(max_size_in_bytes, lock);
|
||||
cache_policy->setMaxSizeInBytes(max_size_in_bytes);
|
||||
}
|
||||
|
||||
void setQuotaForUser(const String & user_name, size_t max_size_in_bytes, size_t max_entries)
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
cache_policy->setQuotaForUser(user_name, max_size_in_bytes, max_entries, lock);
|
||||
cache_policy->setQuotaForUser(user_name, max_size_in_bytes, max_entries);
|
||||
}
|
||||
|
||||
virtual ~CacheBase() = default;
|
||||
|
@ -37,25 +37,25 @@ public:
|
||||
explicit ICachePolicy(CachePolicyUserQuotaPtr user_quotas_) : user_quotas(std::move(user_quotas_)) {}
|
||||
virtual ~ICachePolicy() = default;
|
||||
|
||||
virtual size_t weight(std::lock_guard<std::mutex> & /*cache_lock*/) const = 0;
|
||||
virtual size_t count(std::lock_guard<std::mutex> & /*cache_lock*/) const = 0;
|
||||
virtual size_t maxSize(std::lock_guard<std::mutex>& /*cache_lock*/) const = 0;
|
||||
virtual size_t sizeInBytes() const = 0;
|
||||
virtual size_t count() const = 0;
|
||||
virtual size_t maxSizeInBytes() const = 0;
|
||||
|
||||
virtual void setMaxCount(size_t /*max_count*/, std::lock_guard<std::mutex> & /* cache_lock */) = 0;
|
||||
virtual void setMaxSize(size_t /*max_size_in_bytes*/, std::lock_guard<std::mutex> & /* cache_lock */) = 0;
|
||||
virtual void setQuotaForUser(const String & user_name, size_t max_size_in_bytes, size_t max_entries, std::lock_guard<std::mutex> & /*cache_lock*/) { user_quotas->setQuotaForUser(user_name, max_size_in_bytes, max_entries); }
|
||||
virtual void setMaxCount(size_t /*max_count*/) = 0;
|
||||
virtual void setMaxSizeInBytes(size_t /*max_size_in_bytes*/) = 0;
|
||||
virtual void setQuotaForUser(const String & user_name, size_t max_size_in_bytes, size_t max_entries) { user_quotas->setQuotaForUser(user_name, max_size_in_bytes, max_entries); }
|
||||
|
||||
/// HashFunction usually hashes the entire key and the found key will be equal to the provided key. In such cases, use get(). It is also
|
||||
/// possible to store other, non-hashed data in the key. In that case, the found key is potentially different from the provided key.
|
||||
/// Then use getWithKey() to also return the found key including its non-hashed data.
|
||||
virtual MappedPtr get(const Key & key, std::lock_guard<std::mutex> & /* cache_lock */) = 0;
|
||||
virtual std::optional<KeyMapped> getWithKey(const Key &, std::lock_guard<std::mutex> & /*cache_lock*/) = 0;
|
||||
virtual MappedPtr get(const Key & key) = 0;
|
||||
virtual std::optional<KeyMapped> getWithKey(const Key &) = 0;
|
||||
|
||||
virtual void set(const Key & key, const MappedPtr & mapped, std::lock_guard<std::mutex> & /*cache_lock*/) = 0;
|
||||
virtual void set(const Key & key, const MappedPtr & mapped) = 0;
|
||||
|
||||
virtual void remove(const Key & key, std::lock_guard<std::mutex> & /*cache_lock*/) = 0;
|
||||
virtual void remove(const Key & key) = 0;
|
||||
|
||||
virtual void clear(std::lock_guard<std::mutex> & /*cache_lock*/) = 0;
|
||||
virtual void clear() = 0;
|
||||
virtual std::vector<KeyMapped> dump() const = 0;
|
||||
|
||||
protected:
|
||||
|
@ -34,41 +34,41 @@ public:
|
||||
{
|
||||
}
|
||||
|
||||
size_t weight(std::lock_guard<std::mutex> & /* cache_lock */) const override
|
||||
size_t sizeInBytes() const override
|
||||
{
|
||||
return current_size_in_bytes;
|
||||
}
|
||||
|
||||
size_t count(std::lock_guard<std::mutex> & /* cache_lock */) const override
|
||||
size_t count() const override
|
||||
{
|
||||
return cells.size();
|
||||
}
|
||||
|
||||
size_t maxSize(std::lock_guard<std::mutex> & /* cache_lock */) const override
|
||||
size_t maxSizeInBytes() const override
|
||||
{
|
||||
return max_size_in_bytes;
|
||||
}
|
||||
|
||||
void setMaxCount(size_t max_count_, std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
void setMaxCount(size_t max_count_) override
|
||||
{
|
||||
max_count = max_count_;
|
||||
removeOverflow();
|
||||
}
|
||||
|
||||
void setMaxSize(size_t max_size_in_bytes_, std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
void setMaxSizeInBytes(size_t max_size_in_bytes_) override
|
||||
{
|
||||
max_size_in_bytes = max_size_in_bytes_;
|
||||
removeOverflow();
|
||||
}
|
||||
|
||||
void clear(std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
void clear() override
|
||||
{
|
||||
queue.clear();
|
||||
cells.clear();
|
||||
current_size_in_bytes = 0;
|
||||
}
|
||||
|
||||
void remove(const Key & key, std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
void remove(const Key & key) override
|
||||
{
|
||||
auto it = cells.find(key);
|
||||
if (it == cells.end())
|
||||
@ -79,7 +79,7 @@ public:
|
||||
cells.erase(it);
|
||||
}
|
||||
|
||||
MappedPtr get(const Key & key, std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
MappedPtr get(const Key & key) override
|
||||
{
|
||||
auto it = cells.find(key);
|
||||
if (it == cells.end())
|
||||
@ -93,7 +93,7 @@ public:
|
||||
return cell.value;
|
||||
}
|
||||
|
||||
std::optional<KeyMapped> getWithKey(const Key & key, std::lock_guard<std::mutex> & /*cache_lock*/) override
|
||||
std::optional<KeyMapped> getWithKey(const Key & key) override
|
||||
{
|
||||
auto it = cells.find(key);
|
||||
if (it == cells.end())
|
||||
@ -107,7 +107,7 @@ public:
|
||||
return std::make_optional<KeyMapped>({it->first, cell.value});
|
||||
}
|
||||
|
||||
void set(const Key & key, const MappedPtr & mapped, std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
void set(const Key & key, const MappedPtr & mapped) override
|
||||
{
|
||||
auto [it, inserted] = cells.emplace(std::piecewise_construct,
|
||||
std::forward_as_tuple(key),
|
||||
|
@ -366,6 +366,8 @@ The server successfully detected this situation and will download merged part fr
|
||||
M(DiskS3PutObject, "Number of DiskS3 API PutObject calls.") \
|
||||
M(DiskS3GetObject, "Number of DiskS3 API GetObject calls.") \
|
||||
\
|
||||
M(EngineFileLikeReadFiles, "Number of files read in table engines working with files (like File/S3/URL/HDFS).") \
|
||||
\
|
||||
M(ReadBufferFromS3Microseconds, "Time spent on reading from S3.") \
|
||||
M(ReadBufferFromS3InitMicroseconds, "Time spent initializing connection to S3.") \
|
||||
M(ReadBufferFromS3Bytes, "Bytes read from S3.") \
|
||||
|
@ -31,45 +31,45 @@ public:
|
||||
/// TODO: construct from special struct with cache policy parameters (also with max_protected_size).
|
||||
SLRUCachePolicy(size_t max_size_in_bytes_, size_t max_count_, double size_ratio_, OnWeightLossFunction on_weight_loss_function_)
|
||||
: Base(std::make_unique<NoCachePolicyUserQuota>())
|
||||
, size_ratio(size_ratio_)
|
||||
, max_protected_size(static_cast<size_t>(max_size_in_bytes_ * std::min(1.0, size_ratio)))
|
||||
, max_size_in_bytes(max_size_in_bytes_)
|
||||
, max_protected_size(calculateMaxProtectedSize(max_size_in_bytes_, size_ratio_))
|
||||
, max_count(max_count_)
|
||||
, size_ratio(size_ratio_)
|
||||
, on_weight_loss_function(on_weight_loss_function_)
|
||||
{
|
||||
}
|
||||
|
||||
size_t weight(std::lock_guard<std::mutex> & /* cache_lock */) const override
|
||||
size_t sizeInBytes() const override
|
||||
{
|
||||
return current_size_in_bytes;
|
||||
}
|
||||
|
||||
size_t count(std::lock_guard<std::mutex> & /* cache_lock */) const override
|
||||
size_t count() const override
|
||||
{
|
||||
return cells.size();
|
||||
}
|
||||
|
||||
size_t maxSize(std::lock_guard<std::mutex> & /* cache_lock */) const override
|
||||
size_t maxSizeInBytes() const override
|
||||
{
|
||||
return max_size_in_bytes;
|
||||
}
|
||||
|
||||
void setMaxCount(size_t max_count_, std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
void setMaxCount(size_t max_count_) override
|
||||
{
|
||||
max_count = max_count_;
|
||||
removeOverflow(protected_queue, max_protected_size, current_protected_size, /*is_protected=*/true);
|
||||
removeOverflow(probationary_queue, max_size_in_bytes, current_size_in_bytes, /*is_protected=*/false);
|
||||
}
|
||||
|
||||
void setMaxSize(size_t max_size_in_bytes_, std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
void setMaxSizeInBytes(size_t max_size_in_bytes_) override
|
||||
{
|
||||
max_protected_size = static_cast<size_t>(max_size_in_bytes_ * std::min(1.0, size_ratio));
|
||||
max_protected_size = calculateMaxProtectedSize(max_size_in_bytes_, size_ratio);
|
||||
max_size_in_bytes = max_size_in_bytes_;
|
||||
removeOverflow(protected_queue, max_protected_size, current_protected_size, /*is_protected=*/true);
|
||||
removeOverflow(probationary_queue, max_size_in_bytes, current_size_in_bytes, /*is_protected=*/false);
|
||||
}
|
||||
|
||||
void clear(std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
void clear() override
|
||||
{
|
||||
cells.clear();
|
||||
probationary_queue.clear();
|
||||
@ -78,7 +78,7 @@ public:
|
||||
current_protected_size = 0;
|
||||
}
|
||||
|
||||
void remove(const Key & key, std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
void remove(const Key & key) override
|
||||
{
|
||||
auto it = cells.find(key);
|
||||
if (it == cells.end())
|
||||
@ -95,7 +95,7 @@ public:
|
||||
cells.erase(it);
|
||||
}
|
||||
|
||||
MappedPtr get(const Key & key, std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
MappedPtr get(const Key & key) override
|
||||
{
|
||||
auto it = cells.find(key);
|
||||
if (it == cells.end())
|
||||
@ -116,7 +116,7 @@ public:
|
||||
return cell.value;
|
||||
}
|
||||
|
||||
std::optional<KeyMapped> getWithKey(const Key & key, std::lock_guard<std::mutex> & /*cache_lock*/) override
|
||||
std::optional<KeyMapped> getWithKey(const Key & key) override
|
||||
{
|
||||
auto it = cells.find(key);
|
||||
if (it == cells.end())
|
||||
@ -137,7 +137,7 @@ public:
|
||||
return std::make_optional<KeyMapped>({it->first, cell.value});
|
||||
}
|
||||
|
||||
void set(const Key & key, const MappedPtr & mapped, std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
void set(const Key & key, const MappedPtr & mapped) override
|
||||
{
|
||||
auto [it, inserted] = cells.emplace(std::piecewise_construct,
|
||||
std::forward_as_tuple(key),
|
||||
@ -208,16 +208,21 @@ private:
|
||||
|
||||
Cells cells;
|
||||
|
||||
size_t max_size_in_bytes;
|
||||
size_t max_protected_size;
|
||||
size_t max_count;
|
||||
const double size_ratio;
|
||||
size_t current_protected_size = 0;
|
||||
size_t current_size_in_bytes = 0;
|
||||
size_t max_protected_size;
|
||||
size_t max_size_in_bytes;
|
||||
size_t max_count;
|
||||
|
||||
WeightFunction weight_function;
|
||||
OnWeightLossFunction on_weight_loss_function;
|
||||
|
||||
static size_t calculateMaxProtectedSize(size_t max_size_in_bytes, double size_ratio)
|
||||
{
|
||||
return static_cast<size_t>(max_size_in_bytes * std::max(0.0, std::min(1.0, size_ratio)));
|
||||
}
|
||||
|
||||
void removeOverflow(SLRUQueue & queue, size_t max_weight_size, size_t & current_weight_size, bool is_protected)
|
||||
{
|
||||
size_t current_weight_lost = 0;
|
||||
|
@ -94,39 +94,39 @@ public:
|
||||
{
|
||||
}
|
||||
|
||||
size_t weight(std::lock_guard<std::mutex> & /* cache_lock */) const override
|
||||
size_t sizeInBytes() const override
|
||||
{
|
||||
return size_in_bytes;
|
||||
}
|
||||
|
||||
size_t count(std::lock_guard<std::mutex> & /* cache_lock */) const override
|
||||
size_t count() const override
|
||||
{
|
||||
return cache.size();
|
||||
}
|
||||
|
||||
size_t maxSize(std::lock_guard<std::mutex> & /* cache_lock */) const override
|
||||
size_t maxSizeInBytes() const override
|
||||
{
|
||||
return max_size_in_bytes;
|
||||
}
|
||||
|
||||
void setMaxCount(size_t max_count_, std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
void setMaxCount(size_t max_count_) override
|
||||
{
|
||||
/// lazy behavior: the cache only shrinks upon the next insert
|
||||
max_count = max_count_;
|
||||
}
|
||||
|
||||
void setMaxSize(size_t max_size_in_bytes_, std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
void setMaxSizeInBytes(size_t max_size_in_bytes_) override
|
||||
{
|
||||
/// lazy behavior: the cache only shrinks upon the next insert
|
||||
max_size_in_bytes = max_size_in_bytes_;
|
||||
}
|
||||
|
||||
void clear(std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
void clear() override
|
||||
{
|
||||
cache.clear();
|
||||
}
|
||||
|
||||
void remove(const Key & key, std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
void remove(const Key & key) override
|
||||
{
|
||||
auto it = cache.find(key);
|
||||
if (it == cache.end())
|
||||
@ -137,7 +137,7 @@ public:
|
||||
size_in_bytes -= sz;
|
||||
}
|
||||
|
||||
MappedPtr get(const Key & key, std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
MappedPtr get(const Key & key) override
|
||||
{
|
||||
auto it = cache.find(key);
|
||||
if (it == cache.end())
|
||||
@ -145,7 +145,7 @@ public:
|
||||
return it->second;
|
||||
}
|
||||
|
||||
std::optional<KeyMapped> getWithKey(const Key & key, std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
std::optional<KeyMapped> getWithKey(const Key & key) override
|
||||
{
|
||||
auto it = cache.find(key);
|
||||
if (it == cache.end())
|
||||
@ -154,7 +154,7 @@ public:
|
||||
}
|
||||
|
||||
/// Evicts on a best-effort basis. If there are too many non-stale entries, the new entry may not be cached at all!
|
||||
void set(const Key & key, const MappedPtr & mapped, std::lock_guard<std::mutex> & /* cache_lock */) override
|
||||
void set(const Key & key, const MappedPtr & mapped) override
|
||||
{
|
||||
chassert(mapped.get());
|
||||
|
||||
|
@ -5,11 +5,11 @@
|
||||
TEST(LRUCache, set)
|
||||
{
|
||||
using SimpleCacheBase = DB::CacheBase<int, int>;
|
||||
auto lru_cache = SimpleCacheBase("LRU", /*max_size_in_bytes*/ 10, /*max_count*/ 10);
|
||||
auto lru_cache = SimpleCacheBase("LRU", /*max_size_in_bytes*/ 10, /*max_count*/ 10, /*size_ratio*/ 0.5);
|
||||
lru_cache.set(1, std::make_shared<int>(2));
|
||||
lru_cache.set(2, std::make_shared<int>(3));
|
||||
|
||||
auto w = lru_cache.weight();
|
||||
auto w = lru_cache.sizeInBytes();
|
||||
auto n = lru_cache.count();
|
||||
ASSERT_EQ(w, 2);
|
||||
ASSERT_EQ(n, 2);
|
||||
@ -18,7 +18,7 @@ TEST(LRUCache, set)
|
||||
TEST(LRUCache, update)
|
||||
{
|
||||
using SimpleCacheBase = DB::CacheBase<int, int>;
|
||||
auto lru_cache = SimpleCacheBase("LRU", /*max_size_in_bytes*/ 10, /*max_count*/ 10);
|
||||
auto lru_cache = SimpleCacheBase("LRU", /*max_size_in_bytes*/ 10, /*max_count*/ 10, /*size_ratio*/ 0.5);
|
||||
lru_cache.set(1, std::make_shared<int>(2));
|
||||
lru_cache.set(1, std::make_shared<int>(3));
|
||||
auto val = lru_cache.get(1);
|
||||
@ -29,7 +29,7 @@ TEST(LRUCache, update)
|
||||
TEST(LRUCache, get)
|
||||
{
|
||||
using SimpleCacheBase = DB::CacheBase<int, int>;
|
||||
auto lru_cache = SimpleCacheBase("LRU", /*max_size_in_bytes*/ 10, /*max_count*/ 10);
|
||||
auto lru_cache = SimpleCacheBase("LRU", /*max_size_in_bytes*/ 10, /*max_count*/ 10, /*size_ratio*/ 0.5);
|
||||
lru_cache.set(1, std::make_shared<int>(2));
|
||||
lru_cache.set(2, std::make_shared<int>(3));
|
||||
SimpleCacheBase::MappedPtr value = lru_cache.get(1);
|
||||
@ -49,7 +49,7 @@ struct ValueWeight
|
||||
TEST(LRUCache, evictOnSize)
|
||||
{
|
||||
using SimpleCacheBase = DB::CacheBase<int, size_t>;
|
||||
auto lru_cache = SimpleCacheBase("LRU", /*max_size_in_bytes*/ 20, /*max_count*/ 3);
|
||||
auto lru_cache = SimpleCacheBase("LRU", /*max_size_in_bytes*/ 20, /*max_count*/ 3, /*size_ratio*/ 0.5);
|
||||
lru_cache.set(1, std::make_shared<size_t>(2));
|
||||
lru_cache.set(2, std::make_shared<size_t>(3));
|
||||
lru_cache.set(3, std::make_shared<size_t>(4));
|
||||
@ -65,7 +65,7 @@ TEST(LRUCache, evictOnSize)
|
||||
TEST(LRUCache, evictOnWeight)
|
||||
{
|
||||
using SimpleCacheBase = DB::CacheBase<int, size_t, std::hash<int>, ValueWeight>;
|
||||
auto lru_cache = SimpleCacheBase("LRU", /*max_size_in_bytes*/ 10, /*max_count*/ 10);
|
||||
auto lru_cache = SimpleCacheBase("LRU", /*max_size_in_bytes*/ 10, /*max_count*/ 10, /*size_ratio*/ 0.5);
|
||||
lru_cache.set(1, std::make_shared<size_t>(2));
|
||||
lru_cache.set(2, std::make_shared<size_t>(3));
|
||||
lru_cache.set(3, std::make_shared<size_t>(4));
|
||||
@ -74,7 +74,7 @@ TEST(LRUCache, evictOnWeight)
|
||||
auto n = lru_cache.count();
|
||||
ASSERT_EQ(n, 2);
|
||||
|
||||
auto w = lru_cache.weight();
|
||||
auto w = lru_cache.sizeInBytes();
|
||||
ASSERT_EQ(w, 9);
|
||||
|
||||
auto value = lru_cache.get(1);
|
||||
@ -86,7 +86,7 @@ TEST(LRUCache, evictOnWeight)
|
||||
TEST(LRUCache, getOrSet)
|
||||
{
|
||||
using SimpleCacheBase = DB::CacheBase<int, size_t, std::hash<int>, ValueWeight>;
|
||||
auto lru_cache = SimpleCacheBase("LRU", /*max_size_in_bytes*/ 10, /*max_count*/ 10);
|
||||
auto lru_cache = SimpleCacheBase("LRU", /*max_size_in_bytes*/ 10, /*max_count*/ 10, /*size_ratio*/ 0.5);
|
||||
size_t x = 10;
|
||||
auto load_func = [&] { return std::make_shared<size_t>(x); };
|
||||
auto [value, loaded] = lru_cache.getOrSet(1, load_func);
|
||||
|
@ -9,7 +9,7 @@ TEST(SLRUCache, set)
|
||||
slru_cache.set(1, std::make_shared<int>(2));
|
||||
slru_cache.set(2, std::make_shared<int>(3));
|
||||
|
||||
auto w = slru_cache.weight();
|
||||
auto w = slru_cache.sizeInBytes();
|
||||
auto n = slru_cache.count();
|
||||
ASSERT_EQ(w, 2);
|
||||
ASSERT_EQ(n, 2);
|
||||
@ -125,7 +125,7 @@ TEST(SLRUCache, evictOnElements)
|
||||
auto n = slru_cache.count();
|
||||
ASSERT_EQ(n, 1);
|
||||
|
||||
auto w = slru_cache.weight();
|
||||
auto w = slru_cache.sizeInBytes();
|
||||
ASSERT_EQ(w, 3);
|
||||
|
||||
auto value = slru_cache.get(1);
|
||||
@ -148,7 +148,7 @@ TEST(SLRUCache, evictOnWeight)
|
||||
auto n = slru_cache.count();
|
||||
ASSERT_EQ(n, 2);
|
||||
|
||||
auto w = slru_cache.weight();
|
||||
auto w = slru_cache.sizeInBytes();
|
||||
ASSERT_EQ(w, 9);
|
||||
|
||||
auto value = slru_cache.get(1);
|
||||
|
@ -23,7 +23,7 @@ int main(int argc, char ** argv)
|
||||
|
||||
try
|
||||
{
|
||||
UncompressedCache cache(1024);
|
||||
UncompressedCache cache("SLRU", 1024, 0.5);
|
||||
std::string path = argv[1];
|
||||
|
||||
std::cerr << std::fixed << std::setprecision(3);
|
||||
|
@ -66,12 +66,18 @@
|
||||
#define DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH 1000
|
||||
|
||||
/// Default maximum (total and entry) sizes and policies of various caches
|
||||
static constexpr auto DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE = 0_MiB;
|
||||
static constexpr auto DEFAULT_UNCOMPRESSED_CACHE_POLICY = "SLRU";
|
||||
static constexpr auto DEFAULT_MARK_CACHE_MAX_SIZE = 5368_MiB;
|
||||
static constexpr auto DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE = 0_MiB;
|
||||
static constexpr auto DEFAULT_UNCOMPRESSED_CACHE_SIZE_RATIO = 0.5l;
|
||||
static constexpr auto DEFAULT_MARK_CACHE_POLICY = "SLRU";
|
||||
static constexpr auto DEFAULT_MARK_CACHE_MAX_SIZE = 5368_MiB;
|
||||
static constexpr auto DEFAULT_MARK_CACHE_SIZE_RATIO = 0.5l;
|
||||
static constexpr auto DEFAULT_INDEX_UNCOMPRESSED_CACHE_POLICY = "SLRU";
|
||||
static constexpr auto DEFAULT_INDEX_UNCOMPRESSED_CACHE_MAX_SIZE = 0_MiB;
|
||||
static constexpr auto DEFAULT_INDEX_UNCOMPRESSED_CACHE_SIZE_RATIO = 0.5l;
|
||||
static constexpr auto DEFAULT_INDEX_MARK_CACHE_POLICY = "SLRU";
|
||||
static constexpr auto DEFAULT_INDEX_MARK_CACHE_MAX_SIZE = 0_MiB;
|
||||
static constexpr auto DEFAULT_INDEX_MARK_CACHE_SIZE_RATIO = 0.5l;
|
||||
static constexpr auto DEFAULT_MMAP_CACHE_MAX_SIZE = 1_KiB; /// chosen by rolling dice
|
||||
static constexpr auto DEFAULT_COMPILED_EXPRESSION_CACHE_MAX_SIZE = 128_MiB;
|
||||
static constexpr auto DEFAULT_COMPILED_EXPRESSION_CACHE_MAX_ENTRIES = 10'000;
|
||||
|
@ -60,10 +60,16 @@ namespace DB
|
||||
M(Double, cache_size_to_ram_max_ratio, 0.5, "Set cache size ro RAM max ratio. Allows to lower cache size on low-memory systems.", 0) \
|
||||
M(String, uncompressed_cache_policy, DEFAULT_UNCOMPRESSED_CACHE_POLICY, "Uncompressed cache policy name.", 0) \
|
||||
M(UInt64, uncompressed_cache_size, DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE, "Size of cache for uncompressed blocks. Zero means disabled.", 0) \
|
||||
M(UInt64, mark_cache_size, DEFAULT_MARK_CACHE_MAX_SIZE, "Size of cache for marks (index of MergeTree family of tables).", 0) \
|
||||
M(Double, uncompressed_cache_size_ratio, DEFAULT_UNCOMPRESSED_CACHE_SIZE_RATIO, "The size of the protected queue in the uncompressed cache relative to the cache's total size.", 0) \
|
||||
M(String, mark_cache_policy, DEFAULT_MARK_CACHE_POLICY, "Mark cache policy name.", 0) \
|
||||
M(UInt64, mark_cache_size, DEFAULT_MARK_CACHE_MAX_SIZE, "Size of cache for marks (index of MergeTree family of tables).", 0) \
|
||||
M(Double, mark_cache_size_ratio, DEFAULT_MARK_CACHE_SIZE_RATIO, "The size of the protected queue in the mark cache relative to the cache's total size.", 0) \
|
||||
M(String, index_uncompressed_cache_policy, DEFAULT_INDEX_UNCOMPRESSED_CACHE_POLICY, "Index uncompressed cache policy name.", 0) \
|
||||
M(UInt64, index_uncompressed_cache_size, DEFAULT_INDEX_UNCOMPRESSED_CACHE_MAX_SIZE, "Size of cache for uncompressed blocks of MergeTree indices. Zero means disabled.", 0) \
|
||||
M(Double, index_uncompressed_cache_size_ratio, DEFAULT_INDEX_UNCOMPRESSED_CACHE_SIZE_RATIO, "The size of the protected queue in the index uncompressed cache relative to the cache's total size.", 0) \
|
||||
M(String, index_mark_cache_policy, DEFAULT_INDEX_MARK_CACHE_POLICY, "Index mark cache policy name.", 0) \
|
||||
M(UInt64, index_mark_cache_size, DEFAULT_INDEX_MARK_CACHE_MAX_SIZE, "Size of cache for index marks. Zero means disabled.", 0) \
|
||||
M(Double, index_mark_cache_size_ratio, DEFAULT_INDEX_MARK_CACHE_SIZE_RATIO, "The size of the protected queue in the index mark cache relative to the cache's total size.", 0) \
|
||||
M(UInt64, mmap_cache_size, DEFAULT_MMAP_CACHE_MAX_SIZE, "A cache for mmapped files.", 0) \
|
||||
\
|
||||
M(Bool, disable_internal_dns_cache, false, "Disable internal DNS caching at all.", 0) \
|
||||
|
@ -42,11 +42,8 @@ private:
|
||||
using Base = CacheBase<UInt128, UncompressedCacheCell, UInt128TrivialHash, UncompressedSizeWeightFunction>;
|
||||
|
||||
public:
|
||||
explicit UncompressedCache(size_t max_size_in_bytes)
|
||||
: Base(max_size_in_bytes) {}
|
||||
|
||||
UncompressedCache(const String & uncompressed_cache_policy, size_t max_size_in_bytes)
|
||||
: Base(uncompressed_cache_policy, max_size_in_bytes) {}
|
||||
UncompressedCache(const String & cache_policy, size_t max_size_in_bytes, double size_ratio)
|
||||
: Base(cache_policy, max_size_in_bytes, 0, size_ratio) {}
|
||||
|
||||
/// Calculate key from path to file and offset.
|
||||
static UInt128 hash(const String & path_to_file, size_t offset)
|
||||
|
@ -175,7 +175,7 @@ public:
|
||||
private:
|
||||
CachePtr getHashTableStatsCache(const Params & params, const std::lock_guard<std::mutex> &)
|
||||
{
|
||||
if (!hash_table_stats || hash_table_stats->maxSize() != params.max_entries_for_hash_table_stats)
|
||||
if (!hash_table_stats || hash_table_stats->maxSizeInBytes() != params.max_entries_for_hash_table_stats)
|
||||
hash_table_stats = std::make_shared<Cache>(params.max_entries_for_hash_table_stats);
|
||||
return hash_table_stats;
|
||||
}
|
||||
|
@ -480,7 +480,7 @@ QueryCache::QueryCache(size_t max_size_in_bytes, size_t max_entries, size_t max_
|
||||
void QueryCache::updateConfiguration(size_t max_size_in_bytes, size_t max_entries, size_t max_entry_size_in_bytes_, size_t max_entry_size_in_rows_)
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
cache.setMaxSize(max_size_in_bytes);
|
||||
cache.setMaxSizeInBytes(max_size_in_bytes);
|
||||
cache.setMaxCount(max_entries);
|
||||
max_entry_size_in_bytes = max_entry_size_in_bytes_;
|
||||
max_entry_size_in_rows = max_entry_size_in_rows_;
|
||||
@ -510,9 +510,9 @@ void QueryCache::clear()
|
||||
times_executed.clear();
|
||||
}
|
||||
|
||||
size_t QueryCache::weight() const
|
||||
size_t QueryCache::sizeInBytes() const
|
||||
{
|
||||
return cache.weight();
|
||||
return cache.sizeInBytes();
|
||||
}
|
||||
|
||||
size_t QueryCache::count() const
|
||||
|
@ -182,7 +182,7 @@ public:
|
||||
|
||||
void clear();
|
||||
|
||||
size_t weight() const;
|
||||
size_t sizeInBytes() const;
|
||||
size_t count() const;
|
||||
|
||||
/// Record new execution of query represented by key. Returns number of executions so far.
|
||||
|
@ -2268,14 +2268,14 @@ QueryStatusPtr Context::getProcessListElement() const
|
||||
}
|
||||
|
||||
|
||||
void Context::setUncompressedCache(const String & uncompressed_cache_policy, size_t max_size_in_bytes)
|
||||
void Context::setUncompressedCache(const String & cache_policy, size_t max_size_in_bytes, double size_ratio)
|
||||
{
|
||||
auto lock = getLock();
|
||||
|
||||
if (shared->uncompressed_cache)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Uncompressed cache has been already created.");
|
||||
|
||||
shared->uncompressed_cache = std::make_shared<UncompressedCache>(uncompressed_cache_policy, max_size_in_bytes);
|
||||
shared->uncompressed_cache = std::make_shared<UncompressedCache>(cache_policy, max_size_in_bytes, size_ratio);
|
||||
}
|
||||
|
||||
void Context::updateUncompressedCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
|
||||
@ -2286,7 +2286,7 @@ void Context::updateUncompressedCacheConfiguration(const Poco::Util::AbstractCon
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Uncompressed cache was not created yet.");
|
||||
|
||||
size_t max_size_in_bytes = config.getUInt64("uncompressed_cache_size", DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE);
|
||||
shared->uncompressed_cache->setMaxSize(max_size_in_bytes);
|
||||
shared->uncompressed_cache->setMaxSizeInBytes(max_size_in_bytes);
|
||||
}
|
||||
|
||||
UncompressedCachePtr Context::getUncompressedCache() const
|
||||
@ -2303,14 +2303,14 @@ void Context::clearUncompressedCache() const
|
||||
shared->uncompressed_cache->clear();
|
||||
}
|
||||
|
||||
void Context::setMarkCache(const String & mark_cache_policy, size_t cache_size_in_bytes)
|
||||
void Context::setMarkCache(const String & cache_policy, size_t max_cache_size_in_bytes, double size_ratio)
|
||||
{
|
||||
auto lock = getLock();
|
||||
|
||||
if (shared->mark_cache)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mark cache has been already created.");
|
||||
|
||||
shared->mark_cache = std::make_shared<MarkCache>(mark_cache_policy, cache_size_in_bytes);
|
||||
shared->mark_cache = std::make_shared<MarkCache>(cache_policy, max_cache_size_in_bytes, size_ratio);
|
||||
}
|
||||
|
||||
void Context::updateMarkCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
|
||||
@ -2321,7 +2321,7 @@ void Context::updateMarkCacheConfiguration(const Poco::Util::AbstractConfigurati
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mark cache was not created yet.");
|
||||
|
||||
size_t max_size_in_bytes = config.getUInt64("mark_cache_size", DEFAULT_MARK_CACHE_MAX_SIZE);
|
||||
shared->mark_cache->setMaxSize(max_size_in_bytes);
|
||||
shared->mark_cache->setMaxSizeInBytes(max_size_in_bytes);
|
||||
}
|
||||
|
||||
MarkCachePtr Context::getMarkCache() const
|
||||
@ -2353,14 +2353,14 @@ ThreadPool & Context::getLoadMarksThreadpool() const
|
||||
return *shared->load_marks_threadpool;
|
||||
}
|
||||
|
||||
void Context::setIndexUncompressedCache(size_t max_size_in_bytes)
|
||||
void Context::setIndexUncompressedCache(const String & cache_policy, size_t max_size_in_bytes, double size_ratio)
|
||||
{
|
||||
auto lock = getLock();
|
||||
|
||||
if (shared->index_uncompressed_cache)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Index uncompressed cache has been already created.");
|
||||
|
||||
shared->index_uncompressed_cache = std::make_shared<UncompressedCache>(max_size_in_bytes);
|
||||
shared->index_uncompressed_cache = std::make_shared<UncompressedCache>(cache_policy, max_size_in_bytes, size_ratio);
|
||||
}
|
||||
|
||||
void Context::updateIndexUncompressedCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
|
||||
@ -2371,7 +2371,7 @@ void Context::updateIndexUncompressedCacheConfiguration(const Poco::Util::Abstra
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Index uncompressed cache was not created yet.");
|
||||
|
||||
size_t max_size_in_bytes = config.getUInt64("index_uncompressed_cache_size", DEFAULT_INDEX_UNCOMPRESSED_CACHE_MAX_SIZE);
|
||||
shared->index_uncompressed_cache->setMaxSize(max_size_in_bytes);
|
||||
shared->index_uncompressed_cache->setMaxSizeInBytes(max_size_in_bytes);
|
||||
}
|
||||
|
||||
UncompressedCachePtr Context::getIndexUncompressedCache() const
|
||||
@ -2388,14 +2388,14 @@ void Context::clearIndexUncompressedCache() const
|
||||
shared->index_uncompressed_cache->clear();
|
||||
}
|
||||
|
||||
void Context::setIndexMarkCache(size_t cache_size_in_bytes)
|
||||
void Context::setIndexMarkCache(const String & cache_policy, size_t max_cache_size_in_bytes, double size_ratio)
|
||||
{
|
||||
auto lock = getLock();
|
||||
|
||||
if (shared->index_mark_cache)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Index mark cache has been already created.");
|
||||
|
||||
shared->index_mark_cache = std::make_shared<MarkCache>(cache_size_in_bytes);
|
||||
shared->index_mark_cache = std::make_shared<MarkCache>(cache_policy, max_cache_size_in_bytes, size_ratio);
|
||||
}
|
||||
|
||||
void Context::updateIndexMarkCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
|
||||
@ -2406,7 +2406,7 @@ void Context::updateIndexMarkCacheConfiguration(const Poco::Util::AbstractConfig
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Index mark cache was not created yet.");
|
||||
|
||||
size_t max_size_in_bytes = config.getUInt64("index_mark_cache_size", DEFAULT_INDEX_MARK_CACHE_MAX_SIZE);
|
||||
shared->index_mark_cache->setMaxSize(max_size_in_bytes);
|
||||
shared->index_mark_cache->setMaxSizeInBytes(max_size_in_bytes);
|
||||
}
|
||||
|
||||
MarkCachePtr Context::getIndexMarkCache() const
|
||||
@ -2423,14 +2423,14 @@ void Context::clearIndexMarkCache() const
|
||||
shared->index_mark_cache->clear();
|
||||
}
|
||||
|
||||
void Context::setMMappedFileCache(size_t cache_size_in_num_entries)
|
||||
void Context::setMMappedFileCache(size_t max_cache_size_in_num_entries)
|
||||
{
|
||||
auto lock = getLock();
|
||||
|
||||
if (shared->mmap_cache)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mapped file cache has been already created.");
|
||||
|
||||
shared->mmap_cache = std::make_shared<MMappedFileCache>(cache_size_in_num_entries);
|
||||
shared->mmap_cache = std::make_shared<MMappedFileCache>(max_cache_size_in_num_entries);
|
||||
}
|
||||
|
||||
void Context::updateMMappedFileCacheConfiguration(const Poco::Util::AbstractConfiguration & config)
|
||||
@ -2441,7 +2441,7 @@ void Context::updateMMappedFileCacheConfiguration(const Poco::Util::AbstractConf
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mapped file cache was not created yet.");
|
||||
|
||||
size_t max_size_in_bytes = config.getUInt64("mmap_cache_size", DEFAULT_MMAP_CACHE_MAX_SIZE);
|
||||
shared->mmap_cache->setMaxSize(max_size_in_bytes);
|
||||
shared->mmap_cache->setMaxSizeInBytes(max_size_in_bytes);
|
||||
}
|
||||
|
||||
MMappedFileCachePtr Context::getMMappedFileCache() const
|
||||
|
@ -922,28 +922,28 @@ public:
|
||||
|
||||
/// --- Caches ------------------------------------------------------------------------------------------
|
||||
|
||||
void setUncompressedCache(const String & uncompressed_cache_policy, size_t max_size_in_bytes);
|
||||
void setUncompressedCache(const String & cache_policy, size_t max_size_in_bytes, double size_ratio);
|
||||
void updateUncompressedCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
|
||||
std::shared_ptr<UncompressedCache> getUncompressedCache() const;
|
||||
void clearUncompressedCache() const;
|
||||
|
||||
void setMarkCache(const String & mark_cache_policy, size_t cache_size_in_bytes);
|
||||
void setMarkCache(const String & cache_policy, size_t max_cache_size_in_bytes, double size_ratio);
|
||||
void updateMarkCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
|
||||
std::shared_ptr<MarkCache> getMarkCache() const;
|
||||
void clearMarkCache() const;
|
||||
ThreadPool & getLoadMarksThreadpool() const;
|
||||
|
||||
void setIndexUncompressedCache(size_t max_size_in_bytes);
|
||||
void setIndexUncompressedCache(const String & cache_policy, size_t max_size_in_bytes, double size_ratio);
|
||||
void updateIndexUncompressedCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
|
||||
std::shared_ptr<UncompressedCache> getIndexUncompressedCache() const;
|
||||
void clearIndexUncompressedCache() const;
|
||||
|
||||
void setIndexMarkCache(size_t cache_size_in_bytes);
|
||||
void setIndexMarkCache(const String & cache_policy, size_t max_cache_size_in_bytes, double size_ratio);
|
||||
void updateIndexMarkCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
|
||||
std::shared_ptr<MarkCache> getIndexMarkCache() const;
|
||||
void clearIndexMarkCache() const;
|
||||
|
||||
void setMMappedFileCache(size_t cache_size_in_num_entries);
|
||||
void setMMappedFileCache(size_t max_cache_size_in_num_entries);
|
||||
void updateMMappedFileCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
|
||||
std::shared_ptr<MMappedFileCache> getMMappedFileCache() const;
|
||||
void clearMMappedFileCache() const;
|
||||
|
@ -68,13 +68,13 @@ void ServerAsynchronousMetrics::updateImpl(AsynchronousMetricValues & new_values
|
||||
{
|
||||
if (auto mark_cache = getContext()->getMarkCache())
|
||||
{
|
||||
new_values["MarkCacheBytes"] = { mark_cache->weight(), "Total size of mark cache in bytes" };
|
||||
new_values["MarkCacheBytes"] = { mark_cache->sizeInBytes(), "Total size of mark cache in bytes" };
|
||||
new_values["MarkCacheFiles"] = { mark_cache->count(), "Total number of mark files cached in the mark cache" };
|
||||
}
|
||||
|
||||
if (auto uncompressed_cache = getContext()->getUncompressedCache())
|
||||
{
|
||||
new_values["UncompressedCacheBytes"] = { uncompressed_cache->weight(),
|
||||
new_values["UncompressedCacheBytes"] = { uncompressed_cache->sizeInBytes(),
|
||||
"Total size of uncompressed cache in bytes. Uncompressed cache does not usually improve the performance and should be mostly avoided." };
|
||||
new_values["UncompressedCacheCells"] = { uncompressed_cache->count(),
|
||||
"Total number of entries in the uncompressed cache. Each entry represents a decompressed block of data. Uncompressed cache does not usually improve performance and should be mostly avoided." };
|
||||
@ -82,13 +82,13 @@ void ServerAsynchronousMetrics::updateImpl(AsynchronousMetricValues & new_values
|
||||
|
||||
if (auto index_mark_cache = getContext()->getIndexMarkCache())
|
||||
{
|
||||
new_values["IndexMarkCacheBytes"] = { index_mark_cache->weight(), "Total size of mark cache for secondary indices in bytes." };
|
||||
new_values["IndexMarkCacheBytes"] = { index_mark_cache->sizeInBytes(), "Total size of mark cache for secondary indices in bytes." };
|
||||
new_values["IndexMarkCacheFiles"] = { index_mark_cache->count(), "Total number of mark files cached in the mark cache for secondary indices." };
|
||||
}
|
||||
|
||||
if (auto index_uncompressed_cache = getContext()->getIndexUncompressedCache())
|
||||
{
|
||||
new_values["IndexUncompressedCacheBytes"] = { index_uncompressed_cache->weight(),
|
||||
new_values["IndexUncompressedCacheBytes"] = { index_uncompressed_cache->sizeInBytes(),
|
||||
"Total size of uncompressed cache in bytes for secondary indices. Uncompressed cache does not usually improve the performance and should be mostly avoided." };
|
||||
new_values["IndexUncompressedCacheCells"] = { index_uncompressed_cache->count(),
|
||||
"Total number of entries in the uncompressed cache for secondary indices. Each entry represents a decompressed block of data. Uncompressed cache does not usually improve performance and should be mostly avoided." };
|
||||
@ -104,7 +104,7 @@ void ServerAsynchronousMetrics::updateImpl(AsynchronousMetricValues & new_values
|
||||
|
||||
if (auto query_cache = getContext()->getQueryCache())
|
||||
{
|
||||
new_values["QueryCacheBytes"] = { query_cache->weight(), "Total size of the query cache in bytes." };
|
||||
new_values["QueryCacheBytes"] = { query_cache->sizeInBytes(), "Total size of the query cache in bytes." };
|
||||
new_values["QueryCacheEntries"] = { query_cache->count(), "Total number of entries in the query cache." };
|
||||
}
|
||||
|
||||
@ -136,7 +136,7 @@ void ServerAsynchronousMetrics::updateImpl(AsynchronousMetricValues & new_values
|
||||
#if USE_EMBEDDED_COMPILER
|
||||
if (auto * compiled_expression_cache = CompiledExpressionCacheFactory::instance().tryGetCache())
|
||||
{
|
||||
new_values["CompiledExpressionCacheBytes"] = { compiled_expression_cache->weight(),
|
||||
new_values["CompiledExpressionCacheBytes"] = { compiled_expression_cache->sizeInBytes(),
|
||||
"Total bytes used for the cache of JIT-compiled code." };
|
||||
new_values["CompiledExpressionCacheCount"] = { compiled_expression_cache->count(),
|
||||
"Total entries in the cache of JIT-compiled code." };
|
||||
|
@ -29,7 +29,7 @@
|
||||
#include <Storages/HDFS/ReadBufferFromHDFS.h>
|
||||
#include <Storages/HDFS/WriteBufferFromHDFS.h>
|
||||
#include <Storages/PartitionedSink.h>
|
||||
#include <Storages/getVirtualsForStorage.h>
|
||||
#include <Storages/VirtualColumnUtils.h>
|
||||
#include <Storages/checkAndGetLiteralArgument.h>
|
||||
|
||||
#include <Formats/ReadSchemaUtils.h>
|
||||
@ -50,6 +50,11 @@
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
namespace ProfileEvents
|
||||
{
|
||||
extern const Event EngineFileLikeReadFiles;
|
||||
}
|
||||
|
||||
namespace DB
|
||||
{
|
||||
namespace ErrorCodes
|
||||
@ -291,12 +296,7 @@ StorageHDFS::StorageHDFS(
|
||||
storage_metadata.setComment(comment);
|
||||
setInMemoryMetadata(storage_metadata);
|
||||
|
||||
auto default_virtuals = NamesAndTypesList{
|
||||
{"_path", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
|
||||
{"_file", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())}};
|
||||
|
||||
auto columns = storage_metadata.getSampleBlock().getNamesAndTypesList();
|
||||
virtual_columns = getVirtualsForStorage(columns, default_virtuals);
|
||||
virtual_columns = VirtualColumnUtils::getPathAndFileVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList());
|
||||
}
|
||||
|
||||
ColumnsDescription StorageHDFS::getTableStructureFromData(
|
||||
@ -363,11 +363,25 @@ ColumnsDescription StorageHDFS::getTableStructureFromData(
|
||||
class HDFSSource::DisclosedGlobIterator::Impl
|
||||
{
|
||||
public:
|
||||
Impl(ContextPtr context_, const String & uri)
|
||||
Impl(const String & uri, const ASTPtr & query, const NamesAndTypesList & virtual_columns, const ContextPtr & context)
|
||||
{
|
||||
const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri);
|
||||
uris = getPathsList(path_from_uri, uri_without_path, context_);
|
||||
auto file_progress_callback = context_->getFileProgressCallback();
|
||||
uris = getPathsList(path_from_uri, uri_without_path, context);
|
||||
ASTPtr filter_ast;
|
||||
if (!uris.empty())
|
||||
filter_ast = VirtualColumnUtils::createPathAndFileFilterAst(query, virtual_columns, uris[0].path, context);
|
||||
|
||||
if (filter_ast)
|
||||
{
|
||||
std::vector<String> paths;
|
||||
paths.reserve(uris.size());
|
||||
for (const auto & path_with_info : uris)
|
||||
paths.push_back(path_with_info.path);
|
||||
|
||||
VirtualColumnUtils::filterByPathOrFile(uris, paths, query, virtual_columns, context, filter_ast);
|
||||
}
|
||||
auto file_progress_callback = context->getFileProgressCallback();
|
||||
|
||||
for (auto & elem : uris)
|
||||
{
|
||||
elem.path = uri_without_path + elem.path;
|
||||
@ -397,9 +411,23 @@ private:
|
||||
class HDFSSource::URISIterator::Impl : WithContext
|
||||
{
|
||||
public:
|
||||
explicit Impl(const std::vector<String> & uris_, ContextPtr context_)
|
||||
explicit Impl(const std::vector<String> & uris_, const ASTPtr & query, const NamesAndTypesList & virtual_columns, const ContextPtr & context_)
|
||||
: WithContext(context_), uris(uris_), file_progress_callback(context_->getFileProgressCallback())
|
||||
{
|
||||
ASTPtr filter_ast;
|
||||
if (!uris.empty())
|
||||
filter_ast = VirtualColumnUtils::createPathAndFileFilterAst(query, virtual_columns, getPathFromUriAndUriWithoutPath(uris[0]).first, getContext());
|
||||
|
||||
if (filter_ast)
|
||||
{
|
||||
std::vector<String> paths;
|
||||
paths.reserve(uris.size());
|
||||
for (const auto & uri : uris)
|
||||
paths.push_back(getPathFromUriAndUriWithoutPath(uri).first);
|
||||
|
||||
VirtualColumnUtils::filterByPathOrFile(uris, paths, query, virtual_columns, getContext(), filter_ast);
|
||||
}
|
||||
|
||||
if (!uris.empty())
|
||||
{
|
||||
auto path_and_uri = getPathFromUriAndUriWithoutPath(uris[0]);
|
||||
@ -444,16 +472,16 @@ private:
|
||||
std::function<void(FileProgress)> file_progress_callback;
|
||||
};
|
||||
|
||||
HDFSSource::DisclosedGlobIterator::DisclosedGlobIterator(ContextPtr context_, const String & uri)
|
||||
: pimpl(std::make_shared<HDFSSource::DisclosedGlobIterator::Impl>(context_, uri)) {}
|
||||
HDFSSource::DisclosedGlobIterator::DisclosedGlobIterator(const String & uri, const ASTPtr & query, const NamesAndTypesList & virtual_columns, const ContextPtr & context)
|
||||
: pimpl(std::make_shared<HDFSSource::DisclosedGlobIterator::Impl>(uri, query, virtual_columns, context)) {}
|
||||
|
||||
StorageHDFS::PathWithInfo HDFSSource::DisclosedGlobIterator::next()
|
||||
{
|
||||
return pimpl->next();
|
||||
}
|
||||
|
||||
HDFSSource::URISIterator::URISIterator(const std::vector<String> & uris_, ContextPtr context)
|
||||
: pimpl(std::make_shared<HDFSSource::URISIterator::Impl>(uris_, context))
|
||||
HDFSSource::URISIterator::URISIterator(const std::vector<String> & uris_, const ASTPtr & query, const NamesAndTypesList & virtual_columns, const ContextPtr & context)
|
||||
: pimpl(std::make_shared<HDFSSource::URISIterator::Impl>(uris_, query, virtual_columns, context))
|
||||
{
|
||||
}
|
||||
|
||||
@ -547,6 +575,8 @@ bool HDFSSource::initialize()
|
||||
|
||||
pipeline = std::make_unique<QueryPipeline>(QueryPipelineBuilder::getPipeline(std::move(builder)));
|
||||
reader = std::make_unique<PullingPipelineExecutor>(*pipeline);
|
||||
|
||||
ProfileEvents::increment(ProfileEvents::EngineFileLikeReadFiles);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -569,29 +599,11 @@ Chunk HDFSSource::generate()
|
||||
Chunk chunk;
|
||||
if (reader->pull(chunk))
|
||||
{
|
||||
Columns columns = chunk.getColumns();
|
||||
UInt64 num_rows = chunk.getNumRows();
|
||||
size_t chunk_size = input_format->getApproxBytesReadForChunk();
|
||||
progress(num_rows, chunk_size ? chunk_size : chunk.bytes());
|
||||
|
||||
for (const auto & virtual_column : requested_virtual_columns)
|
||||
{
|
||||
if (virtual_column.name == "_path")
|
||||
{
|
||||
auto column = DataTypeLowCardinality{std::make_shared<DataTypeString>()}.createColumnConst(num_rows, current_path);
|
||||
columns.push_back(column->convertToFullColumnIfConst());
|
||||
}
|
||||
else if (virtual_column.name == "_file")
|
||||
{
|
||||
size_t last_slash_pos = current_path.find_last_of('/');
|
||||
auto file_name = current_path.substr(last_slash_pos + 1);
|
||||
|
||||
auto column = DataTypeLowCardinality{std::make_shared<DataTypeString>()}.createColumnConst(num_rows, std::move(file_name));
|
||||
columns.push_back(column->convertToFullColumnIfConst());
|
||||
}
|
||||
}
|
||||
|
||||
return Chunk(std::move(columns), num_rows);
|
||||
VirtualColumnUtils::addRequestedPathAndFileVirtualsToChunk(chunk, requested_virtual_columns, current_path);
|
||||
return chunk;
|
||||
}
|
||||
|
||||
reader.reset();
|
||||
@ -756,7 +768,7 @@ Pipe StorageHDFS::read(
|
||||
else if (is_path_with_globs)
|
||||
{
|
||||
/// Iterate through disclosed globs and make a source for each file
|
||||
auto glob_iterator = std::make_shared<HDFSSource::DisclosedGlobIterator>(context_, uris[0]);
|
||||
auto glob_iterator = std::make_shared<HDFSSource::DisclosedGlobIterator>(uris[0], query_info.query, virtual_columns, context_);
|
||||
iterator_wrapper = std::make_shared<HDFSSource::IteratorWrapper>([glob_iterator]()
|
||||
{
|
||||
return glob_iterator->next();
|
||||
@ -764,7 +776,7 @@ Pipe StorageHDFS::read(
|
||||
}
|
||||
else
|
||||
{
|
||||
auto uris_iterator = std::make_shared<HDFSSource::URISIterator>(uris, context_);
|
||||
auto uris_iterator = std::make_shared<HDFSSource::URISIterator>(uris, query_info.query, virtual_columns, context_);
|
||||
iterator_wrapper = std::make_shared<HDFSSource::IteratorWrapper>([uris_iterator]()
|
||||
{
|
||||
return uris_iterator->next();
|
||||
|
@ -126,7 +126,7 @@ public:
|
||||
class DisclosedGlobIterator
|
||||
{
|
||||
public:
|
||||
DisclosedGlobIterator(ContextPtr context_, const String & uri_);
|
||||
DisclosedGlobIterator(const String & uri_, const ASTPtr & query, const NamesAndTypesList & virtual_columns, const ContextPtr & context);
|
||||
StorageHDFS::PathWithInfo next();
|
||||
private:
|
||||
class Impl;
|
||||
@ -137,7 +137,7 @@ public:
|
||||
class URISIterator
|
||||
{
|
||||
public:
|
||||
URISIterator(const std::vector<String> & uris_, ContextPtr context);
|
||||
URISIterator(const std::vector<String> & uris_, const ASTPtr & query, const NamesAndTypesList & virtual_columns, const ContextPtr & context);
|
||||
StorageHDFS::PathWithInfo next();
|
||||
private:
|
||||
class Impl;
|
||||
|
@ -65,6 +65,8 @@ StorageHDFSCluster::StorageHDFSCluster(
|
||||
|
||||
storage_metadata.setConstraints(constraints_);
|
||||
setInMemoryMetadata(storage_metadata);
|
||||
|
||||
virtual_columns = VirtualColumnUtils::getPathAndFileVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList());
|
||||
}
|
||||
|
||||
void StorageHDFSCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context)
|
||||
@ -77,9 +79,9 @@ void StorageHDFSCluster::addColumnsStructureToQuery(ASTPtr & query, const String
|
||||
}
|
||||
|
||||
|
||||
RemoteQueryExecutor::Extension StorageHDFSCluster::getTaskIteratorExtension(ASTPtr, const ContextPtr & context) const
|
||||
RemoteQueryExecutor::Extension StorageHDFSCluster::getTaskIteratorExtension(ASTPtr query, const ContextPtr & context) const
|
||||
{
|
||||
auto iterator = std::make_shared<HDFSSource::DisclosedGlobIterator>(context, uri);
|
||||
auto iterator = std::make_shared<HDFSSource::DisclosedGlobIterator>(uri, query, virtual_columns, context);
|
||||
auto callback = std::make_shared<std::function<String()>>([iter = std::move(iterator)]() mutable -> String { return iter->next().path; });
|
||||
return RemoteQueryExecutor::Extension{.task_iterator = std::move(callback)};
|
||||
}
|
||||
|
@ -47,6 +47,7 @@ private:
|
||||
String uri;
|
||||
String format_name;
|
||||
String compression_method;
|
||||
NamesAndTypesList virtual_columns;
|
||||
};
|
||||
|
||||
|
||||
|
@ -41,11 +41,8 @@ private:
|
||||
using Base = CacheBase<UInt128, MarksInCompressedFile, UInt128TrivialHash, MarksWeightFunction>;
|
||||
|
||||
public:
|
||||
explicit MarkCache(size_t max_size_in_bytes)
|
||||
: Base(max_size_in_bytes) {}
|
||||
|
||||
MarkCache(const String & mark_cache_policy, size_t max_size_in_bytes)
|
||||
: Base(mark_cache_policy, max_size_in_bytes) {}
|
||||
MarkCache(const String & cache_policy, size_t max_size_in_bytes, double size_ratio)
|
||||
: Base(cache_policy, max_size_in_bytes, 0, size_ratio) {}
|
||||
|
||||
/// Calculate key from path to file and offset.
|
||||
static UInt128 hash(const String & path_to_file)
|
||||
|
@ -1642,7 +1642,7 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex(
|
||||
{
|
||||
if (index_mark != index_range.begin || !granule || last_index_mark != index_range.begin)
|
||||
granule = reader.read();
|
||||
// Cast to Ann condition
|
||||
|
||||
auto ann_condition = std::dynamic_pointer_cast<IMergeTreeIndexConditionApproximateNearestNeighbor>(condition);
|
||||
if (ann_condition != nullptr)
|
||||
{
|
||||
|
@ -166,7 +166,8 @@ void MergeTreeIndexAggregatorAnnoy<Distance>::update(const Block & block, size_t
|
||||
if (offsets[i + 1] - offsets[i] != size)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
|
||||
|
||||
index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(size);
|
||||
if (!index)
|
||||
index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(size);
|
||||
|
||||
/// Add all rows of block
|
||||
index->add_item(index->get_n_items(), array.data());
|
||||
@ -189,7 +190,8 @@ void MergeTreeIndexAggregatorAnnoy<Distance>::update(const Block & block, size_t
|
||||
if (data.empty())
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Tuple has 0 rows, {} rows expected", rows_read);
|
||||
|
||||
index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(data[0].size());
|
||||
if (!index)
|
||||
index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(data[0].size());
|
||||
|
||||
for (const auto & item : data)
|
||||
index->add_item(index->get_n_items(), item.data());
|
||||
|
@ -160,8 +160,8 @@ void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t
|
||||
if (offsets[i + 1] - offsets[i] != size)
|
||||
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
|
||||
|
||||
|
||||
index = std::make_shared<USearchIndexWithSerialization<Metric>>(size);
|
||||
if (!index)
|
||||
index = std::make_shared<USearchIndexWithSerialization<Metric>>(size);
|
||||
|
||||
/// Add all rows of block
|
||||
if (!index->reserve(unum::usearch::ceil2(index->size() + num_rows)))
|
||||
@ -188,7 +188,8 @@ void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t
|
||||
if (data.empty())
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Tuple has 0 rows, {} rows expected", rows_read);
|
||||
|
||||
index = std::make_shared<USearchIndexWithSerialization<Metric>>(data[0].size());
|
||||
if (!index)
|
||||
index = std::make_shared<USearchIndexWithSerialization<Metric>>(data[0].size());
|
||||
|
||||
if (!index->reserve(unum::usearch::ceil2(index->size() + data.size())))
|
||||
throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for usearch index");
|
||||
|
@ -25,7 +25,6 @@
|
||||
# include <Storages/StorageS3.h>
|
||||
# include <Storages/StorageS3Settings.h>
|
||||
# include <Storages/VirtualColumnUtils.h>
|
||||
# include <Storages/getVirtualsForStorage.h>
|
||||
|
||||
# include <Formats/FormatFactory.h>
|
||||
|
||||
@ -70,13 +69,13 @@ StorageS3QueueSource::QueueGlobIterator::QueueGlobIterator(
|
||||
const S3::Client & client_,
|
||||
const S3::URI & globbed_uri_,
|
||||
ASTPtr query,
|
||||
const Block & virtual_header,
|
||||
const NamesAndTypesList & virtual_columns,
|
||||
ContextPtr context,
|
||||
UInt64 & max_poll_size_,
|
||||
const S3Settings::RequestSettings & request_settings_)
|
||||
: max_poll_size(max_poll_size_)
|
||||
, glob_iterator(std::make_unique<StorageS3QueueSource::DisclosedGlobIterator>(
|
||||
client_, globbed_uri_, query, virtual_header, context, nullptr, request_settings_))
|
||||
client_, globbed_uri_, query, virtual_columns, context, nullptr, request_settings_))
|
||||
{
|
||||
/// todo(kssenii): remove this loop, it should not be here
|
||||
while (true)
|
||||
|
@ -44,7 +44,7 @@ public:
|
||||
const S3::Client & client_,
|
||||
const S3::URI & globbed_uri_,
|
||||
ASTPtr query,
|
||||
const Block & virtual_header,
|
||||
const NamesAndTypesList & virtual_columns,
|
||||
ContextPtr context,
|
||||
UInt64 & max_poll_size_,
|
||||
const S3Settings::RequestSettings & request_settings_ = {});
|
||||
|
@ -32,7 +32,6 @@
|
||||
# include <Storages/StorageS3.h>
|
||||
# include <Storages/StorageSnapshot.h>
|
||||
# include <Storages/VirtualColumnUtils.h>
|
||||
# include <Storages/getVirtualsForStorage.h>
|
||||
# include <Storages/prepareReadingFromFormat.h>
|
||||
# include <Common/NamedCollections/NamedCollections.h>
|
||||
|
||||
@ -171,15 +170,7 @@ StorageS3Queue::StorageS3Queue(
|
||||
}
|
||||
|
||||
files_metadata = std::make_shared<S3QueueFilesMetadata>(this, *s3queue_settings);
|
||||
|
||||
auto default_virtuals = NamesAndTypesList{
|
||||
{"_path", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
|
||||
{"_file", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())}};
|
||||
|
||||
auto columns = storage_metadata.getSampleBlock().getNamesAndTypesList();
|
||||
virtual_columns = getVirtualsForStorage(columns, default_virtuals);
|
||||
for (const auto & column : virtual_columns)
|
||||
virtual_block.insert({column.type->createColumn(), column.type, column.name});
|
||||
virtual_columns = VirtualColumnUtils::getPathAndFileVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList());
|
||||
|
||||
auto poll_thread = getContext()->getSchedulePool().createTask("S3QueueStreamingTask", [this] { threadFunc(); });
|
||||
task = std::make_shared<TaskContext>(std::move(poll_thread));
|
||||
@ -527,7 +518,7 @@ StorageS3Queue::createFileIterator(ContextPtr local_context, ASTPtr query)
|
||||
*configuration.client,
|
||||
configuration.url,
|
||||
query,
|
||||
virtual_block,
|
||||
virtual_columns,
|
||||
local_context,
|
||||
s3queue_settings->s3queue_polling_size.value,
|
||||
configuration.request_settings);
|
||||
|
@ -93,7 +93,6 @@ private:
|
||||
std::shared_ptr<S3QueueFilesMetadata> files_metadata;
|
||||
Configuration configuration;
|
||||
NamesAndTypesList virtual_columns;
|
||||
Block virtual_block;
|
||||
UInt64 reschedule_processing_interval_ms;
|
||||
|
||||
std::optional<FormatSettings> format_settings;
|
||||
|
@ -8,7 +8,6 @@
|
||||
#include <Parsers/ASTFunction.h>
|
||||
#include <Parsers/ASTInsertQuery.h>
|
||||
|
||||
#include <IO/ParallelReadBuffer.h>
|
||||
#include <IO/SharedThreadPools.h>
|
||||
|
||||
#include <Parsers/ASTCreateQuery.h>
|
||||
@ -18,7 +17,6 @@
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <re2/re2.h>
|
||||
|
||||
#include <azure/identity/managed_identity_credential.hpp>
|
||||
#include <azure/storage/common/storage_credential.hpp>
|
||||
#include <Processors/Transforms/AddingDefaultsTransform.h>
|
||||
#include <Processors/Transforms/ExtractColumnsTransform.h>
|
||||
@ -29,7 +27,6 @@
|
||||
#include <Storages/StorageSnapshot.h>
|
||||
#include <Storages/PartitionedSink.h>
|
||||
#include <Storages/VirtualColumnUtils.h>
|
||||
#include <Storages/getVirtualsForStorage.h>
|
||||
#include <Storages/StorageURL.h>
|
||||
#include <Storages/NamedCollectionsHelpers.h>
|
||||
#include <Common/parseGlobs.h>
|
||||
@ -51,6 +48,11 @@ namespace CurrentMetrics
|
||||
extern const Metric ObjectStorageAzureThreadsActive;
|
||||
}
|
||||
|
||||
namespace ProfileEvents
|
||||
{
|
||||
extern const Event EngineFileLikeReadFiles;
|
||||
}
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
@ -479,15 +481,7 @@ StorageAzureBlob::StorageAzureBlob(
|
||||
for (const auto & key : configuration.blobs_paths)
|
||||
objects.emplace_back(key);
|
||||
|
||||
auto default_virtuals = NamesAndTypesList{
|
||||
{"_path", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
|
||||
{"_file", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())}};
|
||||
|
||||
auto columns = storage_metadata.getSampleBlock().getNamesAndTypesList();
|
||||
|
||||
virtual_columns = getVirtualsForStorage(columns, default_virtuals);
|
||||
for (const auto & column : virtual_columns)
|
||||
virtual_block.insert({column.type->createColumn(), column.type, column.name});
|
||||
virtual_columns = VirtualColumnUtils::getPathAndFileVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList());
|
||||
}
|
||||
|
||||
void StorageAzureBlob::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &)
|
||||
@ -684,13 +678,13 @@ Pipe StorageAzureBlob::read(
|
||||
/// Iterate through disclosed globs and make a source for each file
|
||||
iterator_wrapper = std::make_shared<StorageAzureBlobSource::GlobIterator>(
|
||||
object_storage.get(), configuration.container, configuration.blob_path,
|
||||
query_info.query, virtual_block, local_context, nullptr, local_context->getFileProgressCallback());
|
||||
query_info.query, virtual_columns, local_context, nullptr, local_context->getFileProgressCallback());
|
||||
}
|
||||
else
|
||||
{
|
||||
iterator_wrapper = std::make_shared<StorageAzureBlobSource::KeysIterator>(
|
||||
object_storage.get(), configuration.container, configuration.blobs_paths,
|
||||
query_info.query, virtual_block, local_context, nullptr, local_context->getFileProgressCallback());
|
||||
query_info.query, virtual_columns, local_context, nullptr, local_context->getFileProgressCallback());
|
||||
}
|
||||
|
||||
auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, supportsSubsetOfColumns(), getVirtuals());
|
||||
@ -812,29 +806,12 @@ bool StorageAzureBlob::parallelizeOutputAfterReading(ContextPtr context) const
|
||||
return FormatFactory::instance().checkParallelizeOutputAfterReading(configuration.format, context);
|
||||
}
|
||||
|
||||
static void addPathToVirtualColumns(Block & block, const String & path, size_t idx)
|
||||
{
|
||||
if (block.has("_path"))
|
||||
block.getByName("_path").column->assumeMutableRef().insert(path);
|
||||
|
||||
if (block.has("_file"))
|
||||
{
|
||||
auto pos = path.find_last_of('/');
|
||||
assert(pos != std::string::npos);
|
||||
|
||||
auto file = path.substr(pos + 1);
|
||||
block.getByName("_file").column->assumeMutableRef().insert(file);
|
||||
}
|
||||
|
||||
block.getByName("_idx").column->assumeMutableRef().insert(idx);
|
||||
}
|
||||
|
||||
StorageAzureBlobSource::GlobIterator::GlobIterator(
|
||||
AzureObjectStorage * object_storage_,
|
||||
const std::string & container_,
|
||||
String blob_path_with_globs_,
|
||||
ASTPtr query_,
|
||||
const Block & virtual_header_,
|
||||
const NamesAndTypesList & virtual_columns_,
|
||||
ContextPtr context_,
|
||||
RelativePathsWithMetadata * outer_blobs_,
|
||||
std::function<void(FileProgress)> file_progress_callback_)
|
||||
@ -843,7 +820,7 @@ StorageAzureBlobSource::GlobIterator::GlobIterator(
|
||||
, container(container_)
|
||||
, blob_path_with_globs(blob_path_with_globs_)
|
||||
, query(query_)
|
||||
, virtual_header(virtual_header_)
|
||||
, virtual_columns(virtual_columns_)
|
||||
, outer_blobs(outer_blobs_)
|
||||
, file_progress_callback(file_progress_callback_)
|
||||
{
|
||||
@ -911,40 +888,28 @@ RelativePathWithMetadata StorageAzureBlobSource::GlobIterator::next()
|
||||
index = 0;
|
||||
if (!is_initialized)
|
||||
{
|
||||
createFilterAST(new_batch.front().relative_path);
|
||||
filter_ast = VirtualColumnUtils::createPathAndFileFilterAst(query, virtual_columns, fs::path(container) / new_batch.front().relative_path, getContext());
|
||||
is_initialized = true;
|
||||
}
|
||||
|
||||
if (filter_ast)
|
||||
{
|
||||
auto block = virtual_header.cloneEmpty();
|
||||
for (size_t i = 0; i < new_batch.size(); ++i)
|
||||
addPathToVirtualColumns(block, fs::path(container) / new_batch[i].relative_path, i);
|
||||
std::vector<String> paths;
|
||||
paths.reserve(new_batch.size());
|
||||
for (auto & path_with_metadata : new_batch)
|
||||
paths.push_back(fs::path(container) / path_with_metadata.relative_path);
|
||||
|
||||
VirtualColumnUtils::filterBlockWithQuery(query, block, getContext(), filter_ast);
|
||||
const auto & idxs = typeid_cast<const ColumnUInt64 &>(*block.getByName("_idx").column);
|
||||
|
||||
blobs_with_metadata.clear();
|
||||
for (UInt64 idx : idxs.getData())
|
||||
{
|
||||
if (file_progress_callback)
|
||||
file_progress_callback(FileProgress(0, new_batch[idx].metadata.size_bytes));
|
||||
blobs_with_metadata.emplace_back(std::move(new_batch[idx]));
|
||||
if (outer_blobs)
|
||||
outer_blobs->emplace_back(blobs_with_metadata.back());
|
||||
}
|
||||
VirtualColumnUtils::filterByPathOrFile(new_batch, paths, query, virtual_columns, getContext(), filter_ast);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (outer_blobs)
|
||||
outer_blobs->insert(outer_blobs->end(), new_batch.begin(), new_batch.end());
|
||||
|
||||
blobs_with_metadata = std::move(new_batch);
|
||||
if (file_progress_callback)
|
||||
{
|
||||
for (const auto & [_, info] : blobs_with_metadata)
|
||||
file_progress_callback(FileProgress(0, info.size_bytes));
|
||||
}
|
||||
if (outer_blobs)
|
||||
outer_blobs->insert(outer_blobs->end(), new_batch.begin(), new_batch.end());
|
||||
|
||||
blobs_with_metadata = std::move(new_batch);
|
||||
if (file_progress_callback)
|
||||
{
|
||||
for (const auto & [_, info] : blobs_with_metadata)
|
||||
file_progress_callback(FileProgress(0, info.size_bytes));
|
||||
}
|
||||
}
|
||||
|
||||
@ -954,28 +919,12 @@ RelativePathWithMetadata StorageAzureBlobSource::GlobIterator::next()
|
||||
return blobs_with_metadata[current_index];
|
||||
}
|
||||
|
||||
|
||||
void StorageAzureBlobSource::GlobIterator::createFilterAST(const String & any_key)
|
||||
{
|
||||
if (!query || !virtual_header)
|
||||
return;
|
||||
|
||||
/// Create a virtual block with one row to construct filter
|
||||
/// Append "idx" column as the filter result
|
||||
virtual_header.insert({ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), "_idx"});
|
||||
|
||||
auto block = virtual_header.cloneEmpty();
|
||||
addPathToVirtualColumns(block, fs::path(container) / any_key, 0);
|
||||
VirtualColumnUtils::prepareFilterBlockWithQuery(query, getContext(), block, filter_ast);
|
||||
}
|
||||
|
||||
|
||||
StorageAzureBlobSource::KeysIterator::KeysIterator(
|
||||
AzureObjectStorage * object_storage_,
|
||||
const std::string & container_,
|
||||
const Strings & keys_,
|
||||
ASTPtr query_,
|
||||
const Block & virtual_header_,
|
||||
const NamesAndTypesList & virtual_columns_,
|
||||
ContextPtr context_,
|
||||
RelativePathsWithMetadata * outer_blobs,
|
||||
std::function<void(FileProgress)> file_progress_callback)
|
||||
@ -983,37 +932,22 @@ StorageAzureBlobSource::KeysIterator::KeysIterator(
|
||||
, object_storage(object_storage_)
|
||||
, container(container_)
|
||||
, query(query_)
|
||||
, virtual_header(virtual_header_)
|
||||
, virtual_columns(virtual_columns_)
|
||||
{
|
||||
Strings all_keys = keys_;
|
||||
|
||||
/// Create a virtual block with one row to construct filter
|
||||
if (query && virtual_header && !all_keys.empty())
|
||||
ASTPtr filter_ast;
|
||||
if (!all_keys.empty())
|
||||
filter_ast = VirtualColumnUtils::createPathAndFileFilterAst(query, virtual_columns, fs::path(container) / all_keys[0], getContext());
|
||||
|
||||
if (filter_ast)
|
||||
{
|
||||
/// Append "idx" column as the filter result
|
||||
virtual_header.insert({ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), "_idx"});
|
||||
Strings paths;
|
||||
paths.reserve(all_keys.size());
|
||||
for (const auto & key : all_keys)
|
||||
paths.push_back(fs::path(container) / key);
|
||||
|
||||
auto block = virtual_header.cloneEmpty();
|
||||
addPathToVirtualColumns(block, fs::path(container) / all_keys.front(), 0);
|
||||
|
||||
VirtualColumnUtils::prepareFilterBlockWithQuery(query, getContext(), block, filter_ast);
|
||||
|
||||
if (filter_ast)
|
||||
{
|
||||
block = virtual_header.cloneEmpty();
|
||||
for (size_t i = 0; i < all_keys.size(); ++i)
|
||||
addPathToVirtualColumns(block, fs::path(container) / all_keys[i], i);
|
||||
|
||||
VirtualColumnUtils::filterBlockWithQuery(query, block, getContext(), filter_ast);
|
||||
const auto & idxs = typeid_cast<const ColumnUInt64 &>(*block.getByName("_idx").column);
|
||||
|
||||
Strings filtered_keys;
|
||||
filtered_keys.reserve(block.rows());
|
||||
for (UInt64 idx : idxs.getData())
|
||||
filtered_keys.emplace_back(std::move(all_keys[idx]));
|
||||
|
||||
all_keys = std::move(filtered_keys);
|
||||
}
|
||||
VirtualColumnUtils::filterByPathOrFile(all_keys, paths, query, virtual_columns, getContext(), filter_ast);
|
||||
}
|
||||
|
||||
for (auto && key : all_keys)
|
||||
@ -1054,22 +988,7 @@ Chunk StorageAzureBlobSource::generate()
|
||||
UInt64 num_rows = chunk.getNumRows();
|
||||
size_t chunk_size = reader.getInputFormat()->getApproxBytesReadForChunk();
|
||||
progress(num_rows, chunk_size ? chunk_size : chunk.bytes());
|
||||
|
||||
const auto & file_path = reader.getPath();
|
||||
for (const auto & virtual_column : requested_virtual_columns)
|
||||
{
|
||||
if (virtual_column.name == "_path")
|
||||
{
|
||||
chunk.addColumn(virtual_column.type->createColumnConst(num_rows, file_path)->convertToFullColumnIfConst());
|
||||
}
|
||||
else if (virtual_column.name == "_file")
|
||||
{
|
||||
size_t last_slash_pos = file_path.find_last_of('/');
|
||||
auto column = virtual_column.type->createColumnConst(num_rows, file_path.substr(last_slash_pos + 1));
|
||||
chunk.addColumn(column->convertToFullColumnIfConst());
|
||||
}
|
||||
}
|
||||
|
||||
VirtualColumnUtils::addRequestedPathAndFileVirtualsToChunk(chunk, requested_virtual_columns, reader.getPath());
|
||||
return chunk;
|
||||
}
|
||||
|
||||
@ -1181,6 +1100,8 @@ StorageAzureBlobSource::ReaderHolder StorageAzureBlobSource::createReader()
|
||||
auto pipeline = std::make_unique<QueryPipeline>(QueryPipelineBuilder::getPipeline(std::move(builder)));
|
||||
auto current_reader = std::make_unique<PullingPipelineExecutor>(*pipeline);
|
||||
|
||||
ProfileEvents::increment(ProfileEvents::EngineFileLikeReadFiles);
|
||||
|
||||
return ReaderHolder{fs::path(container) / current_key, std::move(read_buf), std::move(input_format), std::move(pipeline), std::move(current_reader)};
|
||||
}
|
||||
|
||||
@ -1225,12 +1146,12 @@ ColumnsDescription StorageAzureBlob::getTableStructureFromData(
|
||||
else if (configuration.withGlobs())
|
||||
{
|
||||
file_iterator = std::make_shared<StorageAzureBlobSource::GlobIterator>(
|
||||
object_storage, configuration.container, configuration.blob_path, nullptr, Block{}, ctx, &read_keys);
|
||||
object_storage, configuration.container, configuration.blob_path, nullptr, NamesAndTypesList{}, ctx, &read_keys);
|
||||
}
|
||||
else
|
||||
{
|
||||
file_iterator = std::make_shared<StorageAzureBlobSource::KeysIterator>(
|
||||
object_storage, configuration.container, configuration.blobs_paths, nullptr, Block{}, ctx, &read_keys);
|
||||
object_storage, configuration.container, configuration.blobs_paths, nullptr, NamesAndTypesList{}, ctx, &read_keys);
|
||||
}
|
||||
|
||||
std::optional<ColumnsDescription> columns_from_cache;
|
||||
|
@ -118,7 +118,6 @@ private:
|
||||
Configuration configuration;
|
||||
std::unique_ptr<AzureObjectStorage> object_storage;
|
||||
NamesAndTypesList virtual_columns;
|
||||
Block virtual_block;
|
||||
|
||||
const bool distributed_processing;
|
||||
std::optional<FormatSettings> format_settings;
|
||||
@ -163,7 +162,7 @@ public:
|
||||
const std::string & container_,
|
||||
String blob_path_with_globs_,
|
||||
ASTPtr query_,
|
||||
const Block & virtual_header_,
|
||||
const NamesAndTypesList & virtual_columns_,
|
||||
ContextPtr context_,
|
||||
RelativePathsWithMetadata * outer_blobs_,
|
||||
std::function<void(FileProgress)> file_progress_callback_ = {});
|
||||
@ -177,7 +176,7 @@ public:
|
||||
String blob_path_with_globs;
|
||||
ASTPtr query;
|
||||
ASTPtr filter_ast;
|
||||
Block virtual_header;
|
||||
NamesAndTypesList virtual_columns;
|
||||
|
||||
size_t index = 0;
|
||||
|
||||
@ -219,7 +218,7 @@ public:
|
||||
const std::string & container_,
|
||||
const Strings & keys_,
|
||||
ASTPtr query_,
|
||||
const Block & virtual_header_,
|
||||
const NamesAndTypesList & virtual_columns_,
|
||||
ContextPtr context_,
|
||||
RelativePathsWithMetadata * outer_blobs,
|
||||
std::function<void(FileProgress)> file_progress_callback = {});
|
||||
@ -233,8 +232,7 @@ public:
|
||||
RelativePathsWithMetadata keys;
|
||||
|
||||
ASTPtr query;
|
||||
ASTPtr filter_ast;
|
||||
Block virtual_header;
|
||||
NamesAndTypesList virtual_columns;
|
||||
|
||||
std::atomic<size_t> index = 0;
|
||||
};
|
||||
|
@ -4,8 +4,6 @@
|
||||
|
||||
#if USE_AZURE_BLOB_STORAGE
|
||||
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <IO/ConnectionTimeouts.h>
|
||||
#include <Interpreters/AddDefaultDatabaseVisitor.h>
|
||||
#include <Interpreters/InterpreterSelectQuery.h>
|
||||
#include <Processors/Sources/RemoteSource.h>
|
||||
@ -13,10 +11,9 @@
|
||||
#include <QueryPipeline/RemoteQueryExecutor.h>
|
||||
#include <Storages/IStorage.h>
|
||||
#include <Storages/StorageURL.h>
|
||||
#include <Storages/SelectQueryInfo.h>
|
||||
#include <Storages/StorageDictionary.h>
|
||||
#include <Storages/extractTableFunctionArgumentsFromSelectQuery.h>
|
||||
#include <Storages/getVirtualsForStorage.h>
|
||||
#include <Storages/VirtualColumnUtils.h>
|
||||
#include <Common/Exception.h>
|
||||
#include <Parsers/queryToString.h>
|
||||
#include <TableFunctions/TableFunctionAzureBlobStorageCluster.h>
|
||||
@ -60,14 +57,7 @@ StorageAzureBlobCluster::StorageAzureBlobCluster(
|
||||
storage_metadata.setConstraints(constraints_);
|
||||
setInMemoryMetadata(storage_metadata);
|
||||
|
||||
auto default_virtuals = NamesAndTypesList{
|
||||
{"_path", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
|
||||
{"_file", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())}};
|
||||
|
||||
auto columns = storage_metadata.getSampleBlock().getNamesAndTypesList();
|
||||
virtual_columns = getVirtualsForStorage(columns, default_virtuals);
|
||||
for (const auto & column : virtual_columns)
|
||||
virtual_block.insert({column.type->createColumn(), column.type, column.name});
|
||||
virtual_columns = VirtualColumnUtils::getPathAndFileVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList());
|
||||
}
|
||||
|
||||
void StorageAzureBlobCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context)
|
||||
@ -83,7 +73,7 @@ RemoteQueryExecutor::Extension StorageAzureBlobCluster::getTaskIteratorExtension
|
||||
{
|
||||
auto iterator = std::make_shared<StorageAzureBlobSource::GlobIterator>(
|
||||
object_storage.get(), configuration.container, configuration.blob_path,
|
||||
query, virtual_block, context, nullptr);
|
||||
query, virtual_columns, context, nullptr);
|
||||
auto callback = std::make_shared<std::function<String()>>([iterator]() mutable -> String{ return iterator->next().relative_path; });
|
||||
return RemoteQueryExecutor::Extension{ .task_iterator = std::move(callback) };
|
||||
}
|
||||
|
@ -47,7 +47,6 @@ private:
|
||||
|
||||
StorageAzureBlob::Configuration configuration;
|
||||
NamesAndTypesList virtual_columns;
|
||||
Block virtual_block;
|
||||
std::unique_ptr<AzureObjectStorage> object_storage;
|
||||
};
|
||||
|
||||
|
@ -6,11 +6,13 @@
|
||||
#include <Storages/Distributed/DistributedAsyncInsertSource.h>
|
||||
#include <Storages/checkAndGetLiteralArgument.h>
|
||||
#include <Storages/prepareReadingFromFormat.h>
|
||||
#include <Storages/VirtualColumnUtils.h>
|
||||
|
||||
#include <Interpreters/Context.h>
|
||||
#include <Interpreters/evaluateConstantExpression.h>
|
||||
|
||||
#include <Parsers/ASTCreateQuery.h>
|
||||
#include <Parsers/ASTSelectQuery.h>
|
||||
#include <Parsers/ASTIdentifier_fwd.h>
|
||||
#include <Parsers/ASTInsertQuery.h>
|
||||
#include <Parsers/ASTLiteral.h>
|
||||
@ -64,6 +66,7 @@ namespace ProfileEvents
|
||||
extern const Event CreatedReadBufferOrdinary;
|
||||
extern const Event CreatedReadBufferMMap;
|
||||
extern const Event CreatedReadBufferMMapFailed;
|
||||
extern const Event EngineFileLikeReadFiles;
|
||||
}
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
@ -699,6 +702,8 @@ void StorageFile::setStorageMetadata(CommonArguments args)
|
||||
storage_metadata.setConstraints(args.constraints);
|
||||
storage_metadata.setComment(args.comment);
|
||||
setInMemoryMetadata(storage_metadata);
|
||||
|
||||
virtual_columns = VirtualColumnUtils::getPathAndFileVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList());
|
||||
}
|
||||
|
||||
|
||||
@ -721,9 +726,20 @@ public:
|
||||
{
|
||||
public:
|
||||
explicit FilesIterator(
|
||||
const Strings & files_, std::vector<std::string> archives_, const IArchiveReader::NameFilter & name_filter_)
|
||||
const Strings & files_,
|
||||
std::vector<std::string> archives_,
|
||||
const IArchiveReader::NameFilter & name_filter_,
|
||||
ASTPtr query,
|
||||
const NamesAndTypesList & virtual_columns,
|
||||
ContextPtr context_)
|
||||
: files(files_), archives(std::move(archives_)), name_filter(name_filter_)
|
||||
{
|
||||
ASTPtr filter_ast;
|
||||
if (archives.empty() && !files.empty() && !files[0].empty())
|
||||
filter_ast = VirtualColumnUtils::createPathAndFileFilterAst(query, virtual_columns, files[0], context_);
|
||||
|
||||
if (filter_ast)
|
||||
VirtualColumnUtils::filterByPathOrFile(files, files, query, virtual_columns, context_, filter_ast);
|
||||
}
|
||||
|
||||
String next()
|
||||
@ -992,8 +1008,9 @@ public:
|
||||
});
|
||||
|
||||
pipeline = std::make_unique<QueryPipeline>(QueryPipelineBuilder::getPipeline(std::move(builder)));
|
||||
|
||||
reader = std::make_unique<PullingPipelineExecutor>(*pipeline);
|
||||
|
||||
ProfileEvents::increment(ProfileEvents::EngineFileLikeReadFiles);
|
||||
}
|
||||
|
||||
Chunk chunk;
|
||||
@ -1006,21 +1023,7 @@ public:
|
||||
progress(num_rows, chunk_size ? chunk_size : chunk.bytes());
|
||||
|
||||
/// Enrich with virtual columns.
|
||||
|
||||
for (const auto & virtual_column : requested_virtual_columns)
|
||||
{
|
||||
if (virtual_column.name == "_path")
|
||||
{
|
||||
chunk.addColumn(virtual_column.type->createColumnConst(num_rows, current_path)->convertToFullColumnIfConst());
|
||||
}
|
||||
else if (virtual_column.name == "_file")
|
||||
{
|
||||
size_t last_slash_pos = current_path.find_last_of('/');
|
||||
auto file_name = current_path.substr(last_slash_pos + 1);
|
||||
chunk.addColumn(virtual_column.type->createColumnConst(num_rows, file_name)->convertToFullColumnIfConst());
|
||||
}
|
||||
}
|
||||
|
||||
VirtualColumnUtils::addRequestedPathAndFileVirtualsToChunk(chunk, requested_virtual_columns, current_path);
|
||||
return chunk;
|
||||
}
|
||||
|
||||
@ -1120,7 +1123,7 @@ Pipe StorageFile::read(
|
||||
}
|
||||
}
|
||||
|
||||
auto files_iterator = std::make_shared<StorageFileSource::FilesIterator>(paths, paths_to_archive, std::move(filter));
|
||||
auto files_iterator = std::make_shared<StorageFileSource::FilesIterator>(paths, paths_to_archive, std::move(filter), query_info.query, virtual_columns, context);
|
||||
auto this_ptr = std::static_pointer_cast<StorageFile>(shared_from_this());
|
||||
|
||||
size_t num_streams = max_num_streams;
|
||||
@ -1679,14 +1682,6 @@ void registerStorageFile(StorageFactory & factory)
|
||||
storage_features);
|
||||
}
|
||||
|
||||
|
||||
NamesAndTypesList StorageFile::getVirtuals() const
|
||||
{
|
||||
return NamesAndTypesList{
|
||||
{"_path", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
|
||||
{"_file", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())}};
|
||||
}
|
||||
|
||||
SchemaCache & StorageFile::getSchemaCache(const ContextPtr & context)
|
||||
{
|
||||
static SchemaCache schema_cache(context->getConfigRef().getUInt("schema_inference_cache_max_elements_for_file", DEFAULT_SCHEMA_CACHE_ELEMENTS));
|
||||
|
@ -65,7 +65,7 @@ public:
|
||||
bool storesDataOnDisk() const override;
|
||||
Strings getDataPaths() const override;
|
||||
|
||||
NamesAndTypesList getVirtuals() const override;
|
||||
NamesAndTypesList getVirtuals() const override { return virtual_columns; }
|
||||
|
||||
static Strings getPathsList(const String & table_path, const String & user_files_path, ContextPtr context, size_t & total_bytes_to_read);
|
||||
|
||||
@ -154,6 +154,8 @@ private:
|
||||
std::atomic<int32_t> readers_counter = 0;
|
||||
FileRenamer file_renamer;
|
||||
bool was_renamed = false;
|
||||
|
||||
NamesAndTypesList virtual_columns;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -25,7 +25,6 @@
|
||||
#include <Storages/StorageSnapshot.h>
|
||||
#include <Storages/PartitionedSink.h>
|
||||
#include <Storages/VirtualColumnUtils.h>
|
||||
#include <Storages/getVirtualsForStorage.h>
|
||||
#include <Storages/checkAndGetLiteralArgument.h>
|
||||
#include <Storages/StorageURL.h>
|
||||
#include <Storages/NamedCollectionsHelpers.h>
|
||||
@ -77,6 +76,7 @@ namespace ProfileEvents
|
||||
{
|
||||
extern const Event S3DeleteObjects;
|
||||
extern const Event S3ListObjects;
|
||||
extern const Event EngineFileLikeReadFiles;
|
||||
}
|
||||
|
||||
namespace DB
|
||||
@ -121,23 +121,6 @@ namespace ErrorCodes
|
||||
class IOutputFormat;
|
||||
using OutputFormatPtr = std::shared_ptr<IOutputFormat>;
|
||||
|
||||
static void addPathToVirtualColumns(Block & block, const String & path, size_t idx)
|
||||
{
|
||||
if (block.has("_path"))
|
||||
block.getByName("_path").column->assumeMutableRef().insert(path);
|
||||
|
||||
if (block.has("_file"))
|
||||
{
|
||||
auto pos = path.find_last_of('/');
|
||||
assert(pos != std::string::npos);
|
||||
|
||||
auto file = path.substr(pos + 1);
|
||||
block.getByName("_file").column->assumeMutableRef().insert(file);
|
||||
}
|
||||
|
||||
block.getByName("_idx").column->assumeMutableRef().insert(idx);
|
||||
}
|
||||
|
||||
class StorageS3Source::DisclosedGlobIterator::Impl : WithContext
|
||||
{
|
||||
public:
|
||||
@ -145,7 +128,7 @@ public:
|
||||
const S3::Client & client_,
|
||||
const S3::URI & globbed_uri_,
|
||||
ASTPtr & query_,
|
||||
const Block & virtual_header_,
|
||||
const NamesAndTypesList & virtual_columns_,
|
||||
ContextPtr context_,
|
||||
KeysWithInfo * read_keys_,
|
||||
const S3Settings::RequestSettings & request_settings_,
|
||||
@ -154,7 +137,7 @@ public:
|
||||
, client(client_.clone())
|
||||
, globbed_uri(globbed_uri_)
|
||||
, query(query_)
|
||||
, virtual_header(virtual_header_)
|
||||
, virtual_columns(virtual_columns_)
|
||||
, read_keys(read_keys_)
|
||||
, request_settings(request_settings_)
|
||||
, list_objects_pool(CurrentMetrics::StorageS3Threads, CurrentMetrics::StorageS3ThreadsActive, 1)
|
||||
@ -293,35 +276,26 @@ private:
|
||||
|
||||
if (!is_initialized)
|
||||
{
|
||||
createFilterAST(temp_buffer.front().key);
|
||||
filter_ast = VirtualColumnUtils::createPathAndFileFilterAst(query, virtual_columns, fs::path(globbed_uri.bucket) / temp_buffer.front().key, getContext());
|
||||
is_initialized = true;
|
||||
}
|
||||
|
||||
if (filter_ast)
|
||||
{
|
||||
auto block = virtual_header.cloneEmpty();
|
||||
for (size_t i = 0; i < temp_buffer.size(); ++i)
|
||||
addPathToVirtualColumns(block, fs::path(globbed_uri.bucket) / temp_buffer[i].key, i);
|
||||
std::vector<String> paths;
|
||||
paths.reserve(temp_buffer.size());
|
||||
for (const auto & key_with_info : temp_buffer)
|
||||
paths.push_back(fs::path(globbed_uri.bucket) / key_with_info.key);
|
||||
|
||||
VirtualColumnUtils::filterBlockWithQuery(query, block, getContext(), filter_ast);
|
||||
const auto & idxs = typeid_cast<const ColumnUInt64 &>(*block.getByName("_idx").column);
|
||||
|
||||
buffer.reserve(block.rows());
|
||||
for (UInt64 idx : idxs.getData())
|
||||
{
|
||||
if (file_progress_callback)
|
||||
file_progress_callback(FileProgress(0, temp_buffer[idx].info->size));
|
||||
buffer.emplace_back(std::move(temp_buffer[idx]));
|
||||
}
|
||||
VirtualColumnUtils::filterByPathOrFile(temp_buffer, paths, query, virtual_columns, getContext(), filter_ast);
|
||||
}
|
||||
else
|
||||
|
||||
buffer = std::move(temp_buffer);
|
||||
|
||||
if (file_progress_callback)
|
||||
{
|
||||
buffer = std::move(temp_buffer);
|
||||
if (file_progress_callback)
|
||||
{
|
||||
for (const auto & [_, info] : buffer)
|
||||
file_progress_callback(FileProgress(0, info->size));
|
||||
}
|
||||
for (const auto & [_, info] : buffer)
|
||||
file_progress_callback(FileProgress(0, info->size));
|
||||
}
|
||||
|
||||
/// Set iterator only after the whole batch is processed
|
||||
@ -331,20 +305,6 @@ private:
|
||||
read_keys->insert(read_keys->end(), buffer.begin(), buffer.end());
|
||||
}
|
||||
|
||||
void createFilterAST(const String & any_key)
|
||||
{
|
||||
if (!query || !virtual_header)
|
||||
return;
|
||||
|
||||
/// Create a virtual block with one row to construct filter
|
||||
/// Append "idx" column as the filter result
|
||||
virtual_header.insert({ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), "_idx"});
|
||||
|
||||
auto block = virtual_header.cloneEmpty();
|
||||
addPathToVirtualColumns(block, fs::path(globbed_uri.bucket) / any_key, 0);
|
||||
VirtualColumnUtils::prepareFilterBlockWithQuery(query, getContext(), block, filter_ast);
|
||||
}
|
||||
|
||||
std::future<ListObjectsOutcome> listObjectsAsync()
|
||||
{
|
||||
return list_objects_scheduler([this]
|
||||
@ -368,7 +328,7 @@ private:
|
||||
std::unique_ptr<S3::Client> client;
|
||||
S3::URI globbed_uri;
|
||||
ASTPtr query;
|
||||
Block virtual_header;
|
||||
NamesAndTypesList virtual_columns;
|
||||
bool is_initialized{false};
|
||||
ASTPtr filter_ast;
|
||||
std::unique_ptr<re2::RE2> matcher;
|
||||
@ -389,12 +349,12 @@ StorageS3Source::DisclosedGlobIterator::DisclosedGlobIterator(
|
||||
const S3::Client & client_,
|
||||
const S3::URI & globbed_uri_,
|
||||
ASTPtr query,
|
||||
const Block & virtual_header,
|
||||
const NamesAndTypesList & virtual_columns_,
|
||||
ContextPtr context,
|
||||
KeysWithInfo * read_keys_,
|
||||
const S3Settings::RequestSettings & request_settings_,
|
||||
std::function<void(FileProgress)> file_progress_callback_)
|
||||
: pimpl(std::make_shared<StorageS3Source::DisclosedGlobIterator::Impl>(client_, globbed_uri_, query, virtual_header, context, read_keys_, request_settings_, file_progress_callback_))
|
||||
: pimpl(std::make_shared<StorageS3Source::DisclosedGlobIterator::Impl>(client_, globbed_uri_, query, virtual_columns_, context, read_keys_, request_settings_, file_progress_callback_))
|
||||
{
|
||||
}
|
||||
|
||||
@ -413,7 +373,7 @@ public:
|
||||
const String & bucket_,
|
||||
const S3Settings::RequestSettings & request_settings_,
|
||||
ASTPtr query_,
|
||||
const Block & virtual_header_,
|
||||
const NamesAndTypesList & virtual_columns_,
|
||||
ContextPtr context_,
|
||||
KeysWithInfo * read_keys_,
|
||||
std::function<void(FileProgress)> file_progress_callback_)
|
||||
@ -424,37 +384,21 @@ public:
|
||||
, bucket(bucket_)
|
||||
, request_settings(request_settings_)
|
||||
, query(query_)
|
||||
, virtual_header(virtual_header_)
|
||||
, virtual_columns(virtual_columns_)
|
||||
, file_progress_callback(file_progress_callback_)
|
||||
{
|
||||
/// Create a virtual block with one row to construct filter
|
||||
if (query && virtual_header && !keys.empty())
|
||||
ASTPtr filter_ast;
|
||||
if (!keys.empty())
|
||||
filter_ast = VirtualColumnUtils::createPathAndFileFilterAst(query, virtual_columns, fs::path(bucket) / keys[0], getContext());
|
||||
|
||||
if (filter_ast)
|
||||
{
|
||||
/// Append "idx" column as the filter result
|
||||
virtual_header.insert({ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), "_idx"});
|
||||
std::vector<String> paths;
|
||||
paths.reserve(keys.size());
|
||||
for (const auto & key : keys)
|
||||
paths.push_back(fs::path(bucket) / key);
|
||||
|
||||
auto block = virtual_header.cloneEmpty();
|
||||
addPathToVirtualColumns(block, fs::path(bucket) / keys.front(), 0);
|
||||
|
||||
ASTPtr filter_ast;
|
||||
VirtualColumnUtils::prepareFilterBlockWithQuery(query, getContext(), block, filter_ast);
|
||||
|
||||
if (filter_ast)
|
||||
{
|
||||
block = virtual_header.cloneEmpty();
|
||||
for (size_t i = 0; i < keys.size(); ++i)
|
||||
addPathToVirtualColumns(block, fs::path(bucket) / keys[i], i);
|
||||
|
||||
VirtualColumnUtils::filterBlockWithQuery(query, block, getContext(), filter_ast);
|
||||
const auto & idxs = typeid_cast<const ColumnUInt64 &>(*block.getByName("_idx").column);
|
||||
|
||||
Strings filtered_keys;
|
||||
filtered_keys.reserve(block.rows());
|
||||
for (UInt64 idx : idxs.getData())
|
||||
filtered_keys.emplace_back(std::move(keys[idx]));
|
||||
|
||||
keys = std::move(filtered_keys);
|
||||
}
|
||||
VirtualColumnUtils::filterByPathOrFile(keys, paths, query, virtual_columns, getContext(), filter_ast);
|
||||
}
|
||||
|
||||
if (read_keys_)
|
||||
@ -488,7 +432,7 @@ private:
|
||||
String bucket;
|
||||
S3Settings::RequestSettings request_settings;
|
||||
ASTPtr query;
|
||||
Block virtual_header;
|
||||
NamesAndTypesList virtual_columns;
|
||||
std::function<void(FileProgress)> file_progress_callback;
|
||||
};
|
||||
|
||||
@ -499,13 +443,13 @@ StorageS3Source::KeysIterator::KeysIterator(
|
||||
const String & bucket_,
|
||||
const S3Settings::RequestSettings & request_settings_,
|
||||
ASTPtr query,
|
||||
const Block & virtual_header,
|
||||
const NamesAndTypesList & virtual_columns_,
|
||||
ContextPtr context,
|
||||
KeysWithInfo * read_keys,
|
||||
std::function<void(FileProgress)> file_progress_callback_)
|
||||
: pimpl(std::make_shared<StorageS3Source::KeysIterator::Impl>(
|
||||
client_, version_id_, keys_, bucket_, request_settings_,
|
||||
query, virtual_header, context, read_keys, file_progress_callback_))
|
||||
query, virtual_columns_, context, read_keys, file_progress_callback_))
|
||||
{
|
||||
}
|
||||
|
||||
@ -615,6 +559,8 @@ StorageS3Source::ReaderHolder StorageS3Source::createReader()
|
||||
auto pipeline = std::make_unique<QueryPipeline>(QueryPipelineBuilder::getPipeline(std::move(builder)));
|
||||
auto current_reader = std::make_unique<PullingPipelineExecutor>(*pipeline);
|
||||
|
||||
ProfileEvents::increment(ProfileEvents::EngineFileLikeReadFiles);
|
||||
|
||||
return ReaderHolder{key_with_info.key, bucket, std::move(read_buf), std::move(input_format), std::move(pipeline), std::move(current_reader)};
|
||||
}
|
||||
|
||||
@ -716,22 +662,7 @@ Chunk StorageS3Source::generate()
|
||||
UInt64 num_rows = chunk.getNumRows();
|
||||
size_t chunk_size = reader.getInputFormat()->getApproxBytesReadForChunk();
|
||||
progress(num_rows, chunk_size ? chunk_size : chunk.bytes());
|
||||
|
||||
const auto & file_path = reader.getPath();
|
||||
for (const auto & virtual_column : requested_virtual_columns)
|
||||
{
|
||||
if (virtual_column.name == "_path")
|
||||
{
|
||||
chunk.addColumn(virtual_column.type->createColumnConst(num_rows, file_path));
|
||||
}
|
||||
else if (virtual_column.name == "_file")
|
||||
{
|
||||
size_t last_slash_pos = file_path.find_last_of('/');
|
||||
auto column = virtual_column.type->createColumnConst(num_rows, file_path.substr(last_slash_pos + 1));
|
||||
chunk.addColumn(column);
|
||||
}
|
||||
}
|
||||
|
||||
VirtualColumnUtils::addRequestedPathAndFileVirtualsToChunk(chunk, requested_virtual_columns, reader.getPath());
|
||||
return chunk;
|
||||
}
|
||||
|
||||
@ -976,14 +907,7 @@ StorageS3::StorageS3(
|
||||
storage_metadata.setComment(comment);
|
||||
setInMemoryMetadata(storage_metadata);
|
||||
|
||||
auto default_virtuals = NamesAndTypesList{
|
||||
{"_path", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
|
||||
{"_file", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())}};
|
||||
|
||||
auto columns = storage_metadata.getSampleBlock().getNamesAndTypesList();
|
||||
virtual_columns = getVirtualsForStorage(columns, default_virtuals);
|
||||
for (const auto & column : virtual_columns)
|
||||
virtual_block.insert({column.type->createColumn(), column.type, column.name});
|
||||
virtual_columns = VirtualColumnUtils::getPathAndFileVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList());
|
||||
}
|
||||
|
||||
std::shared_ptr<StorageS3Source::IIterator> StorageS3::createFileIterator(
|
||||
@ -991,7 +915,7 @@ std::shared_ptr<StorageS3Source::IIterator> StorageS3::createFileIterator(
|
||||
bool distributed_processing,
|
||||
ContextPtr local_context,
|
||||
ASTPtr query,
|
||||
const Block & virtual_block,
|
||||
const NamesAndTypesList & virtual_columns,
|
||||
KeysWithInfo * read_keys,
|
||||
std::function<void(FileProgress)> file_progress_callback)
|
||||
{
|
||||
@ -1003,7 +927,7 @@ std::shared_ptr<StorageS3Source::IIterator> StorageS3::createFileIterator(
|
||||
{
|
||||
/// Iterate through disclosed globs and make a source for each file
|
||||
return std::make_shared<StorageS3Source::DisclosedGlobIterator>(
|
||||
*configuration.client, configuration.url, query, virtual_block,
|
||||
*configuration.client, configuration.url, query, virtual_columns,
|
||||
local_context, read_keys, configuration.request_settings, file_progress_callback);
|
||||
}
|
||||
else
|
||||
@ -1011,7 +935,7 @@ std::shared_ptr<StorageS3Source::IIterator> StorageS3::createFileIterator(
|
||||
return std::make_shared<StorageS3Source::KeysIterator>(
|
||||
*configuration.client, configuration.url.version_id, configuration.keys,
|
||||
configuration.url.bucket, configuration.request_settings, query,
|
||||
virtual_block, local_context, read_keys, file_progress_callback);
|
||||
virtual_columns, local_context, read_keys, file_progress_callback);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1047,7 +971,7 @@ Pipe StorageS3::read(
|
||||
Pipes pipes;
|
||||
|
||||
std::shared_ptr<StorageS3Source::IIterator> iterator_wrapper = createFileIterator(
|
||||
query_configuration, distributed_processing, local_context, query_info.query, virtual_block, nullptr, local_context->getFileProgressCallback());
|
||||
query_configuration, distributed_processing, local_context, query_info.query, virtual_columns, nullptr, local_context->getFileProgressCallback());
|
||||
|
||||
auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, supportsSubsetOfColumns(), getVirtuals());
|
||||
bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty())
|
||||
|
@ -69,7 +69,7 @@ public:
|
||||
const S3::Client & client_,
|
||||
const S3::URI & globbed_uri_,
|
||||
ASTPtr query,
|
||||
const Block & virtual_header,
|
||||
const NamesAndTypesList & virtual_columns,
|
||||
ContextPtr context,
|
||||
KeysWithInfo * read_keys_ = nullptr,
|
||||
const S3Settings::RequestSettings & request_settings_ = {},
|
||||
@ -93,7 +93,7 @@ public:
|
||||
const String & bucket_,
|
||||
const S3Settings::RequestSettings & request_settings_,
|
||||
ASTPtr query,
|
||||
const Block & virtual_header,
|
||||
const NamesAndTypesList & virtual_columns,
|
||||
ContextPtr context,
|
||||
KeysWithInfo * read_keys = nullptr,
|
||||
std::function<void(FileProgress)> progress_callback_ = {});
|
||||
@ -338,7 +338,6 @@ private:
|
||||
Configuration configuration;
|
||||
std::mutex configuration_update_mutex;
|
||||
NamesAndTypesList virtual_columns;
|
||||
Block virtual_block;
|
||||
|
||||
String name;
|
||||
const bool distributed_processing;
|
||||
@ -352,7 +351,7 @@ private:
|
||||
bool distributed_processing,
|
||||
ContextPtr local_context,
|
||||
ASTPtr query,
|
||||
const Block & virtual_block,
|
||||
const NamesAndTypesList & virtual_columns,
|
||||
KeysWithInfo * read_keys = nullptr,
|
||||
std::function<void(FileProgress)> progress_callback = {});
|
||||
|
||||
|
@ -16,7 +16,7 @@
|
||||
#include <Storages/SelectQueryInfo.h>
|
||||
#include <Storages/StorageDictionary.h>
|
||||
#include <Storages/extractTableFunctionArgumentsFromSelectQuery.h>
|
||||
#include <Storages/getVirtualsForStorage.h>
|
||||
#include <Storages/VirtualColumnUtils.h>
|
||||
#include <Common/Exception.h>
|
||||
#include <Parsers/queryToString.h>
|
||||
#include <TableFunctions/TableFunctionS3Cluster.h>
|
||||
@ -61,14 +61,7 @@ StorageS3Cluster::StorageS3Cluster(
|
||||
storage_metadata.setConstraints(constraints_);
|
||||
setInMemoryMetadata(storage_metadata);
|
||||
|
||||
auto default_virtuals = NamesAndTypesList{
|
||||
{"_path", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
|
||||
{"_file", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())}};
|
||||
|
||||
auto columns = storage_metadata.getSampleBlock().getNamesAndTypesList();
|
||||
virtual_columns = getVirtualsForStorage(columns, default_virtuals);
|
||||
for (const auto & column : virtual_columns)
|
||||
virtual_block.insert({column.type->createColumn(), column.type, column.name});
|
||||
virtual_columns = VirtualColumnUtils::getPathAndFileVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList());
|
||||
}
|
||||
|
||||
void StorageS3Cluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context)
|
||||
@ -88,7 +81,7 @@ void StorageS3Cluster::updateConfigurationIfChanged(ContextPtr local_context)
|
||||
RemoteQueryExecutor::Extension StorageS3Cluster::getTaskIteratorExtension(ASTPtr query, const ContextPtr & context) const
|
||||
{
|
||||
auto iterator = std::make_shared<StorageS3Source::DisclosedGlobIterator>(
|
||||
*s3_configuration.client, s3_configuration.url, query, virtual_block, context, nullptr, s3_configuration.request_settings, context->getFileProgressCallback());
|
||||
*s3_configuration.client, s3_configuration.url, query, virtual_columns, context, nullptr, s3_configuration.request_settings, context->getFileProgressCallback());
|
||||
auto callback = std::make_shared<std::function<String()>>([iterator]() mutable -> String { return iterator->next().key; });
|
||||
return RemoteQueryExecutor::Extension{ .task_iterator = std::move(callback) };
|
||||
}
|
||||
|
@ -50,7 +50,6 @@ private:
|
||||
|
||||
StorageS3::Configuration s3_configuration;
|
||||
NamesAndTypesList virtual_columns;
|
||||
Block virtual_block;
|
||||
};
|
||||
|
||||
|
||||
|
@ -28,6 +28,7 @@
|
||||
#include <Common/ThreadStatus.h>
|
||||
#include <Common/parseRemoteDescription.h>
|
||||
#include <Common/NamedCollections/NamedCollections.h>
|
||||
#include <Common/ProfileEvents.h>
|
||||
#include <IO/ReadWriteBufferFromHTTP.h>
|
||||
#include <IO/HTTPHeaderEntries.h>
|
||||
|
||||
@ -39,6 +40,10 @@
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypeLowCardinality.h>
|
||||
|
||||
namespace ProfileEvents
|
||||
{
|
||||
extern const Event EngineFileLikeReadFiles;
|
||||
}
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -126,6 +131,8 @@ IStorageURLBase::IStorageURLBase(
|
||||
storage_metadata.setConstraints(constraints_);
|
||||
storage_metadata.setComment(comment);
|
||||
setInMemoryMetadata(storage_metadata);
|
||||
|
||||
virtual_columns = VirtualColumnUtils::getPathAndFileVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList());
|
||||
}
|
||||
|
||||
|
||||
@ -159,9 +166,23 @@ namespace
|
||||
class StorageURLSource::DisclosedGlobIterator::Impl
|
||||
{
|
||||
public:
|
||||
Impl(const String & uri, size_t max_addresses)
|
||||
Impl(const String & uri_, size_t max_addresses, const ASTPtr & query, const NamesAndTypesList & virtual_columns, const ContextPtr & context)
|
||||
{
|
||||
uris = parseRemoteDescription(uri, 0, uri.size(), ',', max_addresses);
|
||||
uris = parseRemoteDescription(uri_, 0, uri_.size(), ',', max_addresses);
|
||||
|
||||
ASTPtr filter_ast;
|
||||
if (!uris.empty())
|
||||
filter_ast = VirtualColumnUtils::createPathAndFileFilterAst(query, virtual_columns, Poco::URI(uris[0]).getPath(), context);
|
||||
|
||||
if (filter_ast)
|
||||
{
|
||||
std::vector<String> paths;
|
||||
paths.reserve(uris.size());
|
||||
for (const auto & uri : uris)
|
||||
paths.push_back(Poco::URI(uri).getPath());
|
||||
|
||||
VirtualColumnUtils::filterByPathOrFile(uris, paths, query, virtual_columns, context, filter_ast);
|
||||
}
|
||||
}
|
||||
|
||||
String next()
|
||||
@ -183,8 +204,8 @@ private:
|
||||
std::atomic_size_t index = 0;
|
||||
};
|
||||
|
||||
StorageURLSource::DisclosedGlobIterator::DisclosedGlobIterator(const String & uri, size_t max_addresses)
|
||||
: pimpl(std::make_shared<StorageURLSource::DisclosedGlobIterator::Impl>(uri, max_addresses)) {}
|
||||
StorageURLSource::DisclosedGlobIterator::DisclosedGlobIterator(const String & uri, size_t max_addresses, const ASTPtr & query, const NamesAndTypesList & virtual_columns, const ContextPtr & context)
|
||||
: pimpl(std::make_shared<StorageURLSource::DisclosedGlobIterator::Impl>(uri, max_addresses, query, virtual_columns, context)) {}
|
||||
|
||||
String StorageURLSource::DisclosedGlobIterator::next()
|
||||
{
|
||||
@ -313,6 +334,8 @@ StorageURLSource::StorageURLSource(
|
||||
|
||||
pipeline = std::make_unique<QueryPipeline>(QueryPipelineBuilder::getPipeline(std::move(builder)));
|
||||
reader = std::make_unique<PullingPipelineExecutor>(*pipeline);
|
||||
|
||||
ProfileEvents::increment(ProfileEvents::EngineFileLikeReadFiles);
|
||||
return true;
|
||||
};
|
||||
}
|
||||
@ -337,23 +360,7 @@ Chunk StorageURLSource::generate()
|
||||
UInt64 num_rows = chunk.getNumRows();
|
||||
size_t chunk_size = input_format->getApproxBytesReadForChunk();
|
||||
progress(num_rows, chunk_size ? chunk_size : chunk.bytes());
|
||||
|
||||
const String & path{curr_uri.getPath()};
|
||||
|
||||
for (const auto & virtual_column : requested_virtual_columns)
|
||||
{
|
||||
if (virtual_column.name == "_path")
|
||||
{
|
||||
chunk.addColumn(virtual_column.type->createColumnConst(num_rows, path));
|
||||
}
|
||||
else if (virtual_column.name == "_file")
|
||||
{
|
||||
size_t last_slash_pos = path.find_last_of('/');
|
||||
auto column = virtual_column.type->createColumnConst(num_rows, path.substr(last_slash_pos + 1));
|
||||
chunk.addColumn(column);
|
||||
}
|
||||
}
|
||||
|
||||
VirtualColumnUtils::addRequestedPathAndFileVirtualsToChunk(chunk, requested_virtual_columns, curr_uri.getPath());
|
||||
return chunk;
|
||||
}
|
||||
|
||||
@ -727,7 +734,7 @@ Pipe IStorageURLBase::read(
|
||||
else if (is_url_with_globs)
|
||||
{
|
||||
/// Iterate through disclosed globs and make a source for each file
|
||||
auto glob_iterator = std::make_shared<StorageURLSource::DisclosedGlobIterator>(uri, max_addresses);
|
||||
auto glob_iterator = std::make_shared<StorageURLSource::DisclosedGlobIterator>(uri, max_addresses, query_info.query, virtual_columns, local_context);
|
||||
iterator_wrapper = std::make_shared<StorageURLSource::IteratorWrapper>([glob_iterator, max_addresses]()
|
||||
{
|
||||
String next_uri = glob_iterator->next();
|
||||
@ -879,9 +886,7 @@ SinkToStoragePtr IStorageURLBase::write(const ASTPtr & query, const StorageMetad
|
||||
|
||||
NamesAndTypesList IStorageURLBase::getVirtuals() const
|
||||
{
|
||||
return NamesAndTypesList{
|
||||
{"_path", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
|
||||
{"_file", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())}};
|
||||
return virtual_columns;
|
||||
}
|
||||
|
||||
SchemaCache & IStorageURLBase::getSchemaCache(const ContextPtr & context)
|
||||
|
@ -88,6 +88,8 @@ protected:
|
||||
ASTPtr partition_by;
|
||||
bool distributed_processing;
|
||||
|
||||
NamesAndTypesList virtual_columns;
|
||||
|
||||
virtual std::string getReadMethod() const;
|
||||
|
||||
virtual std::vector<std::pair<std::string, std::string>> getReadURIParams(
|
||||
@ -146,7 +148,8 @@ public:
|
||||
class DisclosedGlobIterator
|
||||
{
|
||||
public:
|
||||
DisclosedGlobIterator(const String & uri_, size_t max_addresses);
|
||||
DisclosedGlobIterator(const String & uri_, size_t max_addresses, const ASTPtr & query, const NamesAndTypesList & virtual_columns, const ContextPtr & context);
|
||||
|
||||
String next();
|
||||
size_t size();
|
||||
private:
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include <Storages/SelectQueryInfo.h>
|
||||
#include <Storages/StorageURL.h>
|
||||
#include <Storages/extractTableFunctionArgumentsFromSelectQuery.h>
|
||||
#include <Storages/VirtualColumnUtils.h>
|
||||
|
||||
#include <TableFunctions/TableFunctionURLCluster.h>
|
||||
|
||||
@ -67,6 +68,8 @@ StorageURLCluster::StorageURLCluster(
|
||||
|
||||
storage_metadata.setConstraints(constraints_);
|
||||
setInMemoryMetadata(storage_metadata);
|
||||
|
||||
virtual_columns = VirtualColumnUtils::getPathAndFileVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList());
|
||||
}
|
||||
|
||||
void StorageURLCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context)
|
||||
@ -78,18 +81,11 @@ void StorageURLCluster::addColumnsStructureToQuery(ASTPtr & query, const String
|
||||
TableFunctionURLCluster::addColumnsStructureToArguments(expression_list->children, structure, context);
|
||||
}
|
||||
|
||||
RemoteQueryExecutor::Extension StorageURLCluster::getTaskIteratorExtension(ASTPtr, const ContextPtr & context) const
|
||||
RemoteQueryExecutor::Extension StorageURLCluster::getTaskIteratorExtension(ASTPtr query, const ContextPtr & context) const
|
||||
{
|
||||
auto iterator = std::make_shared<StorageURLSource::DisclosedGlobIterator>(uri, context->getSettingsRef().glob_expansion_max_elements);
|
||||
auto iterator = std::make_shared<StorageURLSource::DisclosedGlobIterator>(uri, context->getSettingsRef().glob_expansion_max_elements, query, virtual_columns, context);
|
||||
auto callback = std::make_shared<TaskIterator>([iter = std::move(iterator)]() mutable -> String { return iter->next(); });
|
||||
return RemoteQueryExecutor::Extension{.task_iterator = std::move(callback)};
|
||||
}
|
||||
|
||||
NamesAndTypesList StorageURLCluster::getVirtuals() const
|
||||
{
|
||||
return NamesAndTypesList{
|
||||
{"_path", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
|
||||
{"_file", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())}};
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -32,7 +32,7 @@ public:
|
||||
|
||||
std::string getName() const override { return "URLCluster"; }
|
||||
|
||||
NamesAndTypesList getVirtuals() const override;
|
||||
NamesAndTypesList getVirtuals() const override { return virtual_columns; }
|
||||
|
||||
RemoteQueryExecutor::Extension getTaskIteratorExtension(ASTPtr query, const ContextPtr & context) const override;
|
||||
|
||||
@ -46,6 +46,7 @@ private:
|
||||
String uri;
|
||||
String format_name;
|
||||
String compression_method;
|
||||
NamesAndTypesList virtual_columns;
|
||||
};
|
||||
|
||||
|
||||
|
@ -20,6 +20,10 @@
|
||||
#include <Columns/ColumnsCommon.h>
|
||||
#include <Columns/FilterDescription.h>
|
||||
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypeLowCardinality.h>
|
||||
|
||||
#include <Processors/QueryPlan/QueryPlan.h>
|
||||
#include <Processors/QueryPlan/BuildQueryPipelineSettings.h>
|
||||
#include <Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h>
|
||||
@ -266,6 +270,93 @@ void filterBlockWithQuery(const ASTPtr & query, Block & block, ContextPtr contex
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds the list of default virtual columns (`_path` and `_file`, both LowCardinality(String))
/// and returns those whose names do not collide with a column in `storage_columns`.
/// `storage_columns` is taken by value because it is sorted in place below.
NamesAndTypesList getPathAndFileVirtualsForStorage(NamesAndTypesList storage_columns)
|
||||
{
|
||||
auto default_virtuals = NamesAndTypesList{
|
||||
{"_path", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
|
||||
{"_file", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())}};
|
||||
|
||||
/// std::set_difference requires both input ranges to be sorted.
/// NOTE(review): sort() uses the list's own element ordering while the comparator below
/// compares names only - presumably that ordering is name-major; confirm.
default_virtuals.sort();
|
||||
storage_columns.sort();
|
||||
|
||||
NamesAndTypesList result_virtuals;
|
||||
/// Keep only the default virtuals that have no same-named storage column.
std::set_difference(
|
||||
default_virtuals.begin(), default_virtuals.end(), storage_columns.begin(), storage_columns.end(),
|
||||
std::back_inserter(result_virtuals),
|
||||
[](const NameAndTypePair & lhs, const NameAndTypePair & rhs){ return lhs.name < rhs.name; });
|
||||
|
||||
return result_virtuals;
|
||||
}
|
||||
|
||||
/// Appends one row describing `path` to `block`:
///  - `_path` (if the block has it) receives the full path;
///  - `_file` (if the block has it) receives the part after the last '/', or the whole path if there is no '/';
///  - `_idx` receives `idx`. It is accessed unconditionally; both callers in this file insert `_idx` beforehand.
static void addPathAndFileToVirtualColumns(Block & block, const String & path, size_t idx)
|
||||
{
|
||||
if (block.has("_path"))
|
||||
block.getByName("_path").column->assumeMutableRef().insert(path);
|
||||
|
||||
if (block.has("_file"))
|
||||
{
|
||||
auto pos = path.find_last_of('/');
|
||||
String file;
|
||||
if (pos != std::string::npos)
|
||||
file = path.substr(pos + 1);
|
||||
else
|
||||
file = path;
|
||||
|
||||
block.getByName("_file").column->assumeMutableRef().insert(file);
|
||||
}
|
||||
|
||||
block.getByName("_idx").column->assumeMutableRef().insert(idx);
|
||||
}
|
||||
|
||||
/// Prepares a filter AST for filtering by the path/file virtual columns.
/// Returns an empty ASTPtr when there is no query or no virtual columns; otherwise returns whatever
/// prepareFilterBlockWithQuery stores into `filter_ast` (callers in this commit check it for null).
/// `path_example` is only used to fill the single sample row of the temporary block.
ASTPtr createPathAndFileFilterAst(const ASTPtr & query, const NamesAndTypesList & virtual_columns, const String & path_example, const ContextPtr & context)
|
||||
{
|
||||
if (!query || virtual_columns.empty())
|
||||
return {};
|
||||
|
||||
Block block;
|
||||
for (const auto & column : virtual_columns)
|
||||
block.insert({column.type->createColumn(), column.type, column.name});
|
||||
/// Create a block with one row to construct filter
|
||||
/// Append "idx" column as the filter result
|
||||
block.insert({ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), "_idx"});
|
||||
addPathAndFileToVirtualColumns(block, path_example, 0);
|
||||
ASTPtr filter_ast;
|
||||
prepareFilterBlockWithQuery(query, context, block, filter_ast);
|
||||
return filter_ast;
|
||||
}
|
||||
|
||||
/// Builds a block with one row per entry of `paths` (virtual columns plus `_idx` = position in `paths`),
/// passes it through filterBlockWithQuery and returns the block's `_idx` column afterwards.
/// NOTE(review): presumably filterBlockWithQuery drops the rows that do not satisfy the query, so the
/// returned column holds the positions of the surviving paths - confirm against its implementation.
ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const ASTPtr & query, const NamesAndTypesList & virtual_columns, const ContextPtr & context, ASTPtr filter_ast)
|
||||
{
|
||||
Block block;
|
||||
for (const auto & column : virtual_columns)
|
||||
block.insert({column.type->createColumn(), column.type, column.name});
|
||||
block.insert({ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), "_idx"});
|
||||
|
||||
/// Row i describes paths[i] and carries i in `_idx`.
for (size_t i = 0; i != paths.size(); ++i)
|
||||
addPathAndFileToVirtualColumns(block, paths[i], i);
|
||||
|
||||
filterBlockWithQuery(query, block, context, filter_ast);
|
||||
|
||||
return block.getByName("_idx").column;
|
||||
}
|
||||
|
||||
/// Appends to `chunk` one constant column per requested virtual column named `_path` or `_file`,
/// in the order they appear in `requested_virtual_columns`. Columns with any other name are ignored.
/// `_path` gets the full `path`; `_file` gets the part of `path` after the last '/'.
void addRequestedPathAndFileVirtualsToChunk(Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, const String & path)
|
||||
{
|
||||
for (const auto & virtual_column : requested_virtual_columns)
|
||||
{
|
||||
if (virtual_column.name == "_path")
|
||||
{
|
||||
chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), path));
|
||||
}
|
||||
else if (virtual_column.name == "_file")
|
||||
{
|
||||
/// If there is no '/', find_last_of returns npos and npos + 1 wraps around to 0,
/// so substr yields the whole path as the file name.
size_t last_slash_pos = path.find_last_of('/');
|
||||
auto file_name = path.substr(last_slash_pos + 1);
|
||||
chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), file_name));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -46,6 +46,28 @@ auto extractSingleValueFromBlock(const Block & block, const String & name)
|
||||
return res;
|
||||
}
|
||||
|
||||
NamesAndTypesList getPathAndFileVirtualsForStorage(NamesAndTypesList storage_columns);
|
||||
|
||||
ASTPtr createPathAndFileFilterAst(const ASTPtr & query, const NamesAndTypesList & virtual_columns, const String & path_example, const ContextPtr & context);
|
||||
|
||||
ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const ASTPtr & query, const NamesAndTypesList & virtual_columns, const ContextPtr & context, ASTPtr filter_ast);
|
||||
|
||||
/// Filters `sources` in place using the indexes returned by getFilterByPathAndFileIndexes for `paths`.
/// The returned indexes are used to subscript `sources`, so sources[i] is expected to correspond to paths[i].
/// Surviving elements are moved (not copied) into a new vector in the order of the returned indexes.
template <typename T>
|
||||
void filterByPathOrFile(std::vector<T> & sources, const std::vector<String> & paths, const ASTPtr & query, const NamesAndTypesList & virtual_columns, const ContextPtr & context, ASTPtr filter_ast)
|
||||
{
|
||||
auto indexes_column = getFilterByPathAndFileIndexes(paths, query, virtual_columns, context, filter_ast);
|
||||
const auto & indexes = typeid_cast<const ColumnUInt64 &>(*indexes_column).getData();
|
||||
/// As many indexes as sources: treated as "nothing was filtered out", so `sources` is left untouched.
if (indexes.size() == sources.size())
|
||||
return;
|
||||
|
||||
std::vector<T> filtered_sources;
|
||||
filtered_sources.reserve(indexes.size());
|
||||
for (auto index : indexes)
|
||||
filtered_sources.emplace_back(std::move(sources[index]));
|
||||
sources = std::move(filtered_sources);
|
||||
}
|
||||
|
||||
void addRequestedPathAndFileVirtualsToChunk(Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, const String & path);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,22 +0,0 @@
|
||||
#include "getVirtualsForStorage.h"
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
NamesAndTypesList getVirtualsForStorage(const NamesAndTypesList & storage_columns_, const NamesAndTypesList & default_virtuals_)
|
||||
{
|
||||
auto default_virtuals = default_virtuals_;
|
||||
auto storage_columns = storage_columns_;
|
||||
default_virtuals.sort();
|
||||
storage_columns.sort();
|
||||
|
||||
NamesAndTypesList result_virtuals;
|
||||
std::set_difference(
|
||||
default_virtuals.begin(), default_virtuals.end(), storage_columns.begin(), storage_columns.end(),
|
||||
std::back_inserter(result_virtuals),
|
||||
[](const NameAndTypePair & lhs, const NameAndTypePair & rhs){ return lhs.name < rhs.name; });
|
||||
|
||||
return result_virtuals;
|
||||
}
|
||||
|
||||
}
|
@ -1,9 +0,0 @@
|
||||
#pragma once
|
||||
#include <Core/NamesAndTypes.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
NamesAndTypesList getVirtualsForStorage(const NamesAndTypesList & storage_columns_, const NamesAndTypesList & default_virtuals_);
|
||||
|
||||
}
|
@ -712,3 +712,33 @@ def test_function_signatures(cluster):
|
||||
# " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression, structure\n"
|
||||
query_10 = f"select * from azureBlobStorage('{storage_account_url}', 'cont', 'test_signature.csv', '{account_name}', '{account_key}', 'CSV', 'auto', 'column1 UInt32')"
|
||||
assert azure_query(node, query_10) == "1\n2\n3\n"
|
||||
|
||||
|
||||
# Writes three single-row files (test_filter1..3.tsv) through azureBlobStorage, runs a count() over the
# 'test_filter*.tsv' glob restricted to _file = 'test_filter1.tsv', and then asserts that the
# EngineFileLikeReadFiles profile event recorded in system.query_log for that query equals 1.
def test_filtering_by_file_or_path(cluster):
|
||||
node = cluster.instances["node"]
|
||||
azure_query(
|
||||
node,
|
||||
"INSERT INTO TABLE FUNCTION azureBlobStorage('http://azurite1:10000/devstoreaccount1', 'cont', 'test_filter1.tsv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto', 'x UInt64') select 1",
|
||||
)
|
||||
|
||||
azure_query(
|
||||
node,
|
||||
"INSERT INTO TABLE FUNCTION azureBlobStorage('http://azurite1:10000/devstoreaccount1', 'cont', 'test_filter2.tsv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto', 'x UInt64') select 2",
|
||||
)
|
||||
|
||||
azure_query(
|
||||
node,
|
||||
"INSERT INTO TABLE FUNCTION azureBlobStorage('http://azurite1:10000/devstoreaccount1', 'cont', 'test_filter3.tsv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto', 'x UInt64') select 3",
|
||||
)
|
||||
|
||||
# The query under test: only one of the three files matches the _file condition.
node.query(
|
||||
f"select count() from azureBlobStorage('http://azurite1:10000/devstoreaccount1', 'cont', 'test_filter*.tsv', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto', 'x UInt64') where _file = 'test_filter1.tsv'"
|
||||
)
|
||||
|
||||
node.query("SYSTEM FLUSH LOGS")
|
||||
|
||||
# NOTE(review): the LIKE pattern is not tied to a query id; int(result) below needs exactly one
# matching row, so this presumably relies on no earlier matching query in query_log - confirm for reruns.
result = node.query(
|
||||
f"SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query like '%select%azure%test_filter%' AND type='QueryFinish'"
|
||||
)
|
||||
|
||||
assert int(result) == 1
|
||||
|
@ -1890,3 +1890,32 @@ def test_read_subcolumns(started_cluster):
|
||||
assert (
|
||||
res == "42\t/root/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n"
|
||||
)
|
||||
|
||||
|
||||
# Writes three single-row files (test_filter1..3.tsv) into the minio bucket through the s3 table function,
# runs a count() over the 'test_filter*.tsv' glob restricted to _file = 'test_filter1.tsv', and then asserts
# that the EngineFileLikeReadFiles profile event recorded in system.query_log for that query equals 1.
def test_filtering_by_file_or_path(started_cluster):
|
||||
bucket = started_cluster.minio_bucket
|
||||
instance = started_cluster.instances["dummy"]
|
||||
|
||||
instance.query(
|
||||
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter1.tsv', auto, 'x UInt64') select 1"
|
||||
)
|
||||
|
||||
instance.query(
|
||||
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter2.tsv', auto, 'x UInt64') select 2"
|
||||
)
|
||||
|
||||
instance.query(
|
||||
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter3.tsv', auto, 'x UInt64') select 3"
|
||||
)
|
||||
|
||||
# The query under test: only one of the three files matches the _file condition.
instance.query(
|
||||
f"select count() from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter*.tsv') where _file = 'test_filter1.tsv'"
|
||||
)
|
||||
|
||||
instance.query("SYSTEM FLUSH LOGS")
|
||||
|
||||
# NOTE(review): the LIKE pattern is not tied to a query id; int(result) below needs exactly one
# matching row, so this presumably relies on no earlier matching query in query_log - confirm for reruns.
result = instance.query(
|
||||
f"SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query like '%select%s3%test_filter%' AND type='QueryFinish'"
|
||||
)
|
||||
|
||||
assert int(result) == 1
|
||||
|
@ -5,9 +5,9 @@
|
||||
{"data":{"k0":0,"k1":0,"k2":0,"k3":0,"k4":100,"k5":0}}
|
||||
{"data":{"k0":0,"k1":0,"k2":0,"k3":0,"k4":0,"k5":100}}
|
||||
Tuple(k0 Int8, k1 Int8, k2 Int8, k3 Int8, k4 Int8, k5 Int8)
|
||||
{"data":{"k0":100,"k1":0,"k2":0}}
|
||||
{"data":{"k0":0,"k1":100,"k2":0}}
|
||||
{"data":{"k0":0,"k1":0,"k2":100}}
|
||||
{"data":{"k0":0,"k1":100,"k2":0}}
|
||||
{"data":{"k0":100,"k1":0,"k2":0}}
|
||||
Tuple(k0 Int8, k1 Int8, k2 Int8)
|
||||
{"data":{"k1":100,"k3":0}}
|
||||
{"data":{"k1":0,"k3":100}}
|
||||
|
@ -28,7 +28,7 @@ ${CLICKHOUSE_CLIENT} -q "INSERT INTO t_json_files \
|
||||
SELECT _file, data FROM file('01825_file_*.json', 'JSONAsObject', 'data JSON') \
|
||||
ORDER BY _file LIMIT 3" --max_threads 1 --min_insert_block_size_rows 1 --max_insert_block_size 1 --max_block_size 1 --allow_experimental_object_type 1
|
||||
|
||||
${CLICKHOUSE_CLIENT} -q "SELECT data FROM t_json_files ORDER BY file FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1
|
||||
${CLICKHOUSE_CLIENT} -q "SELECT data FROM t_json_files ORDER BY file, data FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1
|
||||
${CLICKHOUSE_CLIENT} -q "SELECT toTypeName(data) FROM t_json_files LIMIT 1"
|
||||
|
||||
${CLICKHOUSE_CLIENT} -q "TRUNCATE TABLE IF EXISTS t_json_files"
|
||||
|
@ -142,3 +142,8 @@ Expression (Projection)
|
||||
Description: annoy GRANULARITY 4
|
||||
Parts: 1/1
|
||||
Granules: 4/4
|
||||
--- Test correctness of Annoy index with > 1 mark
|
||||
1 [1,0,0,0]
|
||||
9000 [9000,0,0,0]
|
||||
1 (1,0,0,0)
|
||||
9000 (9000,0,0,0)
|
||||
|
@ -1,4 +1,4 @@
|
||||
-- Tags: no-fasttest, no-ubsan, no-cpu-aarch64
|
||||
-- Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-ordinary-database, no-asan
|
||||
|
||||
SET allow_experimental_annoy_index = 1;
|
||||
SET allow_experimental_analyzer = 0;
|
||||
@ -249,3 +249,35 @@ DROP TABLE tab;
|
||||
|
||||
-- (*) Storage and search in Annoy indexes is inherently random. Tests which check for exact row matches would be unstable. Therefore,
|
||||
-- comment them out.
|
||||
|
||||
SELECT '--- Test correctness of Annoy index with > 1 mark';
|
||||
|
||||
CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes=0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0; -- disable adaptive granularity due to bug
|
||||
INSERT INTO tab SELECT number, [toFloat32(number), 0., 0., 0.] from numbers(10000);
|
||||
|
||||
SELECT *
|
||||
FROM tab
|
||||
ORDER BY L2Distance(vector, [1.0, 0.0, 0.0, 0.0])
|
||||
LIMIT 1;
|
||||
|
||||
SELECT *
|
||||
FROM tab
|
||||
ORDER BY L2Distance(vector, [9000.0, 0.0, 0.0, 0.0])
|
||||
LIMIT 1;
|
||||
|
||||
DROP TABLE tab;
|
||||
|
||||
CREATE TABLE tab(id Int32, vector Tuple(Float32, Float32, Float32, Float32), INDEX annoy_index vector TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes=0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0; -- disable adaptive granularity due to bug
|
||||
INSERT INTO tab SELECT number, (toFloat32(number), 0., 0., 0.) from numbers(10000);
|
||||
|
||||
SELECT *
|
||||
FROM tab
|
||||
ORDER BY L2Distance(vector, (1.0, 0.0, 0.0, 0.0))
|
||||
LIMIT 1;
|
||||
|
||||
SELECT *
|
||||
FROM tab
|
||||
ORDER BY L2Distance(vector, (9000.0, 0.0, 0.0, 0.0))
|
||||
LIMIT 1;
|
||||
|
||||
DROP TABLE tab;
|
||||
|
@ -0,0 +1,2 @@
|
||||
1
|
||||
1
|
@ -0,0 +1,17 @@
|
||||
#!/usr/bin/env bash
# Tags: no-fasttest

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Create three single-row TSV files. The variable is quoted so the file name is
# never subject to word splitting or pathname expansion.
echo "1" > "$CLICKHOUSE_TEST_UNIQUE_NAME.data1.tsv"
echo "2" > "$CLICKHOUSE_TEST_UNIQUE_NAME.data2.tsv"
echo "3" > "$CLICKHOUSE_TEST_UNIQUE_NAME.data3.tsv"

# Filter on _file: print how many output lines report "EngineFileLikeReadFiles: 1".
$CLICKHOUSE_LOCAL --print-profile-events -q "select * from file('$CLICKHOUSE_TEST_UNIQUE_NAME.data{1,2,3}.tsv', auto, 'x UInt64') where _file like '%data1%' format Null" 2>&1 | grep -F -c "EngineFileLikeReadFiles: 1"

# Same check with the filter applied to _path.
$CLICKHOUSE_LOCAL --print-profile-events -q "select * from file('$CLICKHOUSE_TEST_UNIQUE_NAME.data{1,2,3}.tsv', auto, 'x UInt64') where _path like '%data1%' format Null" 2>&1 | grep -F -c "EngineFileLikeReadFiles: 1"

# Remove the files created above; only the trailing '*' is left unquoted so it still globs.
rm "$CLICKHOUSE_TEST_UNIQUE_NAME".data*
|
||||
|
@ -0,0 +1,2 @@
|
||||
1
|
||||
1
|
@ -0,0 +1,11 @@
|
||||
#!/usr/bin/env bash
# Tags: no-fasttest

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Runs a url() query over {a,b,c}.tsv restricted by the WHERE condition given in $1
# and prints the number of output lines containing "EngineFileLikeReadFiles: 1".
count_single_file_reads()
{
    local condition=$1
    $CLICKHOUSE_LOCAL --print-profile-events -q "select * from url('http://localhost:11111/test/{a,b,c}.tsv', auto, 'x UInt64, y UInt64, z UInt64') where ${condition} format Null" 2>&1 | grep -F -c "EngineFileLikeReadFiles: 1"
}

# Filter by file name, then by path.
count_single_file_reads "_file = 'a.tsv'"
count_single_file_reads "_path = '/test/a.tsv'"
|
||||
|
@ -0,0 +1,2 @@
|
||||
1
|
||||
1
|
@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env bash
# Tags: no-fasttest, use-hdfs

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# Common prefix of every HDFS file this test touches.
HDFS_PREFIX="hdfs://localhost:12222/$CLICKHOUSE_TEST_UNIQUE_NAME"

# Write data1.tsv .. data3.tsv, one row each (file N holds the value N).
for i in 1 2 3
do
    $CLICKHOUSE_CLIENT -q "insert into table function hdfs('${HDFS_PREFIX}.data${i}.tsv') select ${i} settings hdfs_truncate_on_insert=1;"
done

# Runs a glob query over all three files restricted by the WHERE condition given in $1
# and prints the number of output lines containing "EngineFileLikeReadFiles: 1".
count_single_file_reads()
{
    local condition=$1
    $CLICKHOUSE_CLIENT --print-profile-events -q "select * from hdfs('${HDFS_PREFIX}.data*.tsv', auto, 'x UInt64') where ${condition} format Null" 2>&1 | grep -F -c "EngineFileLikeReadFiles: 1"
}

# Filter by file name, then by path.
count_single_file_reads "_file like '%data1%'"
count_single_file_reads "_path like '%data1%'"
|
@ -10,3 +10,4 @@ set max_threads=1;
|
||||
select trimLeft(explain) from (explain pipeline SELECT DISTINCT id, v FROM t_sparse_distinct) where explain ilike '%DistinctSortedChunkTransform%';
|
||||
DistinctSortedChunkTransform
|
||||
SELECT DISTINCT id, v FROM t_sparse_distinct format Null;
|
||||
DROP TABLE t_sparse_distinct;
|
||||
|
Loading…
Reference in New Issue
Block a user