Merge branch 'ClickHouse:master' into fix-named-collections-on-cluster-23.7

This commit is contained in:
Al Korgun 2023-07-27 10:48:25 +03:00 committed by GitHub
commit f026ccf11e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
76 changed files with 4140 additions and 401 deletions

View File

@ -67,6 +67,8 @@ public:
Message(
const std::string & source, const std::string & text, Priority prio, const char * file, int line, std::string_view fmt_str = {});
Message(
std::string && source, std::string && text, Priority prio, const char * file, int line, std::string_view fmt_str);
/// Creates a Message with the given source, text, priority,
/// source file path and line.
///

View File

@ -60,6 +60,19 @@ Message::Message(const std::string& source, const std::string& text, Priority pr
}
/// Creates a Message that takes over `source` and `text` by moving them into
/// the new object; priority, file, line and format string are stored as given.
/// `_tid` and `_pMap` start out as 0, then init() is called to finish
/// construction (init() is defined outside of this view).
Message::Message(std::string && source, std::string && text, Priority prio, const char * file, int line, std::string_view fmt_str):
_source(std::move(source)),
_text(std::move(text)),
_prio(prio),
_tid(0),
_file(file),
_line(line),
_pMap(0),
_fmt_str(fmt_str)
{
init();
}
Message::Message(const Message& msg):
_source(msg._source),
_text(msg._text),

View File

@ -502,9 +502,10 @@ target_include_directories(_parquet SYSTEM BEFORE
"${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src"
"${CMAKE_CURRENT_SOURCE_DIR}/cpp/src")
target_link_libraries(_parquet
PUBLIC _arrow
PRIVATE
PUBLIC
_arrow
ch_contrib::thrift
PRIVATE
boost::headers_only
boost::regex
OpenSSL::Crypto OpenSSL::SSL)

View File

@ -945,6 +945,44 @@ Result:
└────────────┴───────┘
```
## toDecimalString
Converts a numeric value to String with the number of fractional digits in the output specified by the user.
**Syntax**
``` sql
toDecimalString(number, scale)
```
**Parameters**
- `number` — Value to be represented as String: [Int, UInt](/docs/en/sql-reference/data-types/int-uint.md), [Float](/docs/en/sql-reference/data-types/float.md), [Decimal](/docs/en/sql-reference/data-types/decimal.md).
- `scale` — Number of fractional digits, [UInt8](/docs/en/sql-reference/data-types/int-uint.md).
* Maximum scale for [Decimal](/docs/en/sql-reference/data-types/decimal.md) and [Int, UInt](/docs/en/sql-reference/data-types/int-uint.md) types is 77 (it is the maximum possible number of significant digits for Decimal),
* Maximum scale for [Float](/docs/en/sql-reference/data-types/float.md) is 60.
**Returned value**
- Input value represented as [String](/docs/en/sql-reference/data-types/string.md) with given number of fractional digits (scale).
If the requested scale is smaller than the original number's scale, the number is rounded up or down according to common arithmetic rules.
**Example**
Query:
``` sql
SELECT toDecimalString(CAST('64.32', 'Float64'), 5);
```
Result:
```response
┌toDecimalString(CAST('64.32', 'Float64'), 5)─┐
│ 64.32000 │
└─────────────────────────────────────────────┘
```
## reinterpretAsUInt(8\|16\|32\|64)
## reinterpretAsInt(8\|16\|32\|64)

View File

@ -414,3 +414,29 @@ Will do sync syscall.
```sql
SYSTEM SYNC FILE CACHE [ON CLUSTER cluster_name]
```
### SYSTEM STOP LISTEN
Closes the socket and gracefully terminates the existing connections to the server on the specified port with the specified protocol.
However, if the corresponding protocol settings were not specified in the clickhouse-server configuration, this command will have no effect.
```sql
SYSTEM STOP LISTEN [ON CLUSTER cluster_name] [QUERIES ALL | QUERIES DEFAULT | QUERIES CUSTOM | TCP | TCP_WITH_PROXY | TCP_SECURE | HTTP | HTTPS | MYSQL | GRPC | POSTGRESQL | PROMETHEUS | CUSTOM 'protocol']
```
- If `CUSTOM 'protocol'` modifier is specified, the custom protocol with the specified name defined in the protocols section of the server configuration will be stopped.
- If `QUERIES ALL` modifier is specified, all protocols are stopped.
- If `QUERIES DEFAULT` modifier is specified, all default protocols are stopped.
- If `QUERIES CUSTOM` modifier is specified, all custom protocols are stopped.
### SYSTEM START LISTEN
Allows new connections to be established on the specified protocols.
However, if the server on the specified port and protocol was not stopped using the SYSTEM STOP LISTEN command, this command will have no effect.
```sql
SYSTEM START LISTEN [ON CLUSTER cluster_name] [QUERIES ALL | QUERIES DEFAULT | QUERIES CUSTOM | TCP | TCP_WITH_PROXY | TCP_SECURE | HTTP | HTTPS | MYSQL | GRPC | POSTGRESQL | PROMETHEUS | CUSTOM 'protocol']
```

View File

@ -762,6 +762,44 @@ SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut;
└────────────┴───────┘
```
## toDecimalString
Принимает любой численный тип первым аргументом, возвращает строковое десятичное представление числа с точностью, заданной вторым аргументом.
**Синтаксис**
``` sql
toDecimalString(number, scale)
```
**Параметры**
- `number` — Значение любого числового типа: [Int, UInt](/docs/ru/sql-reference/data-types/int-uint.md), [Float](/docs/ru/sql-reference/data-types/float.md), [Decimal](/docs/ru/sql-reference/data-types/decimal.md).
- `scale` — Требуемое количество десятичных знаков после запятой, [UInt8](/docs/ru/sql-reference/data-types/int-uint.md).
* Значение `scale` для типов [Decimal](/docs/ru/sql-reference/data-types/decimal.md) и [Int, UInt](/docs/ru/sql-reference/data-types/int-uint.md) не должно превышать 77 (так как это наибольшее количество значимых символов для этих типов),
* Значение `scale` для типа [Float](/docs/ru/sql-reference/data-types/float.md) не должно превышать 60.
**Возвращаемое значение**
- Строка ([String](/docs/ru/sql-reference/data-types/string.md)), представляющая собой десятичное представление входного числа с заданной длиной дробной части.
При необходимости число округляется по стандартным правилам арифметики.
**Пример использования**
Запрос:
``` sql
SELECT toDecimalString(CAST('64.32', 'Float64'), 5);
```
Результат:
```response
┌─toDecimalString(CAST('64.32', 'Float64'), 5)┐
│ 64.32000 │
└─────────────────────────────────────────────┘
```
## reinterpretAsUInt(8\|16\|32\|64) {#reinterpretasuint8163264}
## reinterpretAsInt(8\|16\|32\|64) {#reinterpretasint8163264}

View File

@ -812,6 +812,11 @@ bool Client::processWithFuzzing(const String & full_query)
}
catch (...)
{
if (!ast_to_process)
fmt::print(stderr,
"Error while forming new query: {}\n",
getCurrentExceptionMessage(true));
// Some functions (e.g. protocol parsers) don't throw, but
// set last_exception instead, so we'll also do it here for
// uniformity.

View File

@ -65,6 +65,7 @@ if (BUILD_STANDALONE_KEEPER)
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/PrometheusRequestHandler.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/PrometheusMetricsWriter.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/waitServersToFinish.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/ServerType.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTPRequestHandlerFactoryMain.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTP/HTTPServer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Server/HTTP/ReadHeaders.cpp

View File

@ -1457,6 +1457,24 @@ try
access_control.reload(AccessControl::ReloadMode::USERS_CONFIG_ONLY);
});
/// Register the handler that stops the protocol servers matching `server_type`
/// by delegating to stopServers() on the `servers` list.
/// NOTE(review): the lambda captures locals by reference ([&]) and mutates
/// `servers`; confirm that these locals outlive the callback and whether access
/// to `servers` has to be serialized with other code that touches it.
global_context->setStopServersCallback([&](const ServerType & server_type)
{
stopServers(servers, server_type);
});
/// Register the handler that creates the protocol servers matching `server_type`
/// from the current config(), passing start_servers = true to createServers().
/// NOTE(review): same by-reference capture of locals as above - confirm lifetime
/// and synchronization.
global_context->setStartServersCallback([&](const ServerType & server_type)
{
createServers(
config(),
listen_hosts,
listen_try,
server_pool,
async_metrics,
servers,
/* start_servers= */ true,
server_type);
});
/// Limit on total number of concurrently executed queries.
global_context->getProcessList().setMaxSize(server_settings.max_concurrent_queries);
@ -1998,7 +2016,8 @@ void Server::createServers(
Poco::ThreadPool & server_pool,
AsynchronousMetrics & async_metrics,
std::vector<ProtocolServerAdapter> & servers,
bool start_servers)
bool start_servers,
const ServerType & server_type)
{
const Settings & settings = global_context->getSettingsRef();
@ -2012,6 +2031,9 @@ void Server::createServers(
for (const auto & protocol : protocols)
{
if (!server_type.shouldStart(ServerType::Type::CUSTOM, protocol))
continue;
std::vector<std::string> hosts;
if (config.has("protocols." + protocol + ".host"))
hosts.push_back(config.getString("protocols." + protocol + ".host"));
@ -2057,9 +2079,13 @@ void Server::createServers(
}
for (const auto & listen_host : listen_hosts)
{
const char * port_name;
if (server_type.shouldStart(ServerType::Type::HTTP))
{
/// HTTP
const char * port_name = "http_port";
port_name = "http_port";
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
{
Poco::Net::ServerSocket socket;
@ -2074,7 +2100,10 @@ void Server::createServers(
std::make_unique<HTTPServer>(
httpContext(), createHandlerFactory(*this, config, async_metrics, "HTTPHandler-factory"), server_pool, socket, http_params));
});
}
if (server_type.shouldStart(ServerType::Type::HTTPS))
{
/// HTTPS
port_name = "https_port";
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
@ -2095,7 +2124,10 @@ void Server::createServers(
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "HTTPS protocol is disabled because Poco library was built without NetSSL support.");
#endif
});
}
if (server_type.shouldStart(ServerType::Type::TCP))
{
/// TCP
port_name = "tcp_port";
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
@ -2114,7 +2146,10 @@ void Server::createServers(
socket,
new Poco::Net::TCPServerParams));
});
}
if (server_type.shouldStart(ServerType::Type::TCP_WITH_PROXY))
{
/// TCP with PROXY protocol, see https://github.com/wolfeidau/proxyv2/blob/master/docs/proxy-protocol.txt
port_name = "tcp_with_proxy_port";
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
@ -2133,7 +2168,10 @@ void Server::createServers(
socket,
new Poco::Net::TCPServerParams));
});
}
if (server_type.shouldStart(ServerType::Type::TCP_SECURE))
{
/// TCP with SSL
port_name = "tcp_port_secure";
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
@ -2157,7 +2195,10 @@ void Server::createServers(
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "SSL support for TCP protocol is disabled because Poco library was built without NetSSL support.");
#endif
});
}
if (server_type.shouldStart(ServerType::Type::MYSQL))
{
port_name = "mysql_port";
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
{
@ -2171,7 +2212,10 @@ void Server::createServers(
"MySQL compatibility protocol: " + address.toString(),
std::make_unique<TCPServer>(new MySQLHandlerFactory(*this), server_pool, socket, new Poco::Net::TCPServerParams));
});
}
if (server_type.shouldStart(ServerType::Type::POSTGRESQL))
{
port_name = "postgresql_port";
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
{
@ -2185,8 +2229,11 @@ void Server::createServers(
"PostgreSQL compatibility protocol: " + address.toString(),
std::make_unique<TCPServer>(new PostgreSQLHandlerFactory(*this), server_pool, socket, new Poco::Net::TCPServerParams));
});
}
#if USE_GRPC
if (server_type.shouldStart(ServerType::Type::GRPC))
{
port_name = "grpc_port";
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
{
@ -2197,8 +2244,10 @@ void Server::createServers(
"gRPC protocol: " + server_address.toString(),
std::make_unique<GRPCServer>(*this, makeSocketAddress(listen_host, port, &logger())));
});
}
#endif
if (server_type.shouldStart(ServerType::Type::PROMETHEUS))
{
/// Prometheus (if defined and not setup yet with http_port)
port_name = "prometheus.port";
createServer(config, listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
@ -2216,6 +2265,7 @@ void Server::createServers(
});
}
}
}
void Server::createInterserverServers(
Poco::Util::AbstractConfiguration & config,
@ -2224,7 +2274,8 @@ void Server::createInterserverServers(
Poco::ThreadPool & server_pool,
AsynchronousMetrics & async_metrics,
std::vector<ProtocolServerAdapter> & servers,
bool start_servers)
bool start_servers,
const ServerType & server_type)
{
const Settings & settings = global_context->getSettingsRef();
@ -2235,9 +2286,13 @@ void Server::createInterserverServers(
/// Now iterate over interserver_listen_hosts
for (const auto & interserver_listen_host : interserver_listen_hosts)
{
const char * port_name;
if (server_type.shouldStart(ServerType::Type::INTERSERVER_HTTP))
{
/// Interserver IO HTTP
const char * port_name = "interserver_http_port";
port_name = "interserver_http_port";
createServer(config, interserver_listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
{
Poco::Net::ServerSocket socket;
@ -2255,7 +2310,10 @@ void Server::createInterserverServers(
socket,
http_params));
});
}
if (server_type.shouldStart(ServerType::Type::INTERSERVER_HTTPS))
{
port_name = "interserver_https_port";
createServer(config, interserver_listen_host, port_name, listen_try, start_servers, servers, [&](UInt16 port) -> ProtocolServerAdapter
{
@ -2281,6 +2339,44 @@ void Server::createInterserverServers(
});
}
}
}
/// Stops every server in `servers` whose port name is accepted by
/// `server_type.shouldStop()`, and drops from the list all stopping servers
/// that have no connections left.
///
/// The list is pruned twice: once before the stop requests (for servers that
/// were already stopping on entry) and once after them.
void Server::stopServers(
    std::vector<ProtocolServerAdapter> & servers,
    const ServerType & server_type
) const
{
    Poco::Logger * log = &logger();

    /// Erases the stopping servers that have finished (zero connections) and
    /// logs the state of every stopping server; `suffix` is appended to the
    /// server description in the log line.
    auto remove_finished = [&servers, log](const char * suffix)
    {
        std::erase_if(servers, [&](auto & candidate)
        {
            if (!candidate.isStopping())
                return false;

            size_t connections_left = candidate.currentConnections();
            LOG_DEBUG(log, "Server {}{}: {} ({} connections)",
                candidate.getDescription(),
                suffix,
                !connections_left ? "finished" : "waiting",
                connections_left);
            return !connections_left;
        });
    };

    remove_finished(" (from one of previous remove)");

    for (auto & candidate : servers)
    {
        /// A server that is already stopping is not asked to stop again.
        if (candidate.isStopping())
            continue;

        const std::string port_name = candidate.getPortName();
        if (server_type.shouldStop(port_name))
            candidate.stop();
    }

    remove_finished("");
}
void Server::updateServers(
Poco::Util::AbstractConfiguration & config,

View File

@ -3,8 +3,9 @@
#include <Server/IServer.h>
#include <Daemon/BaseDaemon.h>
#include "Server/HTTP/HTTPContext.h"
#include <Server/HTTP/HTTPContext.h>
#include <Server/TCPProtocolStackFactory.h>
#include <Server/ServerType.h>
#include <Poco/Net/HTTPServerParams.h>
/** Server provides three interfaces:
@ -106,7 +107,8 @@ private:
Poco::ThreadPool & server_pool,
AsynchronousMetrics & async_metrics,
std::vector<ProtocolServerAdapter> & servers,
bool start_servers = false);
bool start_servers = false,
const ServerType & server_type = ServerType(ServerType::Type::QUERIES_ALL));
void createInterserverServers(
Poco::Util::AbstractConfiguration & config,
@ -115,7 +117,8 @@ private:
Poco::ThreadPool & server_pool,
AsynchronousMetrics & async_metrics,
std::vector<ProtocolServerAdapter> & servers,
bool start_servers = false);
bool start_servers = false,
const ServerType & server_type = ServerType(ServerType::Type::QUERIES_ALL));
void updateServers(
Poco::Util::AbstractConfiguration & config,
@ -123,6 +126,11 @@ private:
AsynchronousMetrics & async_metrics,
std::vector<ProtocolServerAdapter> & servers,
std::vector<ProtocolServerAdapter> & servers_to_start_before_tables);
void stopServers(
std::vector<ProtocolServerAdapter> & servers,
const ServerType & server_type
) const;
};
}

View File

@ -187,6 +187,7 @@ enum class AccessType
M(SYSTEM_THREAD_FUZZER, "SYSTEM START THREAD FUZZER, SYSTEM STOP THREAD FUZZER, START THREAD FUZZER, STOP THREAD FUZZER", GLOBAL, SYSTEM) \
M(SYSTEM_UNFREEZE, "SYSTEM UNFREEZE", GLOBAL, SYSTEM) \
M(SYSTEM_FAILPOINT, "SYSTEM ENABLE FAILPOINT, SYSTEM DISABLE FAILPOINT", GLOBAL, SYSTEM) \
M(SYSTEM_LISTEN, "SYSTEM START LISTEN, SYSTEM STOP LISTEN", GLOBAL, SYSTEM) \
M(SYSTEM, "", GROUP, ALL) /* allows to execute SYSTEM {SHUTDOWN|RELOAD CONFIG|...} */ \
\
M(dictGet, "dictHas, dictGetHierarchy, dictIsIn", DICTIONARY, ALL) /* allows to execute functions dictGet(), dictHas(), dictGetHierarchy(), dictIsIn() */\

View File

@ -267,6 +267,10 @@ add_object_library(clickhouse_processors_queryplan Processors/QueryPlan)
add_object_library(clickhouse_processors_queryplan_optimizations Processors/QueryPlan/Optimizations)
add_object_library(clickhouse_user_defined_functions Functions/UserDefined)
if (USE_PARQUET)
add_object_library(clickhouse_processors_formats_impl_parquet Processors/Formats/Impl/Parquet)
endif()
if (TARGET ch_contrib::nuraft)
add_object_library(clickhouse_coordination Coordination)
endif()

View File

@ -1,4 +1,26 @@
#include "Allocator.h"
template class Allocator<false>;
template class Allocator<true>;
/** Keep definition of this constant in cpp file; otherwise its value
* is inlined into allocator code making it impossible to override it
* in third-party code.
*
* Note: extern may seem redundant, but is actually needed due to bug in GCC.
* See also: https://gcc.gnu.org/legacy-ml/gcc-help/2017-12/msg00021.html
*/
#ifdef NDEBUG
__attribute__((__weak__)) extern const size_t MMAP_THRESHOLD = 128 * (1ULL << 20);
#else
/**
* In debug build, use small mmap threshold to reproduce more memory
* stomping bugs. Along with ASLR it will hopefully detect more issues than
* ASan. The program may fail due to the limit on number of memory mappings.
*
* Not too small to avoid too quick exhaust of memory mappings.
*/
__attribute__((__weak__)) extern const size_t MMAP_THRESHOLD = 16384;
#endif
template class Allocator<false, false>;
template class Allocator<true, false>;
template class Allocator<false, true>;
template class Allocator<true, true>;

View File

@ -36,26 +36,51 @@
#include <Common/Allocator_fwd.h>
/// Required for older Darwin builds, that lack definition of MAP_ANONYMOUS
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif
/**
* Many modern allocators (for example, tcmalloc) do not do a mremap for
* realloc, even in case of large enough chunks of memory, although doing so
* would increase performance and reduce memory consumption during realloc.
* To fix this, we do mremap manually if the chunk of memory is large enough.
* The threshold (128 MiB in release builds, see Allocator.cpp) is chosen quite
* large, since changing the address space is very slow, especially in the case
* of a large number of threads. We expect that the set of operations
* mmap/something to do/mremap can only be performed about 1000 times per second.
*
* P.S. This is also required, because tcmalloc can not allocate a chunk of
* memory greater than 16 GB.
*
* P.P.S. Note that MMAP_THRESHOLD symbol is intentionally made weak. It allows
* to override it during linkage when using ClickHouse as a library in
* third-party applications which may already use own allocator doing mmaps
* in the implementation of alloc/realloc.
*/
extern const size_t MMAP_THRESHOLD;
static constexpr size_t MALLOC_MIN_ALIGNMENT = 8;
namespace CurrentMetrics
{
extern const Metric MMappedAllocs;
extern const Metric MMappedAllocBytes;
}
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int CANNOT_ALLOCATE_MEMORY;
extern const int CANNOT_MUNMAP;
extern const int CANNOT_MREMAP;
extern const int LOGICAL_ERROR;
}
}
/** Previously there was a code which tried to use manual mmap and mremap (clickhouse_mremap.h) for large allocations/reallocations (64MB+).
* Most modern allocators (including jemalloc) don't use mremap, so the idea was to take advantage from mremap system call for large reallocs.
* Actually jemalloc had support for mremap, but it was intentionally removed from codebase https://github.com/jemalloc/jemalloc/commit/e2deab7a751c8080c2b2cdcfd7b11887332be1bb.
* Our performance tests also show that without manual mmap/mremap/munmap ClickHouse is overall faster by about 1-2% and up to 5-7x for some types of queries.
* That is why we don't do manual mmap/mremap/munmap here and completely rely on jemalloc for allocations of any size.
*/
/** Responsible for allocating / freeing memory. Used, for example, in PODArray, Arena.
* Also used in hash tables.
* The interface is different from std::allocator
@ -63,8 +88,10 @@ namespace ErrorCodes
* - passing the size into the `free` method;
* - by the presence of the `alignment` argument;
* - the possibility of zeroing memory (used in hash tables);
* - random hint address for mmap
* - mmap_threshold for using mmap less or more
*/
template <bool clear_memory_>
template <bool clear_memory_, bool mmap_populate>
class Allocator
{
public:
@ -82,7 +109,7 @@ public:
try
{
checkSize(size);
freeNoTrack(buf);
freeNoTrack(buf, size);
CurrentMemoryTracker::free(size);
}
catch (...)
@ -105,26 +132,49 @@ public:
/// nothing to do.
/// BTW, it's not possible to change alignment while doing realloc.
}
else if (alignment <= MALLOC_MIN_ALIGNMENT)
else if (old_size < MMAP_THRESHOLD && new_size < MMAP_THRESHOLD
&& alignment <= MALLOC_MIN_ALIGNMENT)
{
/// Resize malloc'd memory region with no special alignment requirement.
CurrentMemoryTracker::realloc(old_size, new_size);
void * new_buf = ::realloc(buf, new_size);
if (nullptr == new_buf)
{
DB::throwFromErrno(
fmt::format("Allocator: Cannot realloc from {} to {}.", ReadableSize(old_size), ReadableSize(new_size)), DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
}
DB::throwFromErrno(fmt::format("Allocator: Cannot realloc from {} to {}.", ReadableSize(old_size), ReadableSize(new_size)), DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
buf = new_buf;
if constexpr (clear_memory)
if (new_size > old_size)
memset(reinterpret_cast<char *>(buf) + old_size, 0, new_size - old_size);
}
else if (old_size >= MMAP_THRESHOLD && new_size >= MMAP_THRESHOLD)
{
/// Resize mmap'd memory region.
CurrentMemoryTracker::realloc(old_size, new_size);
// On apple and freebsd self-implemented mremap used (common/mremap.h)
buf = clickhouse_mremap(buf, old_size, new_size, MREMAP_MAYMOVE,
PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
if (MAP_FAILED == buf)
DB::throwFromErrno(fmt::format("Allocator: Cannot mremap memory chunk from {} to {}.",
ReadableSize(old_size), ReadableSize(new_size)), DB::ErrorCodes::CANNOT_MREMAP);
/// No need for zero-fill, because mmap guarantees it.
}
else if (new_size < MMAP_THRESHOLD)
{
/// Small allocs that requires a copy. Assume there's enough memory in system. Call CurrentMemoryTracker once.
CurrentMemoryTracker::realloc(old_size, new_size);
void * new_buf = allocNoTrack(new_size, alignment);
memcpy(new_buf, buf, std::min(old_size, new_size));
freeNoTrack(buf, old_size);
buf = new_buf;
}
else
{
/// Big allocs that requires a copy. MemoryTracker is called inside 'alloc', 'free' methods.
void * new_buf = alloc(new_size, alignment);
memcpy(new_buf, buf, std::min(old_size, new_size));
free(buf, old_size);
@ -142,10 +192,43 @@ protected:
static constexpr bool clear_memory = clear_memory_;
// Freshly mmapped pages are copy-on-write references to a global zero page.
// On the first write, a page fault occurs, and an actual writable page is
// allocated. If we are going to use this memory soon, such as when resizing
// hash tables, it makes sense to pre-fault the pages by passing
// MAP_POPULATE to mmap(). This takes some time, but should be faster
// overall than having a hot loop interrupted by page faults.
// It is only supported on Linux.
static constexpr int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS
#if defined(OS_LINUX)
| (mmap_populate ? MAP_POPULATE : 0)
#endif
;
private:
void * allocNoTrack(size_t size, size_t alignment)
{
void * buf;
size_t mmap_min_alignment = ::getPageSize();
if (size >= MMAP_THRESHOLD)
{
if (alignment > mmap_min_alignment)
throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS,
"Too large alignment {}: more than page size when allocating {}.",
ReadableSize(alignment), ReadableSize(size));
buf = mmap(getMmapHint(), size, PROT_READ | PROT_WRITE,
mmap_flags, -1, 0);
if (MAP_FAILED == buf)
DB::throwFromErrno(fmt::format("Allocator: Cannot mmap {}.", ReadableSize(size)), DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
/// No need for zero-fill, because mmap guarantees it.
CurrentMetrics::add(CurrentMetrics::MMappedAllocs);
CurrentMetrics::add(CurrentMetrics::MMappedAllocBytes, size);
}
else
{
if (alignment <= MALLOC_MIN_ALIGNMENT)
{
if constexpr (clear_memory)
@ -168,13 +251,25 @@ private:
if constexpr (clear_memory)
memset(buf, 0, size);
}
}
return buf;
}
void freeNoTrack(void * buf)
void freeNoTrack(void * buf, size_t size)
{
if (size >= MMAP_THRESHOLD)
{
if (0 != munmap(buf, size))
DB::throwFromErrno(fmt::format("Allocator: Cannot munmap {}.", ReadableSize(size)), DB::ErrorCodes::CANNOT_MUNMAP);
CurrentMetrics::sub(CurrentMetrics::MMappedAllocs);
CurrentMetrics::sub(CurrentMetrics::MMappedAllocBytes, size);
}
else
{
::free(buf);
}
}
void checkSize(size_t size)
{
@ -182,6 +277,21 @@ private:
if (size >= 0x8000000000000000ULL)
throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Too large size ({}) passed to allocator. It indicates an error.", size);
}
/// Picks the address hint that is handed to mmap().
/// In debug builds (NDEBUG is not defined) the hint is a random address
/// (a kind of ASLR), to reproduce more memory stomping bugs. Note that Linux
/// doesn't do it by default. This may lead to worse TLB performance.
/// Otherwise no hint is given.
void * getMmapHint()
{
#ifndef NDEBUG
    std::uniform_int_distribution<intptr_t> hint_distribution(0x100000000000UL, 0x700000000000UL);
    return reinterpret_cast<void *>(hint_distribution(thread_local_rng));
#else
    return nullptr;
#endif
}
};
@ -257,5 +367,7 @@ constexpr size_t allocatorInitialBytes<AllocatorWithStackMemory<
/// Prevent implicit template instantiation of Allocator
extern template class Allocator<false>;
extern template class Allocator<true>;
extern template class Allocator<false, false>;
extern template class Allocator<true, false>;
extern template class Allocator<false, true>;
extern template class Allocator<true, true>;

View File

@ -3,7 +3,7 @@
* This file provides forward declarations for Allocator.
*/
template <bool clear_memory_>
template <bool clear_memory_, bool mmap_populate = false>
class Allocator;
template <typename Base, size_t N = 64, size_t Alignment = 1>

View File

@ -149,8 +149,10 @@
M(RestartReplicaThreadsActive, "Number of threads in the RESTART REPLICA thread pool running a task.") \
M(QueryPipelineExecutorThreads, "Number of threads in the PipelineExecutor thread pool.") \
M(QueryPipelineExecutorThreadsActive, "Number of threads in the PipelineExecutor thread pool running a task.") \
M(ParquetDecoderThreads, "Number of threads in the ParquetBlockInputFormat thread pool running a task.") \
M(ParquetDecoderThreadsActive, "Number of threads in the ParquetBlockInputFormat thread pool.") \
M(ParquetDecoderThreads, "Number of threads in the ParquetBlockInputFormat thread pool.") \
M(ParquetDecoderThreadsActive, "Number of threads in the ParquetBlockInputFormat thread pool running a task.") \
M(ParquetEncoderThreads, "Number of threads in ParquetBlockOutputFormat thread pool.") \
M(ParquetEncoderThreadsActive, "Number of threads in ParquetBlockOutputFormat thread pool running a task.") \
M(OutdatedPartsLoadingThreads, "Number of threads in the threadpool for loading Outdated data parts.") \
M(OutdatedPartsLoadingThreadsActive, "Number of active threads in the threadpool for loading Outdated data parts.") \
M(DistributedBytesToInsert, "Number of pending bytes to process for asynchronous insertion into Distributed tables. Number of bytes for every shard is summed.") \
@ -173,6 +175,8 @@
M(PartsInMemory, "In-memory parts.") \
M(MMappedFiles, "Total number of mmapped files.") \
M(MMappedFileBytes, "Sum size of mmapped file regions.") \
M(MMappedAllocs, "Total number of mmapped allocations") \
M(MMappedAllocBytes, "Sum bytes of mmapped allocations") \
M(AsynchronousReadWait, "Number of threads waiting for asynchronous read.") \
M(PendingAsyncInsert, "Number of asynchronous inserts that are waiting for flush.") \
M(KafkaConsumers, "Number of active Kafka consumers") \

View File

@ -8,7 +8,7 @@
* table, so it makes sense to pre-fault the pages so that page faults don't
* interrupt the resize loop. Set the allocator parameter accordingly.
*/
using HashTableAllocator = Allocator<true /* clear_memory */>;
using HashTableAllocator = Allocator<true /* clear_memory */, true /* mmap_populate */>;
template <size_t initial_bytes = 64>
using HashTableAllocatorWithStackMemory = AllocatorWithStackMemory<HashTableAllocator, initial_bytes>;

View File

@ -27,15 +27,9 @@ struct Interval
};
template <typename IntervalStorageType>
bool operator<(const Interval<IntervalStorageType> & lhs, const Interval<IntervalStorageType> & rhs)
auto operator<=>(const Interval<IntervalStorageType> & lhs, const Interval<IntervalStorageType> & rhs)
{
return std::tie(lhs.left, lhs.right) < std::tie(rhs.left, rhs.right);
}
template <typename IntervalStorageType>
bool operator<=(const Interval<IntervalStorageType> & lhs, const Interval<IntervalStorageType> & rhs)
{
return std::tie(lhs.left, lhs.right) <= std::tie(rhs.left, rhs.right);
return std::tie(lhs.left, lhs.right) <=> std::tie(rhs.left, rhs.right);
}
template <typename IntervalStorageType>
@ -44,24 +38,6 @@ bool operator==(const Interval<IntervalStorageType> & lhs, const Interval<Interv
return std::tie(lhs.left, lhs.right) == std::tie(rhs.left, rhs.right);
}
template <typename IntervalStorageType>
bool operator!=(const Interval<IntervalStorageType> & lhs, const Interval<IntervalStorageType> & rhs)
{
return std::tie(lhs.left, lhs.right) != std::tie(rhs.left, rhs.right);
}
template <typename IntervalStorageType>
bool operator>(const Interval<IntervalStorageType> & lhs, const Interval<IntervalStorageType> & rhs)
{
return std::tie(lhs.left, lhs.right) > std::tie(rhs.left, rhs.right);
}
template <typename IntervalStorageType>
bool operator>=(const Interval<IntervalStorageType> & lhs, const Interval<IntervalStorageType> & rhs)
{
return std::tie(lhs.left, lhs.right) >= std::tie(rhs.left, rhs.right);
}
struct IntervalTreeVoidValue
{
};

View File

@ -43,6 +43,17 @@ struct PreformattedMessage
operator const std::string & () const { return text; }
operator std::string () && { return std::move(text); }
operator fmt::format_string<> () const { UNREACHABLE(); }
/// Hands the message over to the caller's variables.
/// Lvalue flavour: the text is copied, this object is left untouched.
void apply(std::string & out_text, std::string_view & out_format_string) const &
{
    out_format_string = format_string;
    out_text = text;
}

/// Rvalue flavour: the text is moved out instead of being copied, so the
/// text of this object must not be relied upon afterwards.
void apply(std::string & out_text, std::string_view & out_format_string) &&
{
    out_format_string = format_string;
    out_text = std::move(text);
}
};
template <typename... Args>
@ -99,10 +110,33 @@ template <typename T> constexpr std::string_view tryGetStaticFormatString(T && x
}
}
/// Unlike code removed by an #ifdef, the unneeded branch of a "constexpr if" is
/// still checked by the compiler. This helper gives such a branch something that
/// always compiles when the condition is false.
template <bool enable>
struct ConstexprIfsAreNotIfdefs
{
    /// Disabled case: there is no static format string, an empty view is returned.
    template <typename T>
    constexpr static std::string_view getStaticFormatString(T &&)
    {
        return {};
    }

    /// Disabled case: a default-constructed PreformattedMessage is returned.
    template <typename T>
    static PreformattedMessage getPreformatted(T &&)
    {
        return {};
    }
};

/// Enabled case: the argument is really used.
template <>
struct ConstexprIfsAreNotIfdefs<true>
{
    /// Converts the argument to a string view at compile time.
    template <typename T>
    consteval static std::string_view getStaticFormatString(T && x)
    {
        /// Same requirements as in tryGetStaticFormatString(...)
        static_assert(!std::is_same_v<std::string, std::decay_t<T>>);
        static_assert(std::is_nothrow_convertible_v<T, const char * const>);
        static_assert(!std::is_pointer_v<T>);
        return std::string_view(x);
    }

    /// Passes the argument through unchanged.
    template <typename T>
    static T && getPreformatted(T && x)
    {
        return std::forward<T>(x);
    }
};
/// Returns the number of arguments that were passed.
template <typename... Ts> constexpr size_t numArgs(Ts &&...) { return sizeof...(Ts); }
/// Returns the first argument and ignores the rest. The return type is plain `auto`, so the result is returned by value.
template <typename T, typename... Ts> constexpr auto firstArg(T && x, Ts &&...) { return std::forward<T>(x); }
/// For implicit conversion of fmt::basic_runtime<> to char* for std::string ctor
template <typename T, typename... Ts> constexpr auto firstArg(fmt::basic_runtime<T> && data, Ts &&...) { return data.str.data(); }
template <typename T, typename... Ts> constexpr auto firstArg(const fmt::basic_runtime<T> & data, Ts &&...) { return data.str.data(); }
consteval ssize_t formatStringCountArgsNum(const char * const str, size_t len)
{
@ -142,26 +176,19 @@ consteval void formatStringCheckArgsNumImpl(std::string_view str, size_t nargs)
functionThatFailsCompilationOfConstevalFunctions("unexpected number of arguments in a format string");
}
template <typename... Args>
struct CheckArgsNumHelperImpl
{
template<typename T>
consteval CheckArgsNumHelperImpl(T && str)
consteval void formatStringCheckArgsNum(T && str, size_t nargs)
{
formatStringCheckArgsNumImpl(tryGetStaticFormatString(str), sizeof...(Args));
formatStringCheckArgsNumImpl(tryGetStaticFormatString(str), nargs);
}
template<typename T> inline void formatStringCheckArgsNum(fmt::basic_runtime<T> &&, size_t) {}
template<> inline void formatStringCheckArgsNum(PreformattedMessage &, size_t) {}
template<> inline void formatStringCheckArgsNum(const PreformattedMessage &, size_t) {}
template<> inline void formatStringCheckArgsNum(PreformattedMessage &&, size_t) {}
/// No checks for fmt::runtime and PreformattedMessage
template<typename T> CheckArgsNumHelperImpl(fmt::basic_runtime<T> &&) {}
template<> CheckArgsNumHelperImpl(PreformattedMessage &) {}
template<> CheckArgsNumHelperImpl(const PreformattedMessage &) {}
template<> CheckArgsNumHelperImpl(PreformattedMessage &&) {}
};
template <typename... Args> using CheckArgsNumHelper = CheckArgsNumHelperImpl<std::type_identity_t<Args>...>;
template <typename... Args> void formatStringCheckArgsNum(CheckArgsNumHelper<Args...>, Args &&...) {}
/// Compile-time classification of the first argument of a logging macro.
///  - is_static:  the argument is treated as a compile-time format string;
///  - has_format: a format string is available for the argument.
/// By default every type is considered a static format string.
template <typename T>
struct FormatStringTypeInfo
{
    static constexpr bool is_static = true;
    static constexpr bool has_format = true;
};

/// fmt::runtime(...): neither static nor carrying a format string.
template <typename T>
struct FormatStringTypeInfo<fmt::basic_runtime<T>>
{
    static constexpr bool is_static = false;
    static constexpr bool has_format = false;
};

/// PreformattedMessage: not static, but it does carry a format string.
template <>
struct FormatStringTypeInfo<PreformattedMessage>
{
    static constexpr bool is_static = false;
    static constexpr bool has_format = true;
};
/// This wrapper helps to avoid too frequent and noisy log messages.
/// For each pair (logger_name, format_string) it remembers when such a message was logged the last time.

View File

@ -15,4 +15,14 @@ template class PODArray<Int8, 4096, Allocator<false>, PADDING_FOR_SIMD - 1, PADD
template class PODArray<Int16, 4096, Allocator<false>, PADDING_FOR_SIMD - 1, PADDING_FOR_SIMD>;
template class PODArray<Int32, 4096, Allocator<false>, PADDING_FOR_SIMD - 1, PADDING_FOR_SIMD>;
template class PODArray<Int64, 4096, Allocator<false>, PADDING_FOR_SIMD - 1, PADDING_FOR_SIMD>;
template class PODArray<UInt8, 4096, Allocator<false>, 0, 0>;
template class PODArray<UInt16, 4096, Allocator<false>, 0, 0>;
template class PODArray<UInt32, 4096, Allocator<false>, 0, 0>;
template class PODArray<UInt64, 4096, Allocator<false>, 0, 0>;
template class PODArray<Int8, 4096, Allocator<false>, 0, 0>;
template class PODArray<Int16, 4096, Allocator<false>, 0, 0>;
template class PODArray<Int32, 4096, Allocator<false>, 0, 0>;
template class PODArray<Int64, 4096, Allocator<false>, 0, 0>;
}

View File

@ -783,4 +783,15 @@ extern template class PODArray<Int8, 4096, Allocator<false>, PADDING_FOR_SIMD -
extern template class PODArray<Int16, 4096, Allocator<false>, PADDING_FOR_SIMD - 1, PADDING_FOR_SIMD>;
extern template class PODArray<Int32, 4096, Allocator<false>, PADDING_FOR_SIMD - 1, PADDING_FOR_SIMD>;
extern template class PODArray<Int64, 4096, Allocator<false>, PADDING_FOR_SIMD - 1, PADDING_FOR_SIMD>;
extern template class PODArray<UInt8, 4096, Allocator<false>, 0, 0>;
extern template class PODArray<UInt16, 4096, Allocator<false>, 0, 0>;
extern template class PODArray<UInt32, 4096, Allocator<false>, 0, 0>;
extern template class PODArray<UInt64, 4096, Allocator<false>, 0, 0>;
extern template class PODArray<Int8, 4096, Allocator<false>, 0, 0>;
extern template class PODArray<Int16, 4096, Allocator<false>, 0, 0>;
extern template class PODArray<Int32, 4096, Allocator<false>, 0, 0>;
extern template class PODArray<Int64, 4096, Allocator<false>, 0, 0>;
}

View File

@ -101,9 +101,6 @@ void ProgressIndication::writeFinalProgress()
<< formatReadableSizeWithDecimalSuffix(progress.read_bytes * 1000000000.0 / elapsed_ns) << "/s.)";
else
std::cout << ". ";
auto peak_memory_usage = getMemoryUsage().peak;
if (peak_memory_usage >= 0)
std::cout << "\nPeak memory usage (for query) " << formatReadableSizeWithBinarySuffix(peak_memory_usage) << ".";
}
void ProgressIndication::writeProgress(WriteBufferFromFileDescriptor & message)

View File

@ -1,7 +1,7 @@
#pragma once
/// Macros for convenient usage of Poco logger.
#include <unistd.h>
#include <fmt/format.h>
#include <Poco/Logger.h>
#include <Poco/Message.h>
@ -28,6 +28,32 @@ namespace
#define LOG_IMPL_FIRST_ARG(X, ...) X
/// Copy-paste from contrib/libpq/include/c.h
/// There's no easy way to count the number of arguments without evaluating these arguments...
/// CH_VA_ARGS_NARGS(a, b, c) expands to (3): the arguments are followed by the descending list 63..0,
/// so the more arguments are passed, the further the list is shifted and the larger the number
/// that lands in the `N` slot of CH_VA_ARGS_NARGS_. The arguments are only counted, never evaluated.
/// Works for 1 to 63 arguments.
/// NOTE(review): an empty argument list still occupies the first slot, so it is presumably counted as 1, not 0.
#define CH_VA_ARGS_NARGS(...) \
CH_VA_ARGS_NARGS_(__VA_ARGS__, \
63,62,61,60, \
59,58,57,56,55,54,53,52,51,50, \
49,48,47,46,45,44,43,42,41,40, \
39,38,37,36,35,34,33,32,31,30, \
29,28,27,26,25,24,23,22,21,20, \
19,18,17,16,15,14,13,12,11,10, \
9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
/// Helper for CH_VA_ARGS_NARGS: skips the first 63 parameters and expands to the 64th one (`N`).
#define CH_VA_ARGS_NARGS_( \
_01,_02,_03,_04,_05,_06,_07,_08,_09,_10, \
_11,_12,_13,_14,_15,_16,_17,_18,_19,_20, \
_21,_22,_23,_24,_25,_26,_27,_28,_29,_30, \
_31,_32,_33,_34,_35,_36,_37,_38,_39,_40, \
_41,_42,_43,_44,_45,_46,_47,_48,_49,_50, \
_51,_52,_53,_54,_55,_56,_57,_58,_59,_60, \
_61,_62,_63, N, ...) \
(N)
/// Two-level stringification: the extra level makes the preprocessor expand __LINE__ to its number
/// before `#` turns it into a string literal (a direct `#__LINE__` would produce "__LINE__").
#define LINE_NUM_AS_STRING_IMPL2(x) #x
#define LINE_NUM_AS_STRING_IMPL(x) LINE_NUM_AS_STRING_IMPL2(x)
#define LINE_NUM_AS_STRING LINE_NUM_AS_STRING_IMPL(__LINE__)
/// A single string literal (built by adjacent-literal concatenation) with the file name and line number
/// of the place where the macro is expanded. Being a literal, it needs no formatting or allocation at run time.
#define MESSAGE_FOR_EXCEPTION_ON_LOGGING "Failed to write a log message: " __FILE__ ":" LINE_NUM_AS_STRING "\n"
/// Logs a message to a specified logger with that level.
/// If more than one argument is provided,
/// the first argument is interpreted as a template with {}-substitutions
@ -39,21 +65,48 @@ namespace
auto _logger = ::getLogger(logger); \
const bool _is_clients_log = (DB::CurrentThread::getGroup() != nullptr) && \
(DB::CurrentThread::get().getClientLogsLevel() >= (priority)); \
if (_is_clients_log || _logger->is((PRIORITY))) \
if (!_is_clients_log && !_logger->is((PRIORITY))) \
break; \
\
try \
{ \
std::string formatted_message = numArgs(__VA_ARGS__) > 1 ? fmt::format(__VA_ARGS__) : firstArg(__VA_ARGS__); \
formatStringCheckArgsNum(__VA_ARGS__); \
if (auto _channel = _logger->getChannel()) \
{ \
std::string file_function; \
file_function += __FILE__; \
file_function += "; "; \
file_function += __PRETTY_FUNCTION__; \
Poco::Message poco_message(_logger->name(), formatted_message, \
(PRIORITY), file_function.c_str(), __LINE__, tryGetStaticFormatString(LOG_IMPL_FIRST_ARG(__VA_ARGS__))); \
_channel->log(poco_message); \
} \
ProfileEvents::incrementForLogMessage(PRIORITY); \
auto _channel = _logger->getChannel(); \
if (!_channel) \
break; \
\
constexpr size_t _nargs = CH_VA_ARGS_NARGS(__VA_ARGS__); \
using LogTypeInfo = FormatStringTypeInfo<std::decay_t<decltype(LOG_IMPL_FIRST_ARG(__VA_ARGS__))>>; \
\
std::string_view _format_string; \
std::string _formatted_message; \
\
if constexpr (LogTypeInfo::is_static) \
{ \
formatStringCheckArgsNum(LOG_IMPL_FIRST_ARG(__VA_ARGS__), _nargs - 1); \
_format_string = ConstexprIfsAreNotIfdefs<LogTypeInfo::is_static>::getStaticFormatString(LOG_IMPL_FIRST_ARG(__VA_ARGS__)); \
} \
\
constexpr bool is_preformatted_message = !LogTypeInfo::is_static && LogTypeInfo::has_format; \
if constexpr (is_preformatted_message) \
{ \
static_assert(_nargs == 1 || !is_preformatted_message); \
ConstexprIfsAreNotIfdefs<is_preformatted_message>::getPreformatted(LOG_IMPL_FIRST_ARG(__VA_ARGS__)).apply(_formatted_message, _format_string); \
} \
else \
{ \
_formatted_message = _nargs == 1 ? firstArg(__VA_ARGS__) : fmt::format(__VA_ARGS__); \
} \
\
std::string _file_function = __FILE__ "; "; \
_file_function += __PRETTY_FUNCTION__; \
Poco::Message _poco_message(_logger->name(), std::move(_formatted_message), \
(PRIORITY), _file_function.c_str(), __LINE__, _format_string); \
_channel->log(_poco_message); \
} \
catch (...) \
{ \
::write(STDERR_FILENO, static_cast<const void *>(MESSAGE_FOR_EXCEPTION_ON_LOGGING), sizeof(MESSAGE_FOR_EXCEPTION_ON_LOGGING)); \
} \
} while (false)

View File

@ -1,6 +1,7 @@
#include <string>
#include <vector>
#include <Common/logger_useful.h>
#include <Common/thread_local_rng.h>
#include <gtest/gtest.h>
#include <Poco/Logger.h>
@ -50,3 +51,55 @@ TEST(Logger, TestLog)
}
}
/// Counts how many times the helper functions below were invoked.
static size_t global_counter = 0;

/// Produces a random log message and records the call in global_counter.
static std::string getLogMessage()
{
    global_counter += 1;
    std::string message = "test1 ";
    message += std::to_string(thread_local_rng());
    return message;
}
/// Produces a random value to be used as a format argument and records the call in global_counter.
static size_t getLogMessageParam()
{
    global_counter += 1;
    const size_t random_value = thread_local_rng();
    return random_value;
}
/// Produces an already formatted message and records the call in global_counter.
static PreformattedMessage getPreformatted()
{
    global_counter += 1;
    auto message = PreformattedMessage::create("test3 {}", thread_local_rng());
    return message;
}
/// Returns a random value only when it happens to be a multiple of 1000;
/// in every other case throws Poco::Exception.
static size_t getLogMessageParamOrThrow()
{
    const size_t random_value = thread_local_rng();
    if (random_value % 1000 != 0)
        throw Poco::Exception("error", 42);
    return random_value;
}
/// Checks that the logging macros evaluate their arguments exactly once, do not consume
/// a PreformattedMessage passed as an lvalue, and do not let an exception thrown while
/// evaluating an argument escape from the macro.
TEST(Logger, SideEffects)
{
std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
auto my_channel = Poco::AutoPtr<Poco::StreamChannel>(new Poco::StreamChannel(oss));
auto * log = &Poco::Logger::create("Logger", my_channel.get());
/// Enable the lowest level used below so that LOG_TRACE actually evaluates its arguments.
log->setLevel("trace");
/// Ensure that parameters are evaluated only once
global_counter = 0;
/// Each helper increments global_counter once per call, so the counter must grow by exactly one per log line.
LOG_TRACE(log, fmt::runtime(getLogMessage()));
EXPECT_EQ(global_counter, 1);
LOG_TRACE(log, "test2 {}", getLogMessageParam());
EXPECT_EQ(global_counter, 2);
LOG_TRACE(log, getPreformatted());
EXPECT_EQ(global_counter, 3);
/// A PreformattedMessage passed as an lvalue must stay intact after logging.
auto var = PreformattedMessage::create("test4 {}", thread_local_rng());
LOG_TRACE(log, var);
EXPECT_EQ(var.text.starts_with("test4 "), true);
EXPECT_EQ(var.format_string, "test4 {}");
/// getLogMessageParamOrThrow() throws for most random values; the test passes only if
/// the exception does not propagate out of the logging macro.
LOG_TRACE(log, "test no throw {}", getLogMessageParamOrThrow());
}

View File

@ -674,6 +674,7 @@ class IColumn;
M(UInt64, remote_read_min_bytes_for_seek, 4 * DBMS_DEFAULT_BUFFER_SIZE, "Min bytes required for remote read (url, s3) to do seek, instead of read with ignore.", 0) \
M(UInt64, merge_tree_min_bytes_per_task_for_remote_reading, 4 * DBMS_DEFAULT_BUFFER_SIZE, "Min bytes to read per task.", 0) \
M(Bool, merge_tree_use_const_size_tasks_for_remote_reading, true, "Whether to use constant size tasks for reading from a remote table.", 0) \
M(Bool, merge_tree_determine_task_size_by_prewhere_columns, true, "Whether to use only prewhere columns size to determine reading task size.", 0) \
\
M(Bool, async_insert, false, "If true, data from INSERT query is stored in queue and later flushed to table in background. If wait_for_async_insert is false, INSERT query is processed almost instantly, otherwise client will wait until data will be flushed to table", 0) \
M(Bool, wait_for_async_insert, true, "If true wait for processing of asynchronous insertion", 0) \
@ -953,6 +954,10 @@ class IColumn;
M(ParquetVersion, output_format_parquet_version, "2.latest", "Parquet format version for output format. Supported versions: 1.0, 2.4, 2.6 and 2.latest (default)", 0) \
M(ParquetCompression, output_format_parquet_compression_method, "lz4", "Compression method for Parquet output format. Supported codecs: snappy, lz4, brotli, zstd, gzip, none (uncompressed)", 0) \
M(Bool, output_format_parquet_compliant_nested_types, true, "In parquet file schema, use name 'element' instead of 'item' for list elements. This is a historical artifact of Arrow library implementation. Generally increases compatibility, except perhaps with some old versions of Arrow.", 0) \
M(Bool, output_format_parquet_use_custom_encoder, true, "Use experimental faster Parquet encoder implementation.", 0) \
M(Bool, output_format_parquet_parallel_encoding, true, "Do Parquet encoding in multiple threads. Requires output_format_parquet_use_custom_encoder.", 0) \
M(UInt64, output_format_parquet_data_page_size, 1024 * 1024, "Target page size in bytes, before compression.", 0) \
M(UInt64, output_format_parquet_batch_size, 1024, "Check page size every this many rows. Consider decreasing if you have columns with average values size above a few KBs.", 0) \
M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy'.", 0) \
M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \
M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \

View File

@ -130,6 +130,10 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.parquet.max_block_size = settings.input_format_parquet_max_block_size;
format_settings.parquet.output_compression_method = settings.output_format_parquet_compression_method;
format_settings.parquet.output_compliant_nested_types = settings.output_format_parquet_compliant_nested_types;
format_settings.parquet.use_custom_encoder = settings.output_format_parquet_use_custom_encoder;
format_settings.parquet.parallel_encoding = settings.output_format_parquet_parallel_encoding;
format_settings.parquet.data_page_size = settings.output_format_parquet_data_page_size;
format_settings.parquet.write_batch_size = settings.output_format_parquet_batch_size;
format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
format_settings.pretty.color = settings.output_format_pretty_color;
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
@ -434,7 +438,7 @@ OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible(
return format;
}
return getOutputFormat(name, buf, sample, context, _format_settings);
return getOutputFormat(name, buf, sample, context, format_settings);
}
@ -453,6 +457,7 @@ OutputFormatPtr FormatFactory::getOutputFormat(
context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name);
auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);
format_settings.max_threads = context->getSettingsRef().max_threads;
/** TODO: Materialization is needed, because formats can use the functions `IDataType`,
* which only work with full columns.

View File

@ -100,6 +100,8 @@ struct FormatSettings
UInt64 max_parser_depth = DBMS_DEFAULT_MAX_PARSER_DEPTH;
size_t max_threads = 1;
enum class ArrowCompression
{
NONE,
@ -233,10 +235,14 @@ struct FormatSettings
bool output_string_as_string = false;
bool output_fixed_string_as_fixed_byte_array = true;
bool preserve_order = false;
bool use_custom_encoder = true;
bool parallel_encoding = true;
UInt64 max_block_size = 8192;
ParquetVersion output_version;
ParquetCompression output_compression_method = ParquetCompression::SNAPPY;
bool output_compliant_nested_types = true;
size_t data_page_size = 1024 * 1024;
size_t write_batch_size = 1024;
} parquet;
struct Pretty

View File

@ -0,0 +1,22 @@
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionToDecimalString.h>
#include <Functions/IFunction.h>
namespace DB
{
/// Registers toDecimalString in the function factory under a case-insensitive name,
/// together with its embedded documentation.
REGISTER_FUNCTION(ToDecimalString)
{
    FunctionDocumentation documentation{
        .description=R"(
Returns string representation of a number. First argument is the number of any numeric type,
second argument is the desired number of digits in fractional part. Returns String.
)",
        .examples{{"toDecimalString", "SELECT toDecimalString(2.1456,2)", ""}},
        .categories{"String"}
    };

    factory.registerFunction<FunctionToDecimalString>(std::move(documentation), FunctionFactory::CaseInsensitive);
}
}

View File

@ -0,0 +1,262 @@
#pragma once
#include <Core/Types.h>
#include <Core/DecimalFunctions.h>
#include <Functions/IFunction.h>
#include <Functions/FunctionHelpers.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnDecimal.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <IO/WriteBufferFromVector.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/Context_fwd.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER;
}
/// Implementation of toDecimalString(number, precision): converts a number (Int*, UInt*, Float*, Decimal*)
/// to a String with exactly `precision` digits after the decimal point.
///
/// Every result row is written into the ColumnString character buffer followed by a terminating
/// zero byte, and the corresponding offset is set to the buffer position right after that byte.
class FunctionToDecimalString : public IFunction
{
public:
    static constexpr auto name = "toDecimalString";
    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionToDecimalString>(); }

    String getName() const override { return name; }

    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }

    size_t getNumberOfArguments() const override { return 2; }

    /// Validates the arguments (a number and an integer precision checked with isColumnConst)
    /// and reports String as the result type.
    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
    {
        FunctionArgumentDescriptors mandatory_args = {
            {"Value", &isNumber<IDataType>, nullptr, "Number"},
            {"precision", &isNativeInteger<IDataType>, &isColumnConst, "const Integer"}
        };

        validateFunctionArgumentTypes(*this, arguments, mandatory_args, {});

        return std::make_shared<DataTypeString>();
    }

    bool useDefaultImplementationForConstants() const override { return true; }

private:
    /// For operations with Integer/Float
    /// Formats every value of `vec_from` with the same `precision`.
    template <typename FromVectorType>
    void vectorConstant(const FromVectorType & vec_from, UInt8 precision,
                        ColumnString::Chars & vec_to, ColumnString::Offsets & result_offsets) const
    {
        size_t input_rows_count = vec_from.size();
        result_offsets.resize(input_rows_count);

        /// Buffer is used here and in functions below because resulting size cannot be precisely anticipated,
        /// and buffer resizes on-the-go. Also, .count() provided by buffer is convenient in this case.
        WriteBufferFromVector<ColumnString::Chars> buf_to(vec_to);

        for (size_t i = 0; i < input_rows_count; ++i)
        {
            format(vec_from[i], buf_to, precision);
            result_offsets[i] = buf_to.count();
        }

        buf_to.finalize();
    }

    /// Same as above, but the precision is taken per row from `vec_precision`.
    template <typename FirstArgVectorType>
    void vectorVector(const FirstArgVectorType & vec_from, const ColumnVector<UInt8>::Container & vec_precision,
                      ColumnString::Chars & vec_to, ColumnString::Offsets & result_offsets) const
    {
        size_t input_rows_count = vec_from.size();
        result_offsets.resize(input_rows_count);

        WriteBufferFromVector<ColumnString::Chars> buf_to(vec_to);

        constexpr size_t max_digits = std::numeric_limits<UInt256>::digits10;

        for (size_t i = 0; i < input_rows_count; ++i)
        {
            if (vec_precision[i] > max_digits)
                throw DB::Exception(DB::ErrorCodes::CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER,
                                    "Too many fractional digits requested, shall not be more than {}", max_digits);
            format(vec_from[i], buf_to, vec_precision[i]);
            result_offsets[i] = buf_to.count();
        }

        buf_to.finalize();
    }

    /// For operations with Decimal
    /// Formats every Decimal value (stored with scale `from_scale`) with the same `precision`.
    template <typename FirstArgVectorType>
    void vectorConstant(const FirstArgVectorType & vec_from, UInt8 precision,
                        ColumnString::Chars & vec_to, ColumnString::Offsets & result_offsets, UInt8 from_scale) const
    {
        /// There are no more than 77 meaning digits (as it is the max length of UInt256). So we can limit it with 77.
        constexpr size_t max_digits = std::numeric_limits<UInt256>::digits10;
        if (precision > max_digits)
            throw DB::Exception(DB::ErrorCodes::CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER,
                                "Too many fractional digits requested for Decimal, must not be more than {}", max_digits);

        WriteBufferFromVector<ColumnString::Chars> buf_to(vec_to);
        size_t input_rows_count = vec_from.size();
        result_offsets.resize(input_rows_count);

        for (size_t i = 0; i < input_rows_count; ++i)
        {
            writeText(vec_from[i], from_scale, buf_to, true, true, precision);
            writeChar(0, buf_to);
            result_offsets[i] = buf_to.count();
        }

        buf_to.finalize();
    }

    /// Same as above, but the precision is taken per row from `vec_precision`.
    template <typename FirstArgVectorType>
    void vectorVector(const FirstArgVectorType & vec_from, const ColumnVector<UInt8>::Container & vec_precision,
                      ColumnString::Chars & vec_to, ColumnString::Offsets & result_offsets, UInt8 from_scale) const
    {
        size_t input_rows_count = vec_from.size();
        result_offsets.resize(input_rows_count);

        WriteBufferFromVector<ColumnString::Chars> buf_to(vec_to);

        constexpr size_t max_digits = std::numeric_limits<UInt256>::digits10;

        for (size_t i = 0; i < input_rows_count; ++i)
        {
            if (vec_precision[i] > max_digits)
                throw DB::Exception(DB::ErrorCodes::CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER,
                                    "Too many fractional digits requested for Decimal, must not be more than {}", max_digits);
            writeText(vec_from[i], from_scale, buf_to, true, true, vec_precision[i]);
            writeChar(0, buf_to);
            result_offsets[i] = buf_to.count();
        }

        buf_to.finalize();
    }

    /// Writes a floating point value in fixed notation with `precision` fractional digits,
    /// followed by the terminating zero byte.
    template <is_floating_point T>
    static void format(T value, DB::WriteBuffer & out, UInt8 precision)
    {
        /// Maximum of 60 is hard-coded in 'double-conversion/double-conversion.h' for floating point values,
        /// Catch this here to give user a more reasonable error.
        /// The value is widened to an unsigned type for the message: a signed 8-bit cast would print
        /// a negative number for any precision above 127.
        if (precision > 60)
            throw DB::Exception(DB::ErrorCodes::CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER,
                                "Too high precision requested for Float, must not be more than 60, got {}", static_cast<UInt32>(precision));

        DB::DoubleConverter<false>::BufferType buffer;
        double_conversion::StringBuilder builder{buffer, sizeof(buffer)};

        const auto result = DB::DoubleConverter<false>::instance().ToFixed(value, precision, &builder);

        if (!result)
            throw DB::Exception(DB::ErrorCodes::CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER, "Error processing number: {}", value);

        out.write(buffer, builder.position());
        writeChar(0, out);
    }

    /// Writes an integer value followed by '.' and `precision` zeros (nothing is appended for precision 0),
    /// followed by the terminating zero byte.
    template <is_integer T>
    static void format(T value, DB::WriteBuffer & out, UInt8 precision)
    {
        /// Fractional part for Integer is just trailing zeros. Let's limit it with 77 (like with Decimals).
        constexpr size_t max_digits = std::numeric_limits<UInt256>::digits10;
        if (precision > max_digits)
            throw DB::Exception(DB::ErrorCodes::CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER,
                                "Too many fractional digits requested, shall not be more than {}", max_digits);

        writeText(value, out);

        if (precision > 0) [[likely]]
        {
            writeChar('.', out);
            for (int i = 0; i < precision; ++i)
                writeChar('0', out);
        }

        /// The terminating zero must be written for every row, including precision == 0:
        /// the callers store the buffer position right after this call as the row offset,
        /// and all other formatting paths terminate the row the same way.
        writeChar(0, out);
    }

public:
    /// Dispatches on the type of the first argument to the typed implementation.
    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override
    {
        switch (arguments[0].type->getTypeId())
        {
            case TypeIndex::UInt8: return executeType<UInt8>(arguments);
            case TypeIndex::UInt16: return executeType<UInt16>(arguments);
            case TypeIndex::UInt32: return executeType<UInt32>(arguments);
            case TypeIndex::UInt64: return executeType<UInt64>(arguments);
            case TypeIndex::UInt128: return executeType<UInt128>(arguments);
            case TypeIndex::UInt256: return executeType<UInt256>(arguments);
            case TypeIndex::Int8: return executeType<Int8>(arguments);
            case TypeIndex::Int16: return executeType<Int16>(arguments);
            case TypeIndex::Int32: return executeType<Int32>(arguments);
            case TypeIndex::Int64: return executeType<Int64>(arguments);
            case TypeIndex::Int128: return executeType<Int128>(arguments);
            case TypeIndex::Int256: return executeType<Int256>(arguments);
            case TypeIndex::Float32: return executeType<Float32>(arguments);
            case TypeIndex::Float64: return executeType<Float64>(arguments);
            case TypeIndex::Decimal32: return executeType<Decimal32>(arguments);
            case TypeIndex::Decimal64: return executeType<Decimal64>(arguments);
            case TypeIndex::Decimal128: return executeType<Decimal128>(arguments);
            case TypeIndex::Decimal256: return executeType<Decimal256>(arguments);
            default:
                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}",
                                arguments[0].column->getName(), getName());
        }
    }

private:
    /// Builds the result column for a first argument of type T.
    /// The precision column must be either ColumnVector<UInt8> or a constant of it;
    /// throws ILLEGAL_COLUMN if either argument column has an unexpected type.
    template <typename T>
    ColumnPtr executeType(const ColumnsWithTypeAndName & arguments) const
    {
        const auto * precision_col = checkAndGetColumn<ColumnVector<UInt8>>(arguments[1].column.get());
        const auto * precision_col_const = checkAndGetColumnConst<ColumnVector<UInt8>>(arguments[1].column.get());

        auto result_col = ColumnString::create();
        auto * result_col_string = assert_cast<ColumnString *>(result_col.get());
        ColumnString::Chars & result_chars = result_col_string->getChars();
        ColumnString::Offsets & result_offsets = result_col_string->getOffsets();

        if constexpr (is_decimal<T>)
        {
            const auto * from_col = checkAndGetColumn<ColumnDecimal<T>>(arguments[0].column.get());
            if (!from_col)
                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}",
                                arguments[0].column->getName(), getName());

            /// The scale is read only after the null check: from_col is null when the column type does not match.
            UInt8 from_scale = from_col->getScale();

            if (precision_col_const)
                vectorConstant(from_col->getData(), precision_col_const->template getValue<UInt8>(), result_chars, result_offsets, from_scale);
            else if (precision_col)
                vectorVector(from_col->getData(), precision_col->getData(), result_chars, result_offsets, from_scale);
            else
                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of second argument of function {}",
                                arguments[1].column->getName(), getName());
        }
        else
        {
            const auto * from_col = checkAndGetColumn<ColumnVector<T>>(arguments[0].column.get());
            if (!from_col)
                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}",
                                arguments[0].column->getName(), getName());

            if (precision_col_const)
                vectorConstant(from_col->getData(), precision_col_const->template getValue<UInt8>(), result_chars, result_offsets);
            else if (precision_col)
                vectorVector(from_col->getData(), precision_col->getData(), result_chars, result_offsets);
            else
                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of second argument of function {}",
                                arguments[1].column->getName(), getName());
        }

        return result_col;
    }
};
}

View File

@ -905,26 +905,26 @@ inline void writeText(const IPv4 & x, WriteBuffer & buf) { writeIPv4Text(x, buf)
inline void writeText(const IPv6 & x, WriteBuffer & buf) { writeIPv6Text(x, buf); }
template <typename T>
void writeDecimalFractional(const T & x, UInt32 scale, WriteBuffer & ostr, bool trailing_zeros)
void writeDecimalFractional(const T & x, UInt32 scale, WriteBuffer & ostr, bool trailing_zeros,
bool fixed_fractional_length, UInt32 fractional_length)
{
/// If it's big integer, but the number of digits is small,
/// use the implementation for smaller integers for more efficient arithmetic.
if constexpr (std::is_same_v<T, Int256>)
{
if (x <= std::numeric_limits<UInt32>::max())
{
writeDecimalFractional(static_cast<UInt32>(x), scale, ostr, trailing_zeros);
writeDecimalFractional(static_cast<UInt32>(x), scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
return;
}
else if (x <= std::numeric_limits<UInt64>::max())
{
writeDecimalFractional(static_cast<UInt64>(x), scale, ostr, trailing_zeros);
writeDecimalFractional(static_cast<UInt64>(x), scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
return;
}
else if (x <= std::numeric_limits<UInt128>::max())
{
writeDecimalFractional(static_cast<UInt128>(x), scale, ostr, trailing_zeros);
writeDecimalFractional(static_cast<UInt128>(x), scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
return;
}
}
@ -932,24 +932,36 @@ void writeDecimalFractional(const T & x, UInt32 scale, WriteBuffer & ostr, bool
{
if (x <= std::numeric_limits<UInt32>::max())
{
writeDecimalFractional(static_cast<UInt32>(x), scale, ostr, trailing_zeros);
writeDecimalFractional(static_cast<UInt32>(x), scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
return;
}
else if (x <= std::numeric_limits<UInt64>::max())
{
writeDecimalFractional(static_cast<UInt64>(x), scale, ostr, trailing_zeros);
writeDecimalFractional(static_cast<UInt64>(x), scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
return;
}
}
constexpr size_t max_digits = std::numeric_limits<UInt256>::digits10;
assert(scale <= max_digits);
assert(fractional_length <= max_digits);
char buf[max_digits];
memset(buf, '0', scale);
memset(buf, '0', std::max(scale, fractional_length));
T value = x;
Int32 last_nonzero_pos = 0;
for (Int32 pos = scale - 1; pos >= 0; --pos)
if (fixed_fractional_length && fractional_length < scale)
{
T new_value = value / DecimalUtils::scaleMultiplier<Int256>(scale - fractional_length - 1);
auto round_carry = new_value % 10;
value = new_value / 10;
if (round_carry >= 5)
value += 1;
}
for (Int32 pos = fixed_fractional_length ? std::min(scale - 1, fractional_length - 1) : scale - 1; pos >= 0; --pos)
{
auto remainder = value % 10;
value /= 10;
@ -961,11 +973,12 @@ void writeDecimalFractional(const T & x, UInt32 scale, WriteBuffer & ostr, bool
}
writeChar('.', ostr);
ostr.write(buf, trailing_zeros ? scale : last_nonzero_pos + 1);
ostr.write(buf, fixed_fractional_length ? fractional_length : (trailing_zeros ? scale : last_nonzero_pos + 1));
}
template <typename T>
void writeText(Decimal<T> x, UInt32 scale, WriteBuffer & ostr, bool trailing_zeros)
void writeText(Decimal<T> x, UInt32 scale, WriteBuffer & ostr, bool trailing_zeros,
bool fixed_fractional_length = false, UInt32 fractional_length = 0)
{
T part = DecimalUtils::getWholePart(x, scale);
@ -976,7 +989,7 @@ void writeText(Decimal<T> x, UInt32 scale, WriteBuffer & ostr, bool trailing_zer
writeIntText(part, ostr);
if (scale)
if (scale || (fixed_fractional_length && fractional_length > 0))
{
part = DecimalUtils::getFractionalPart(x, scale);
if (part || trailing_zeros)
@ -984,7 +997,7 @@ void writeText(Decimal<T> x, UInt32 scale, WriteBuffer & ostr, bool trailing_zer
if (part < 0)
part *= T(-1);
writeDecimalFractional(part, scale, ostr, trailing_zeros);
writeDecimalFractional(part, scale, ostr, trailing_zeros, fixed_fractional_length, fractional_length);
}
}
}

View File

@ -1210,22 +1210,16 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data &
else if (data.is_create_parameterized_view && query_parameter)
{
const auto data_type = DataTypeFactory::instance().get(query_parameter->type);
/// Use getUniqueName() to allow multiple use of query parameter in the query:
///
/// CREATE VIEW view AS
/// SELECT *
/// FROM system.one
/// WHERE dummy = {k1:Int}+1 OR dummy = {k1:Int}+2
/// ^^ ^^
///
/// NOTE: query in the VIEW will not be modified this is needed
/// only during analysis for CREATE VIEW to avoid duplicated
/// column names.
ColumnWithTypeAndName column(data_type, data.getUniqueName("__" + query_parameter->getColumnName()));
/// During analysis for CREATE VIEW of a parameterized view, if parameter is
/// used multiple times, column is only added once
if (!data.hasColumn(query_parameter->name))
{
ColumnWithTypeAndName column(data_type, query_parameter->name);
data.addColumn(column);
}
argument_types.push_back(data_type);
argument_names.push_back(column.name);
argument_names.push_back(query_parameter->name);
}
else
{

View File

@ -21,6 +21,7 @@
#include <Core/BackgroundSchedulePool.h>
#include <Formats/FormatFactory.h>
#include <Databases/IDatabase.h>
#include <Server/ServerType.h>
#include <Storages/IStorage.h>
#include <Storages/MarkCache.h>
#include <Storages/MergeTree/MergeList.h>
@ -357,6 +358,9 @@ struct ContextSharedPart : boost::noncopyable
Context::ConfigReloadCallback config_reload_callback;
Context::StartStopServersCallback start_servers_callback;
Context::StartStopServersCallback stop_servers_callback;
bool is_server_completely_started = false;
#if USE_ROCKSDB
@ -3688,6 +3692,36 @@ void Context::reloadConfig() const
shared->config_reload_callback();
}
/// Stores the callback that startServers() will invoke.
/// The callback is moved into the shared context part; no lock is taken here.
void Context::setStartServersCallback(StartStopServersCallback && callback)
{
/// Is initialized at server startup, so lock isn't required. Otherwise use mutex.
shared->start_servers_callback = std::move(callback);
}
/// Stores the callback that stopServers() will invoke.
/// The callback is moved into the shared context part; no lock is taken here.
void Context::setStopServersCallback(StartStopServersCallback && callback)
{
/// Is initialized at server startup, so lock isn't required. Otherwise use mutex.
shared->stop_servers_callback = std::move(callback);
}
/// Invokes the registered "start servers" callback for the given server type.
/// Throws LOGICAL_ERROR if no callback has been registered.
void Context::startServers(const ServerType & server_type) const
{
    /// Use mutex if callback may be changed after startup.
    const auto & start_callback = shared->start_servers_callback;
    if (start_callback)
    {
        start_callback(server_type);
        return;
    }

    throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't start servers because start_servers_callback is not set.");
}
/// Invokes the registered "stop servers" callback for the given server type.
/// Throws LOGICAL_ERROR if no callback has been registered.
void Context::stopServers(const ServerType & server_type) const
{
    /// Use mutex if callback may be changed after startup.
    const auto & stop_callback = shared->stop_servers_callback;
    if (stop_callback)
    {
        stop_callback(server_type);
        return;
    }

    throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't stop servers because stop_servers_callback is not set.");
}
void Context::shutdown()
{

View File

@ -134,6 +134,7 @@ using StoragePolicyPtr = std::shared_ptr<const IStoragePolicy>;
using StoragePoliciesMap = std::map<String, StoragePolicyPtr>;
class StoragePolicySelector;
using StoragePolicySelectorPtr = std::shared_ptr<const StoragePolicySelector>;
class ServerType;
template <class Queue>
class MergeTreeBackgroundExecutor;
@ -1057,6 +1058,13 @@ public:
void setConfigReloadCallback(ConfigReloadCallback && callback);
void reloadConfig() const;
using StartStopServersCallback = std::function<void(const ServerType &)>;
void setStartServersCallback(StartStopServersCallback && callback);
void setStopServersCallback(StartStopServersCallback && callback);
void startServers(const ServerType & server_type) const;
void stopServers(const ServerType & server_type) const;
void shutdown();
bool isInternalQuery() const { return is_internal_query; }

View File

@ -349,6 +349,15 @@ DatabaseAndTable DatabaseCatalog::getTableImpl(
DatabasePtr database;
{
// Callers assume that this method doesn't throw exceptions, but getDatabaseName() will throw if there is no database part.
// So, fail early and gracefully...
if (!table_id.hasDatabase())
{
if (exception)
exception->emplace(Exception(ErrorCodes::UNKNOWN_DATABASE, "Empty database name"));
return {};
}
std::lock_guard lock{databases_mutex};
auto it = databases.find(table_id.getDatabaseName());
if (databases.end() == it)

View File

@ -556,6 +556,14 @@ BlockIO InterpreterSystemQuery::execute()
);
break;
}
case Type::STOP_LISTEN:
getContext()->checkAccess(AccessType::SYSTEM_LISTEN);
getContext()->stopServers(query.server_type);
break;
case Type::START_LISTEN:
getContext()->checkAccess(AccessType::SYSTEM_LISTEN);
getContext()->startServers(query.server_type);
break;
case Type::FLUSH_ASYNC_INSERT_QUEUE:
{
getContext()->checkAccess(AccessType::SYSTEM_FLUSH_ASYNC_INSERT_QUEUE);
@ -567,9 +575,6 @@ BlockIO InterpreterSystemQuery::execute()
queue->flushAll();
break;
}
case Type::STOP_LISTEN_QUERIES:
case Type::START_LISTEN_QUERIES:
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "{} is not supported yet", query.type);
case Type::STOP_THREAD_FUZZER:
getContext()->checkAccess(AccessType::SYSTEM_THREAD_FUZZER);
ThreadFuzzer::stop();
@ -1181,8 +1186,12 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster()
required_access.emplace_back(AccessType::SYSTEM_SYNC_FILE_CACHE);
break;
}
case Type::STOP_LISTEN_QUERIES:
case Type::START_LISTEN_QUERIES:
case Type::STOP_LISTEN:
case Type::START_LISTEN:
{
required_access.emplace_back(AccessType::SYSTEM_LISTEN);
break;
}
case Type::STOP_THREAD_FUZZER:
case Type::START_THREAD_FUZZER:
case Type::ENABLE_FAILPOINT:

View File

@ -220,6 +220,17 @@ void ASTSystemQuery::formatImpl(const FormatSettings & settings, FormatState &,
{
settings.ostr << (settings.hilite ? hilite_none : "");
}
else if (type == Type::START_LISTEN || type == Type::STOP_LISTEN)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << " " << ServerType::serverTypeToString(server_type.type)
<< (settings.hilite ? hilite_none : "");
if (server_type.type == ServerType::CUSTOM)
{
settings.ostr << (settings.hilite ? hilite_identifier : "") << " " << backQuoteIfNeed(server_type.custom_name);
}
}
}

View File

@ -3,6 +3,7 @@
#include <Parsers/ASTQueryWithOnCluster.h>
#include <Parsers/IAST.h>
#include <Parsers/SyncReplicaMode.h>
#include <Server/ServerType.h>
#include "config.h"
@ -35,8 +36,8 @@ public:
#if USE_AWS_S3
DROP_S3_CLIENT_CACHE,
#endif
STOP_LISTEN_QUERIES,
START_LISTEN_QUERIES,
STOP_LISTEN,
START_LISTEN,
RESTART_REPLICAS,
RESTART_REPLICA,
RESTORE_REPLICA,
@ -116,6 +117,8 @@ public:
SyncReplicaMode sync_replica_mode = SyncReplicaMode::DEFAULT;
ServerType server_type;
String getID(char) const override { return "SYSTEM query"; }
ASTPtr clone() const override

View File

@ -442,6 +442,42 @@ bool ParserSystemQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected &
break;
}
case Type::START_LISTEN:
case Type::STOP_LISTEN:
{
if (!parseQueryWithOnCluster(res, pos, expected))
return false;
ServerType::Type current_type = ServerType::Type::END;
std::string current_custom_name;
for (const auto & type : magic_enum::enum_values<ServerType::Type>())
{
if (ParserKeyword{ServerType::serverTypeToString(type)}.ignore(pos, expected))
{
current_type = type;
break;
}
}
if (current_type == ServerType::Type::END)
return false;
if (current_type == ServerType::CUSTOM)
{
ASTPtr ast;
if (!ParserStringLiteral{}.parse(pos, ast, expected))
return false;
current_custom_name = ast->as<ASTLiteral &>().value.get<const String &>();
}
res->server_type = ServerType(current_type, current_custom_name);
break;
}
default:
{
if (!parseQueryWithOnCluster(res, pos, expected))

View File

@ -3,8 +3,8 @@ set(SRCS)
clickhouse_add_executable(lexer lexer.cpp ${SRCS})
target_link_libraries(lexer PRIVATE clickhouse_parsers)
clickhouse_add_executable(select_parser select_parser.cpp ${SRCS})
clickhouse_add_executable(select_parser select_parser.cpp ${SRCS} "../../Server/ServerType.cpp")
target_link_libraries(select_parser PRIVATE clickhouse_parsers)
clickhouse_add_executable(create_parser create_parser.cpp ${SRCS})
clickhouse_add_executable(create_parser create_parser.cpp ${SRCS} "../../Server/ServerType.cpp")
target_link_libraries(create_parser PRIVATE clickhouse_parsers)

View File

@ -684,9 +684,6 @@ namespace DB
bool output_fixed_string_as_fixed_byte_array,
std::unordered_map<String, MutableColumnPtr> & dictionary_values)
{
const String column_type_name = column_type->getFamilyName();
WhichDataType which(column_type);
switch (column_type->getTypeId())
{
case TypeIndex::Nullable:
@ -796,7 +793,7 @@ namespace DB
FOR_INTERNAL_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
default:
throw Exception(ErrorCodes::UNKNOWN_TYPE, "Internal type '{}' of a column '{}' is not supported for conversion into {} data format.", column_type_name, column_name, format_name);
throw Exception(ErrorCodes::UNKNOWN_TYPE, "Internal type '{}' of a column '{}' is not supported for conversion into {} data format.", column_type->getFamilyName(), column_name, format_name);
}
}

View File

@ -0,0 +1,628 @@
#include "Processors/Formats/Impl/Parquet/Write.h"
#include <Columns/MaskOperations.h>
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnLowCardinality.h>
#include <Columns/ColumnMap.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeFixedString.h>
/// This file deals with schema conversion and with repetition and definition levels.
/// Schema conversion is pretty straightforward.
/// "Repetition and definition levels" are a somewhat tricky way of encoding information about
/// optional fields and lists.
///
/// If you don't want to learn how these work, feel free to skip the updateRepDefLevels* functions.
/// All you need to know is:
/// * values for nulls are not encoded, so we have to filter nullable columns,
/// * information about all array lengths and nulls is encoded in the arrays `def` and `rep`,
/// which need to be encoded next to the data,
/// * `def` and `rep` arrays can be longer than `primitive_column`, because they include nulls and
/// empty arrays; the values in primitive_column correspond to positions where def[i] == max_def.
///
/// If you do want to learn it, dremel paper: https://research.google/pubs/pub36632/
/// Instead of reading the whole paper, try staring at figures 2-3 for a while - it might be enough.
/// (Why does Parquet do all this instead of just storing array lengths and null masks? I'm not
/// really sure.)
///
/// We calculate the levels recursively, from inner to outer columns.
/// This means scanning the whole array for each Array/Nullable nesting level, which is probably not
/// the most efficient way to do it. But there's usually at most one nesting level, so it's fine.
///
/// Most of this is moot because ClickHouse doesn't support nullable arrays or tuples right now, so
/// almost none of the tricky cases can happen. We implement it in full generality anyway (mostly
/// because I only learned the previous sentence after writing most of the code).
namespace DB::ErrorCodes
{
extern const int UNKNOWN_TYPE;
extern const int TOO_DEEP_RECURSION; // I'm 14 and this is deep
extern const int UNKNOWN_COMPRESSION_METHOD;
extern const int LOGICAL_ERROR;
}
namespace DB::Parquet
{
/// Thrift structs that Parquet uses for various metadata inside the parquet file.
namespace parq = parquet::format;
namespace
{
/// Must be called before incrementing `s.max_def`: throws TOO_DEEP_RECURSION if the counter
/// already holds UINT8_MAX and therefore cannot be incremented.
void assertNoDefOverflow(ColumnChunkWriteState & s)
{
    const bool has_room_for_one_more_level = s.max_def != UINT8_MAX;
    if (has_room_for_one_more_level)
        return;

    throw Exception(ErrorCodes::TOO_DEEP_RECURSION,
        "Column has more than 255 levels of nested Array/Nullable. Impressive! Unfortunately, "
        "this is not supported by this Parquet encoder (but is supported by Parquet, if you "
        "really need this for some reason).");
}
/// Applies one Nullable nesting level to the write state `s` of a leaf column:
/// increments `s.max_def`, raises the definition level of every entry whose row is non-null
/// according to `null_map`, and filters `s.primitive_column` so that values of null rows are dropped.
/// Throws TOO_DEEP_RECURSION (via assertNoDefOverflow) if `s.max_def` is already UINT8_MAX.
void updateRepDefLevelsAndFilterColumnForNullable(ColumnChunkWriteState & s, const NullMap & null_map)
{
/// Increment definition levels for non-nulls.
/// Filter the column to contain only non-null values.
assertNoDefOverflow(s);
++s.max_def;
/// Normal case: no arrays or nullables inside this nullable.
if (s.max_def == 1)
{
chassert(s.def.empty());
s.def.resize(null_map.size());
/// def[i] is 1 for a non-null row and 0 for a null row.
for (size_t i = 0; i < s.def.size(); ++i)
s.def[i] = !null_map[i];
/// We could be more efficient with this:
/// * Instead of doing the filter() here, we could defer it to writeColumnChunkBody(), at
/// least in the simple case of Nullable(Primitive). Then it'll parallelize if the table
/// consists of one big tuple.
/// * Instead of filtering explicitly, we could build filtering into the data encoder.
/// * Instead of filling out the `def` values above, we could point to null_map and build
/// the '!' into the encoder.
/// None of these seem worth the complexity right now.
/// Here `def` doubles as the filter mask (nonzero = keep the value).
s.primitive_column = s.primitive_column->filter(s.def, /*result_size_hint*/ -1);
return;
}
/// Weird general case: Nullable(Array), Nullable(Nullable), or any arbitrary nesting like that.
/// This is currently not allowed in ClickHouse, but let's support it anyway just in case.
IColumn::Filter filter;
/// Starts at "-1" (wraps around) so that the first increment below yields row 0.
size_t row_idx = static_cast<size_t>(-1);
for (size_t i = 0; i < s.def.size(); ++i)
{
/// Advance to the next row of `null_map` at every entry if there are no repetition levels,
/// otherwise only at entries whose repetition level is 0.
row_idx += s.max_rep == 0 || s.rep[i] == 0;
/// Only entries that were at the previous maximum definition level get a filter entry
/// (the filter is applied to primitive_column below).
if (s.def[i] == s.max_def - 1)
filter.push_back(!null_map[row_idx]);
s.def[i] += !null_map[row_idx];
}
s.primitive_column = s.primitive_column->filter(filter, /*result_size_hint*/ -1);
}
/// Applies one Array nesting level (described by `offsets`) to the write state `s` of a leaf column:
/// increments `s.max_def` and `s.max_rep` and rewrites `s.def` / `s.rep` accordingly.
/// `s.primitive_column` itself is not modified.
/// Throws TOO_DEEP_RECURSION (via assertNoDefOverflow) if `s.max_def` is already UINT8_MAX.
void updateRepDefLevelsForArray(ColumnChunkWriteState & s, const IColumn::Offsets & offsets)
{
/// Increment all definition levels.
/// For non-first elements of arrays, increment repetition levels.
/// For empty arrays, insert a zero into repetition and definition levels arrays.
assertNoDefOverflow(s);
++s.max_def;
++s.max_rep;
/// Common case: no arrays or nullables inside this array.
if (s.max_rep == 1 && s.max_def == 1)
{
s.def.resize_fill(s.primitive_column->size(), 1);
s.rep.resize_fill(s.primitive_column->size(), 1);
size_t i = 0;
for (ssize_t row = 0; row < static_cast<ssize_t>(offsets.size()); ++row)
{
/// NOTE(review): for row == 0 this reads offsets[-1]; presumably the offsets container
/// guarantees that element reads as 0 - confirm.
size_t n = offsets[row] - offsets[row - 1];
if (n)
{
/// First element of a non-empty array gets rep = 0; the remaining n - 1 keep rep = 1.
s.rep[i] = 0;
i += n;
}
else
{
/// Empty array: a (def = 0, rep = 0) entry has to appear at position i.
/// All entries at positions >= i are still 1 here, so appending a 1 at the end and
/// zeroing position i has the same effect as inserting a zero at position i.
s.def.push_back(1);
s.rep.push_back(1);
s.def[i] = 0;
s.rep[i] = 0;
i += 1;
}
}
return;
}
/// General case: Array(Array), Array(Nullable), or any arbitrary nesting like that.
for (auto & x : s.def)
++x;
if (s.max_rep == 1)
s.rep.resize_fill(s.def.size(), 1);
else
for (auto & x : s.rep)
++x;
PaddedPODArray<UInt8> mask(s.def.size(), 1); // for inserting zeroes to rep and def
size_t i = 0; // in the input (s.def/s.rep)
size_t empty_arrays = 0;
for (ssize_t row = 0; row < static_cast<ssize_t>(offsets.size()); ++row)
{
/// NOTE(review): same offsets[-1] read as in the common case above - confirm it yields 0.
size_t n = offsets[row] - offsets[row - 1];
if (n)
{
/// Un-increment the first rep of the array.
/// Skip n "items" in the nested column; first element of each item has rep = 1
/// (we incremented it above).
chassert(s.rep[i] == 1);
--s.rep[i];
do
{
++i;
if (i == s.rep.size())
{
/// Reached the end of the levels: this must have been the last item.
--n;
chassert(n == 0);
break;
}
n -= s.rep[i] == 1;
} while (n);
}
else
{
/// Empty array: the mask grows by one element and gets a 0 at the output position
/// (input position i shifted by the number of empty arrays seen so far).
mask.push_back(1);
mask[i + empty_arrays] = 0;
++empty_arrays;
}
}
if (empty_arrays != 0)
{
/// NOTE(review): presumably expandDataByMask inserts a default (zero) value wherever mask is 0 - confirm.
expandDataByMask(s.def, mask, false);
expandDataByMask(s.rep, mask, false);
}
}
/// Translates a ClickHouse CompressionMethod into the corresponding Parquet codec enum value.
/// Throws UNKNOWN_COMPRESSION_METHOD for methods that have no mapping here.
parq::CompressionCodec::type compressionMethodToParquet(CompressionMethod c)
{
    using Codec = parq::CompressionCodec;

    switch (c)
    {
        case CompressionMethod::None:
            return Codec::UNCOMPRESSED;
        case CompressionMethod::Snappy:
            return Codec::SNAPPY;
        case CompressionMethod::Gzip:
            return Codec::GZIP;
        case CompressionMethod::Brotli:
            return Codec::BROTLI;
        case CompressionMethod::Lz4:
            return Codec::LZ4_RAW;
        case CompressionMethod::Zstd:
            return Codec::ZSTD;
        default:
            break;
    }

    throw Exception(ErrorCodes::UNKNOWN_COMPRESSION_METHOD, "Compression method {} is not supported by Parquet", toContentEncodingName(c));
}
/// Depth-first traversal of the schema tree for this column.
void prepareColumnRecursive(
ColumnPtr column, DataTypePtr type, const std::string & name, const WriteOptions & options,
ColumnChunkWriteStates & states, SchemaElements & schemas);
/// Handles a leaf (non-nested) column: appends one ColumnChunkWriteState to `states` and one
/// SchemaElement to `schemas`, then fills in the Parquet physical type and, where applicable,
/// the converted/logical type according to the ClickHouse type id.
/// The schema element is created as REQUIRED; callers (Nullable/Array/Map handling) adjust it afterwards.
/// Throws UNKNOWN_TYPE if the type id has no mapping in the switch below.
void preparePrimitiveColumn(ColumnPtr column, DataTypePtr type, const std::string & name,
const WriteOptions & options, ColumnChunkWriteStates & states, SchemaElements & schemas)
{
/// Add physical column info.
auto & state = states.emplace_back();
state.primitive_column = column;
state.compression = options.compression;
state.column_chunk.__isset.meta_data = true;
state.column_chunk.meta_data.__set_path_in_schema({name});
state.column_chunk.meta_data.__set_codec(compressionMethodToParquet(state.compression));
/// Add logical schema leaf.
auto & schema = schemas.emplace_back();
schema.__set_repetition_type(parq::FieldRepetitionType::REQUIRED);
schema.__set_name(name);
/// Convert the type enums.
using T = parq::Type;
using C = parq::ConvertedType;
/// Sets the physical type on both the column chunk metadata and the schema element,
/// plus the optional converted and logical types on the schema element.
auto types = [&](T::type type_, std::optional<C::type> converted = std::nullopt, std::optional<parq::LogicalType> logical = std::nullopt)
{
state.column_chunk.meta_data.__set_type(type_);
schema.__set_type(type_);
if (converted)
schema.__set_converted_type(*converted);
if (logical)
schema.__set_logicalType(*logical);
};
/// Builds an INTEGER logical type with the given bit width and signedness.
auto int_type = [](Int8 bits, bool signed_)
{
parq::LogicalType t;
t.__isset.INTEGER = true;
t.INTEGER.__set_bitWidth(bits);
t.INTEGER.__set_isSigned(signed_);
return t;
};
/// Same as `types`, but for FIXED_LEN_BYTE_ARRAY with an explicit value length in bytes.
auto fixed_string = [&](size_t size, std::optional<C::type> converted = std::nullopt, std::optional<parq::LogicalType> logical = std::nullopt)
{
state.column_chunk.meta_data.__set_type(parq::Type::FIXED_LEN_BYTE_ARRAY);
schema.__set_type(parq::Type::FIXED_LEN_BYTE_ARRAY);
schema.__set_type_length(static_cast<Int32>(size));
if (converted)
schema.__set_converted_type(*converted);
if (logical)
schema.__set_logicalType(*logical);
};
/// Declares a DECIMAL stored as FIXED_LEN_BYTE_ARRAY of `bytes` bytes; scale and precision are
/// written both to the schema element fields and to the DECIMAL logical type.
auto decimal = [&](Int32 bytes, UInt32 precision, UInt32 scale)
{
state.column_chunk.meta_data.__set_type(parq::Type::FIXED_LEN_BYTE_ARRAY);
schema.__set_type(parq::Type::FIXED_LEN_BYTE_ARRAY);
schema.__set_type_length(bytes);
schema.__set_scale(static_cast<Int32>(scale));
schema.__set_precision(static_cast<Int32>(precision));
schema.__set_converted_type(parq::ConvertedType::DECIMAL);
parq::DecimalType d;
d.__set_scale(static_cast<Int32>(scale));
d.__set_precision(static_cast<Int32>(precision));
parq::LogicalType t;
t.__set_DECIMAL(d);
schema.__set_logicalType(t);
};
switch (type->getTypeId())
{
case TypeIndex::UInt8:
/// Bool shares the UInt8 type id; it is written as BOOLEAN and flagged in the state.
if (isBool(type))
{
types(T::BOOLEAN);
state.is_bool = true;
}
else
{
types(T::INT32, C::UINT_8 , int_type(8 , false));
}
break;
case TypeIndex::UInt16: types(T::INT32, C::UINT_16, int_type(16, false)); break;
case TypeIndex::UInt32: types(T::INT32, C::UINT_32, int_type(32, false)); break;
case TypeIndex::UInt64: types(T::INT64, C::UINT_64, int_type(64, false)); break;
case TypeIndex::Int8: types(T::INT32, C::INT_8 , int_type(8 , true)); break;
case TypeIndex::Int16: types(T::INT32, C::INT_16 , int_type(16, true)); break;
case TypeIndex::Int32: types(T::INT32); break;
case TypeIndex::Int64: types(T::INT64); break;
case TypeIndex::Float32: types(T::FLOAT); break;
case TypeIndex::Float64: types(T::DOUBLE); break;
/// These don't have suitable parquet logical types, so we write them as plain numbers.
/// (Parquet has "enums" but they're just strings, with nowhere to declare all possible enum
/// values in advance as part of the data type.)
case TypeIndex::Enum8: types(T::INT32, C::INT_8 , int_type(8 , true)); break; // Int8
case TypeIndex::Enum16: types(T::INT32, C::INT_16 , int_type(16, true)); break; // Int16
case TypeIndex::IPv4: types(T::INT32, C::UINT_32, int_type(32, false)); break; // UInt32
case TypeIndex::Date: types(T::INT32, C::UINT_16, int_type(16, false)); break; // UInt16
case TypeIndex::DateTime: types(T::INT32, C::UINT_32, int_type(32, false)); break; // UInt32
case TypeIndex::Date32:
{
parq::LogicalType t;
t.__set_DATE({});
types(T::INT32, C::DATE, t);
break;
}
case TypeIndex::DateTime64:
{
/// Only scales 3, 6 and 9 get a TIMESTAMP logical type (and only 3 and 6 a converted type);
/// any other scale leaves both optionals empty and is written as a plain INT64.
std::optional<parq::ConvertedType::type> converted;
std::optional<parq::TimeUnit> unit;
switch (assert_cast<const DataTypeDateTime64 &>(*type).getScale())
{
case 3:
converted = parq::ConvertedType::TIMESTAMP_MILLIS;
unit.emplace().__set_MILLIS({});
break;
case 6:
converted = parq::ConvertedType::TIMESTAMP_MICROS;
unit.emplace().__set_MICROS({});
break;
case 9:
unit.emplace().__set_NANOS({});
break;
}
std::optional<parq::LogicalType> t;
if (unit)
{
parq::TimestampType tt;
tt.__set_isAdjustedToUTC(true);
tt.__set_unit(*unit);
t.emplace().__set_TIMESTAMP(tt);
}
types(T::INT64, converted, t);
break;
}
case TypeIndex::String:
case TypeIndex::FixedString:
{
/// FixedString becomes FIXED_LEN_BYTE_ARRAY only when the corresponding option is set;
/// otherwise it is handled exactly like String.
if (options.output_fixed_string_as_fixed_byte_array &&
type->getTypeId() == TypeIndex::FixedString)
{
fixed_string(assert_cast<const DataTypeFixedString &>(*type).getN());
}
else if (options.output_string_as_string)
{
parq::LogicalType t;
t.__set_STRING({});
types(T::BYTE_ARRAY, C::UTF8, t);
}
else
{
types(T::BYTE_ARRAY);
}
break;
}
/// Parquet doesn't have logical types for these.
case TypeIndex::UInt128: fixed_string(16); break;
case TypeIndex::UInt256: fixed_string(32); break;
case TypeIndex::Int128: fixed_string(16); break;
case TypeIndex::Int256: fixed_string(32); break;
case TypeIndex::IPv6: fixed_string(16); break;
case TypeIndex::Decimal32: decimal(4 , getDecimalPrecision(*type), getDecimalScale(*type)); break;
case TypeIndex::Decimal64: decimal(8 , getDecimalPrecision(*type), getDecimalScale(*type)); break;
case TypeIndex::Decimal128: decimal(16, getDecimalPrecision(*type), getDecimalScale(*type)); break;
case TypeIndex::Decimal256: decimal(32, getDecimalPrecision(*type), getDecimalScale(*type)); break;
default:
throw Exception(ErrorCodes::UNKNOWN_TYPE, "Internal type '{}' of column '{}' is not supported for conversion into Parquet data format.", type->getFamilyName(), name);
}
}
/// Handles a Nullable(...) column: recurses into the nested column/type, marks the resulting
/// schema subtree as OPTIONAL, and applies the null map to every leaf state produced by the
/// recursion (updating definition levels and filtering out values of null rows).
void prepareColumnNullable(
    ColumnPtr column, DataTypePtr type, const std::string & name, const WriteOptions & options,
    ColumnChunkWriteStates & states, SchemaElements & schemas)
{
    const ColumnNullable * column_nullable = assert_cast<const ColumnNullable *>(column.get());
    ColumnPtr nested_column = column_nullable->getNestedColumnPtr();
    DataTypePtr nested_type = assert_cast<const DataTypeNullable *>(type.get())->getNestedType();
    const NullMap & null_map = column_nullable->getNullMapData();

    /// Remember where the nested column's states and schema start, to post-process them below.
    size_t child_states_begin = states.size();
    size_t child_schema_idx = schemas.size();

    prepareColumnRecursive(nested_column, nested_type, name, options, states, schemas);

    if (schemas[child_schema_idx].repetition_type == parq::FieldRepetitionType::REQUIRED)
    {
        /// Normal case: we just slap a FieldRepetitionType::OPTIONAL onto the nested column.
        schemas[child_schema_idx].repetition_type = parq::FieldRepetitionType::OPTIONAL;
    }
    else
    {
        /// Weird case: Nullable(Nullable(...)). Or Nullable(Tuple(Nullable(...))), etc.
        /// This is probably not allowed in ClickHouse, but let's support it just in case.
        /// The child is not REQUIRED, so wrap it into an extra OPTIONAL group.
        auto & schema = *schemas.insert(schemas.begin() + child_schema_idx, {});
        schema.__set_repetition_type(parq::FieldRepetitionType::OPTIONAL);
        schema.__set_name("nullable");
        schema.__set_num_children(1);
        for (size_t i = child_states_begin; i < states.size(); ++i)
        {
            Strings & path = states[i].column_chunk.meta_data.path_in_schema;
            /// Each path element is the bare name of a schema node (no separator), exactly like
            /// the Tuple/Array/Map handling does, so that the path matches the schema tree.
            path.insert(path.begin(), schema.name);
        }
    }

    for (size_t i = child_states_begin; i < states.size(); ++i)
    {
        auto & s = states[i];
        updateRepDefLevelsAndFilterColumnForNullable(s, null_map);
    }
}
/// Handles a Tuple column: emits a REQUIRED group schema node named `name` with one child per
/// tuple element, recurses into every element, and prefixes the schema path of each leaf
/// produced by the recursion with `name`.
void prepareColumnTuple(
    ColumnPtr column, DataTypePtr type, const std::string & name, const WriteOptions & options,
    ColumnChunkWriteStates & states, SchemaElements & schemas)
{
    const auto * tuple_column = assert_cast<const ColumnTuple *>(column.get());
    const auto * tuple_type = assert_cast<const DataTypeTuple *>(type.get());

    {
        auto & group = schemas.emplace_back();
        group.__set_repetition_type(parq::FieldRepetitionType::REQUIRED);
        group.__set_name(name);
        group.__set_num_children(static_cast<Int32>(tuple_type->getElements().size()));
    }

    const size_t first_child_state = states.size();

    /// Element names are looked up by 1-based position.
    for (size_t pos = 0; pos < tuple_type->getElements().size(); ++pos)
    {
        prepareColumnRecursive(
            tuple_column->getColumnPtr(pos), tuple_type->getElement(pos), tuple_type->getNameByPosition(pos + 1),
            options, states, schemas);
    }

    /// O(nesting_depth^2), but who cares.
    for (size_t idx = first_child_state; idx < states.size(); ++idx)
    {
        Strings & leaf_path = states[idx].column_chunk.meta_data.path_in_schema;
        leaf_path.insert(leaf_path.begin(), name);
    }
}
/// Handles an Array column using the Parquet list layout
/// (https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists):
///
///   required group `name` (List):
///     repeated group "list":
///       <recurse into nested type> "element"
///
/// After recursing, every leaf produced for the nested column gets its schema path prefixed
/// with {name, "list"} and its repetition/definition levels updated from the array offsets.
void prepareColumnArray(
    ColumnPtr column, DataTypePtr type, const std::string & name, const WriteOptions & options,
    ColumnChunkWriteStates & states, SchemaElements & schemas)
{
    const auto * array_column = assert_cast<const ColumnArray *>(column.get());
    ColumnPtr elements_column = array_column->getDataPtr();
    DataTypePtr elements_type = assert_cast<const DataTypeArray *>(type.get())->getNestedType();
    const auto & array_offsets = array_column->getOffsets();

    /// Append both group nodes first, then take references (a reference obtained before the
    /// second emplace_back could be invalidated by it).
    schemas.emplace_back();
    schemas.emplace_back();
    auto & outer_group = schemas[schemas.size() - 2];
    auto & repeated_group = schemas[schemas.size() - 1];

    outer_group.__set_repetition_type(parq::FieldRepetitionType::REQUIRED);
    outer_group.__set_name(name);
    outer_group.__set_num_children(1);
    outer_group.__set_converted_type(parq::ConvertedType::LIST);
    outer_group.__isset.logicalType = true;
    outer_group.logicalType.__set_LIST({});

    repeated_group.__set_repetition_type(parq::FieldRepetitionType::REPEATED);
    repeated_group.__set_name("list");
    repeated_group.__set_num_children(1);

    /// Copy the names out now: the recursion below appends to `schemas`.
    std::array<std::string, 2> leaf_path_prefix = {outer_group.name, repeated_group.name};
    const size_t first_child_state = states.size();

    prepareColumnRecursive(elements_column, elements_type, "element", options, states, schemas);

    /// Fully-qualify leaf names (x -> myarray.list.x) and account for this array level.
    for (size_t idx = first_child_state; idx < states.size(); ++idx)
    {
        auto & leaf = states[idx];
        Strings & leaf_path = leaf.column_chunk.meta_data.path_in_schema;
        leaf_path.insert(leaf_path.begin(), leaf_path_prefix.begin(), leaf_path_prefix.end());
        updateRepDefLevelsForArray(leaf, array_offsets);
    }
}
/// Handles a Map column, treating it as an array of (key, value) tuples
/// (https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#maps):
///
///   required group `name` (Map):
///     repeated group "key_value":
///       required <...> "key"
///       <...> "value"
void prepareColumnMap(
    ColumnPtr column, DataTypePtr type, const std::string & name, const WriteOptions & options,
    ColumnChunkWriteStates & states, SchemaElements & schemas)
{
    const auto * map_column = assert_cast<const ColumnMap *>(column.get());
    const auto & nested_array = map_column->getNestedColumn();
    const auto & array_offsets = nested_array.getOffsets();
    ColumnPtr key_value_column = nested_array.getDataPtr();

    const auto * map_type = assert_cast<const DataTypeMap *>(type.get());
    DataTypePtr key_value_type = std::make_shared<DataTypeTuple>(map_type->getKeyValueTypes(), Strings{"key", "value"});

    {
        auto & map_group = schemas.emplace_back();
        map_group.__set_repetition_type(parq::FieldRepetitionType::REQUIRED);
        map_group.__set_name(name);
        map_group.__set_num_children(1);
        map_group.__set_converted_type(parq::ConvertedType::MAP);
        map_group.__set_logicalType({});
        map_group.logicalType.__set_MAP({});
    }

    const size_t key_value_schema_idx = schemas.size();
    const size_t first_child_state = states.size();

    /// The tuple handler emits the "key_value" group as REQUIRED; patch it afterwards.
    prepareColumnTuple(key_value_column, key_value_type, "key_value", options, states, schemas);

    auto & key_value_group = schemas[key_value_schema_idx];
    key_value_group.__set_repetition_type(parq::FieldRepetitionType::REPEATED);
    key_value_group.__set_converted_type(parq::ConvertedType::MAP_KEY_VALUE);

    for (size_t idx = first_child_state; idx < states.size(); ++idx)
    {
        auto & leaf = states[idx];
        Strings & leaf_path = leaf.column_chunk.meta_data.path_in_schema;
        leaf_path.insert(leaf_path.begin(), name);
        updateRepDefLevelsForArray(leaf, array_offsets);
    }
}
/// Dispatches on the type id: nested types (Nullable, Array, Tuple, Map) go to their dedicated
/// handlers, LowCardinality is unwrapped to its dictionary type, everything else is a leaf.
void prepareColumnRecursive(
    ColumnPtr column, DataTypePtr type, const std::string & name, const WriteOptions & options,
    ColumnChunkWriteStates & states, SchemaElements & schemas)
{
    const auto type_id = type->getTypeId();

    if (type_id == TypeIndex::Nullable)
    {
        prepareColumnNullable(column, type, name, options, states, schemas);
        return;
    }
    if (type_id == TypeIndex::Array)
    {
        prepareColumnArray(column, type, name, options, states, schemas);
        return;
    }
    if (type_id == TypeIndex::Tuple)
    {
        prepareColumnTuple(column, type, name, options, states, schemas);
        return;
    }
    if (type_id == TypeIndex::Map)
    {
        prepareColumnMap(column, type, name, options, states, schemas);
        return;
    }
    if (type_id == TypeIndex::LowCardinality)
    {
        auto dictionary_type = assert_cast<const DataTypeLowCardinality &>(*type).getDictionaryType();
        if (!dictionary_type->isNullable())
        {
            /// Use nested data type, but keep ColumnLowCardinality. The encoder can deal with it.
            preparePrimitiveColumn(column, dictionary_type, name, options, states, schemas);
            return;
        }

        /// Nullable dictionary: materialize the full column and treat it as a regular Nullable.
        prepareColumnNullable(
            column->convertToFullColumnIfLowCardinality(), dictionary_type, name, options, states, schemas);
        return;
    }

    preparePrimitiveColumn(column, type, name, options, states, schemas);
}
}
/// Builds the flattened Parquet schema for a block: a root element named "schema" whose
/// child count is the number of columns, followed by the schema elements of every column.
SchemaElements convertSchema(const Block & sample, const WriteOptions & options)
{
    SchemaElements result;

    {
        auto & root_element = result.emplace_back();
        root_element.__set_name("schema");
        root_element.__set_num_children(static_cast<Int32>(sample.columns()));
    }

    /// Only the schema is requested here, so no column write states are collected.
    for (const auto & column_with_type : sample)
    {
        prepareColumnForWrite(
            column_with_type.column, column_with_type.type, column_with_type.name, options,
            /*out_columns_to_write*/ nullptr, &result);
    }

    return result;
}
/// Converts one column into Parquet leaf column states and/or schema elements.
///  * `out_columns_to_write` - if not null, receives one write state per leaf column;
///  * `out_schema` - if not null, receives the schema elements of this column's subtree.
/// Throws LOGICAL_ERROR if an empty column is passed while write states are requested
/// (an empty column is fine when only the schema is needed).
void prepareColumnForWrite(
    ColumnPtr column, DataTypePtr type, const std::string & name, const WriteOptions & options,
    ColumnChunkWriteStates * out_columns_to_write, SchemaElements * out_schema)
{
    if (column->empty() && out_columns_to_write != nullptr)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty column passed to Parquet encoder");

    ColumnChunkWriteStates states;
    SchemaElements schemas;
    prepareColumnRecursive(column, type, name, options, states, schemas);

    if (out_columns_to_write)
        for (auto & s : states)
            out_columns_to_write->push_back(std::move(s));
    if (out_schema)
        out_schema->insert(out_schema->end(), schemas.begin(), schemas.end());

    /// `states` and `schemas` are locals and are simply destroyed here; no explicit cleanup needed.
}
}

View File

@ -0,0 +1,35 @@
#include <Processors/Formats/Impl/Parquet/ThriftUtil.h>
#include <thrift/protocol/TCompactProtocol.h>
namespace DB::Parquet
{
/// Thrift transport that forwards everything written to it into a WriteBuffer
/// and keeps a running count of the bytes written.
/// NOTE(review): `write` is not marked `override`; presumably it is reached because the protocol
/// is instantiated for this concrete transport type (see serializeThriftStruct) - confirm
/// against Thrift's TTransport.
class WriteBufferTransport : public apache::thrift::transport::TTransport
{
public:
/// Destination buffer; held by reference, so it must outlive this transport.
WriteBuffer & out;
/// Total number of bytes passed to write() so far.
size_t bytes = 0;
explicit WriteBufferTransport(WriteBuffer & out_) : out(out_) {}
/// Writes `len` bytes starting at `buf` to `out` and adds `len` to `bytes`.
void write(const uint8_t* buf, uint32_t len)
{
out.write(reinterpret_cast<const char *>(buf), len);
bytes += len;
}
};
template <typename T>
size_t serializeThriftStruct(const T & obj, WriteBuffer & out)
{
auto trans = std::make_shared<WriteBufferTransport>(out);
auto proto = apache::thrift::protocol::TCompactProtocolFactoryT<WriteBufferTransport>().getProtocol(trans);
obj.write(proto.get());
return trans->bytes;
}
template size_t serializeThriftStruct<parquet::format::PageHeader>(const parquet::format::PageHeader &, WriteBuffer & out);
template size_t serializeThriftStruct<parquet::format::ColumnChunk>(const parquet::format::ColumnChunk &, WriteBuffer & out);
template size_t serializeThriftStruct<parquet::format::FileMetaData>(const parquet::format::FileMetaData &, WriteBuffer & out);
}

View File

@ -0,0 +1,17 @@
#pragma once
#include <generated/parquet_types.h> // in contrib/arrow/cpp/src/ , generated from parquet.thrift
#include <IO/WriteBuffer.h>
namespace DB::Parquet
{
/// Returns number of bytes written.
template <typename T>
size_t serializeThriftStruct(const T & obj, WriteBuffer & out);
extern template size_t serializeThriftStruct<parquet::format::PageHeader>(const parquet::format::PageHeader &, WriteBuffer & out);
extern template size_t serializeThriftStruct<parquet::format::ColumnChunk>(const parquet::format::ColumnChunk &, WriteBuffer & out);
extern template size_t serializeThriftStruct<parquet::format::FileMetaData>(const parquet::format::FileMetaData &, WriteBuffer & out);
}

View File

@ -0,0 +1,911 @@
#include "Processors/Formats/Impl/Parquet/Write.h"
#include "Processors/Formats/Impl/Parquet/ThriftUtil.h"
#include <parquet/encoding.h>
#include <parquet/schema.h>
#include <arrow/util/rle_encoding.h>
#include <lz4.h>
#include <Columns/MaskOperations.h>
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnDecimal.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnMap.h>
#include <IO/WriteHelpers.h>
#include "config_version.h"
#if USE_SNAPPY
#include <snappy.h>
#endif
namespace DB::ErrorCodes
{
extern const int CANNOT_COMPRESS;
extern const int LIMIT_EXCEEDED;
extern const int LOGICAL_ERROR;
}
namespace DB::Parquet
{
namespace parq = parquet::format;
namespace
{
/// Running min/max statistics for a numeric column.
/// `T` is the type the statistics are kept and serialized in; `SourceType` is the type of the
/// values passed to add(), which are converted to `T` with static_cast.
template <typename T, typename SourceType>
struct StatisticsNumeric
{
    /// Both bounds start at the identity element of the corresponding operation, so that the first
    /// added value always replaces them. Note that std::numeric_limits<T>::min() must NOT be used
    /// for `max`: for floating-point types it is the smallest positive normal value, which would
    /// produce a wrong (positive) maximum for columns containing only negative values.
    T min = std::numeric_limits<T>::has_infinity
        ? std::numeric_limits<T>::infinity() : std::numeric_limits<T>::max();
    T max = std::numeric_limits<T>::has_infinity
        ? -std::numeric_limits<T>::infinity() : std::numeric_limits<T>::lowest();

    /// Accounts for one value.
    void add(SourceType x)
    {
        min = std::min(min, static_cast<T>(x));
        max = std::max(max, static_cast<T>(x));
    }

    /// Combines with statistics collected elsewhere.
    void merge(const StatisticsNumeric & s)
    {
        min = std::min(min, s.min);
        max = std::max(max, s.max);
    }

    /// Resets to the initial (empty) state.
    void clear() { *this = {}; }

    /// Serializes min/max as the raw bytes of `T` into min_value/max_value.
    /// For signed `T` the same bytes are additionally stored in the `min`/`max` fields.
    parq::Statistics get(const WriteOptions &) const
    {
        parq::Statistics s;
        s.__isset.min_value = s.__isset.max_value = true;
        s.min_value.resize(sizeof(T));
        s.max_value.resize(sizeof(T));
        memcpy(s.min_value.data(), &min, sizeof(T));
        memcpy(s.max_value.data(), &max, sizeof(T));

        if constexpr (std::is_signed<T>::value)
        {
            s.__set_min(s.min_value);
            s.__set_max(s.max_value);
        }
        return s;
    }
};
/// Min/max statistics over fixed-length byte strings, compared with memcmp.
/// Only pointers to the extreme values are stored (no copies), so the pointed-to data has to
/// stay alive for as long as the statistics are used.
struct StatisticsFixedStringRef
{
    /// Length of every value in bytes; UINT64_MAX means "not set yet".
    size_t fixed_string_size = UINT64_MAX;
    /// nullptr means that no value has been added so far.
    const uint8_t * min = nullptr;
    const uint8_t * max = nullptr;

    void add(parquet::FixedLenByteArray a)
    {
        chassert(fixed_string_size != UINT64_MAX);
        const uint8_t * value = a.ptr;
        addMin(value);
        addMax(value);
    }

    void merge(const StatisticsFixedStringRef & s)
    {
        chassert(fixed_string_size == UINT64_MAX || fixed_string_size == s.fixed_string_size);
        fixed_string_size = s.fixed_string_size;

        /// The other side may be empty; then there is nothing to take from it except the size.
        if (s.min != nullptr)
        {
            addMin(s.min);
            addMax(s.max);
        }
    }

    /// Forgets the values but keeps `fixed_string_size`.
    void clear()
    {
        min = nullptr;
        max = nullptr;
    }

    /// Returns empty statistics if nothing was added or the values exceed options.max_statistics_size.
    parq::Statistics get(const WriteOptions & options) const
    {
        parq::Statistics result;
        const bool has_values = min != nullptr;
        if (has_values && !(fixed_string_size > options.max_statistics_size))
        {
            result.__set_min_value(std::string(reinterpret_cast<const char *>(min), fixed_string_size));
            result.__set_max_value(std::string(reinterpret_cast<const char *>(max), fixed_string_size));
        }
        return result;
    }

    void addMin(const uint8_t * p)
    {
        const bool is_new_min = min == nullptr || memcmp(p, min, fixed_string_size) < 0;
        if (is_new_min)
            min = p;
    }

    void addMax(const uint8_t * p)
    {
        const bool is_new_max = max == nullptr || memcmp(p, max, fixed_string_size) > 0;
        if (is_new_max)
            max = p;
    }
};
/// Min/max statistics for byte strings of compile-time length `S`.
/// Unlike StatisticsFixedStringRef, the extreme values are copied into the object.
template<size_t S>
struct StatisticsFixedStringCopy
{
    /// True until the first value has been added or merged in.
    bool empty = true;
    std::array<uint8_t, S> min {};
    std::array<uint8_t, S> max {};

    /// Takes one value into account.
    void add(parquet::FixedLenByteArray a)
    {
        const uint8_t * bytes = a.ptr;
        addMin(bytes);
        addMax(bytes);
        /// Must happen after both calls above: they consult `empty`.
        empty = false;
    }

    /// Folds `other` into this object; a no-op if `other` is empty.
    void merge(const StatisticsFixedStringCopy<S> & other)
    {
        if (!other.empty)
        {
            addMin(other.min.data());
            addMax(other.max.data());
            empty = false;
        }
    }

    /// Marks the object as empty again; the stored bytes are left as they are.
    void clear() { empty = true; }

    /// Returns min/max as thrift statistics, or an unset struct when empty.
    parq::Statistics get(const WriteOptions &) const
    {
        parq::Statistics result;
        if (!empty)
        {
            result.__set_min_value(std::string(reinterpret_cast<const char *>(min.data()), S));
            result.__set_max_value(std::string(reinterpret_cast<const char *>(max.data()), S));
        }
        return result;
    }

    /// Copies `candidate` into `min` if the object is empty or `candidate` compares lower.
    void addMin(const uint8_t * candidate)
    {
        const bool replace = empty || memcmp(candidate, min.data(), S) < 0;
        if (replace)
            memcpy(min.data(), candidate, S);
    }

    /// Copies `candidate` into `max` if the object is empty or `candidate` compares higher.
    void addMax(const uint8_t * candidate)
    {
        const bool replace = empty || memcmp(candidate, max.data(), S) > 0;
        if (replace)
            memcpy(max.data(), candidate, S);
    }
};
/// Min/max statistics for variable-length byte strings.
/// Stores `parquet::ByteArray` values as given, i.e. a length plus a pointer; the bytes themselves
/// are not copied. A null `ptr` in `min` means that nothing has been added yet.
struct StatisticsStringRef
{
    parquet::ByteArray min;
    parquet::ByteArray max;

    /// Takes one value into account.
    void add(parquet::ByteArray x)
    {
        addMin(x);
        addMax(x);
    }

    /// Folds `s` into this object; a no-op if `s` has not seen any value.
    void merge(const StatisticsStringRef & s)
    {
        if (s.min.ptr == nullptr)
            return;
        addMin(s.min);
        addMax(s.max);
    }

    /// Back to the "nothing seen" state.
    void clear() { *this = {}; }

    /// Returns the bounds as thrift statistics. Each bound is emitted only if it is not longer
    /// than `options.max_statistics_size`, so it is possible for just one of them to be set.
    parq::Statistics get(const WriteOptions & options) const
    {
        parq::Statistics s;
        if (min.ptr == nullptr)
            return s;
        if (static_cast<size_t>(min.len) <= options.max_statistics_size)
            s.__set_min_value(std::string(reinterpret_cast<const char *>(min.ptr), static_cast<size_t>(min.len)));
        if (static_cast<size_t>(max.len) <= options.max_statistics_size)
            s.__set_max_value(std::string(reinterpret_cast<const char *>(max.ptr), static_cast<size_t>(max.len)));
        return s;
    }

    /// Remembers `x` if it compares below the current minimum (or there is none yet).
    void addMin(parquet::ByteArray x)
    {
        if (min.ptr == nullptr || compare(x, min) < 0)
            min = x;
    }

    /// Remembers `x` if it compares above the current maximum (or there is none yet).
    void addMax(parquet::ByteArray x)
    {
        if (max.ptr == nullptr || compare(x, max) > 0)
            max = x;
    }

    /// Three-way bytewise comparison; when one string is a prefix of the other, the shorter one
    /// is smaller. Returns a negative, zero or positive value.
    static int compare(parquet::ByteArray a, parquet::ByteArray b)
    {
        int t = memcmp(a.ptr, b.ptr, std::min(a.len, b.len));
        if (t != 0)
            return t;
        /// Don't subtract the lengths: the subtraction would be done in the (unsigned) type of
        /// `len` and then narrowed to int, which yields the wrong sign for large differences.
        return static_cast<int>(a.len > b.len) - static_cast<int>(a.len < b.len);
    }
};
/// The column usually needs to be converted to one of Parquet physical types, e.g. UInt16 -> Int32
/// or [element of ColumnString] -> std::string_view.
/// We do this conversion in small batches rather than all at once, just before encoding the batch,
/// in hopes of getting better performance through cache locality.
/// The Converter* structs below are responsible for that.
/// When conversion is not needed, getBatch() will just return pointer into original data.
/// Converts a numeric column to the Parquet physical type `To`, batch by batch.
/// `MinMaxType` is the type used for min/max statistics; by default it is `To` for signed source
/// elements and the unsigned counterpart of `To` for unsigned ones.
template <
    typename Col,
    typename To,
    typename MinMaxType = std::conditional_t<
        std::is_signed_v<typename Col::Container::value_type>,
        To,
        std::make_unsigned_t<To>>>
struct ConverterNumeric
{
    using Statistics = StatisticsNumeric<MinMaxType, To>;

    const Col & column;
    /// Scratch space for converted values; only used when source and target element sizes differ.
    PODArray<To> buf;

    explicit ConverterNumeric(const ColumnPtr & c) : column(assert_cast<const Col &>(*c)) {}

    /// Returns `count` values starting at row `offset`, as `To`.
    /// If the element sizes match, the column's own memory is reinterpreted and returned directly;
    /// otherwise the values are cast one by one into `buf`, which the next call overwrites.
    const To * getBatch(size_t offset, size_t count)
    {
        if constexpr (sizeof(*column.getData().data()) != sizeof(To))
        {
            const auto & source = column.getData();
            buf.resize(count);
            for (size_t row = 0; row != count; ++row)
                buf[row] = static_cast<To>(source[offset + row]); // NOLINT
            return buf.data();
        }
        else
        {
            return reinterpret_cast<const To *>(column.getData().data() + offset);
        }
    }
};
/// Presents the rows of a ColumnString as parquet::ByteArray values (length + pointer).
/// The pointers are taken from column.getDataAt(); no string bytes are copied.
struct ConverterString
{
    using Statistics = StatisticsStringRef;

    const ColumnString & column;
    /// Holds the ByteArray descriptors of the most recent batch.
    PODArray<parquet::ByteArray> buf;

    explicit ConverterString(const ColumnPtr & c) : column(assert_cast<const ColumnString &>(*c)) {}

    /// Returns descriptors for `count` rows starting at `offset`. The result lives in `buf` and
    /// is overwritten by the next call.
    const parquet::ByteArray * getBatch(size_t offset, size_t count)
    {
        buf.resize(count);
        for (size_t row = 0; row != count; ++row)
        {
            const StringRef value = column.getDataAt(offset + row);
            const auto * bytes = reinterpret_cast<const uint8_t *>(value.data);
            buf[row] = parquet::ByteArray(static_cast<UInt32>(value.size), bytes);
        }
        return buf.data();
    }
};
/// Presents the rows of a ColumnFixedString as parquet::FixedLenByteArray values, i.e. bare
/// pointers into the column's character buffer.
struct ConverterFixedString
{
    using Statistics = StatisticsFixedStringRef;

    const ColumnFixedString & column;
    /// Holds the pointers of the most recent batch.
    PODArray<parquet::FixedLenByteArray> buf;

    explicit ConverterFixedString(const ColumnPtr & c) : column(assert_cast<const ColumnFixedString &>(*c)) {}

    /// Returns pointers to `count` rows starting at `offset`. Row `i` starts `i * N` bytes into
    /// the character buffer, where N is column.getN().
    const parquet::FixedLenByteArray * getBatch(size_t offset, size_t count)
    {
        buf.resize(count);
        for (size_t row = 0; row != count; ++row)
        {
            const auto * row_begin = column.getChars().data() + (offset + row) * column.getN();
            buf[row].ptr = reinterpret_cast<const uint8_t *>(row_begin);
        }
        return buf.data();
    }

    /// Length of each value in bytes.
    size_t fixedStringSize() { return column.getN(); }
};
/// Presents the rows of a ColumnFixedString as variable-length parquet::ByteArray values whose
/// length is always column.getN().
struct ConverterFixedStringAsString
{
    using Statistics = StatisticsStringRef;

    const ColumnFixedString & column;
    /// Holds the ByteArray descriptors of the most recent batch.
    PODArray<parquet::ByteArray> buf;

    explicit ConverterFixedStringAsString(const ColumnPtr & c) : column(assert_cast<const ColumnFixedString &>(*c)) {}

    /// Returns descriptors for `count` rows starting at `offset`; they point into the column's
    /// character buffer.
    const parquet::ByteArray * getBatch(size_t offset, size_t count)
    {
        buf.resize(count);
        for (size_t row = 0; row != count; ++row)
        {
            const auto * row_begin = column.getChars().data() + (offset + row) * column.getN();
            buf[row] = parquet::ByteArray(
                static_cast<UInt32>(column.getN()), reinterpret_cast<const uint8_t *>(row_begin));
        }
        return buf.data();
    }
};
/// Presents each element of a ColumnVector<T> as a fixed-length byte string of sizeof(T) bytes,
/// pointing straight at the element's in-memory representation.
template <typename T>
struct ConverterNumberAsFixedString
{
    /// Min/max are computed over the values as byte strings (in their in-memory, little-endian
    /// layout) rather than as numbers, because parquet is not told that these are numbers.
    using Statistics = StatisticsFixedStringCopy<sizeof(T)>;

    const ColumnVector<T> & column;
    /// Holds the pointers of the most recent batch.
    PODArray<parquet::FixedLenByteArray> buf;

    explicit ConverterNumberAsFixedString(const ColumnPtr & c) : column(assert_cast<const ColumnVector<T> &>(*c)) {}

    /// Returns pointers to `count` elements starting at `offset`.
    const parquet::FixedLenByteArray * getBatch(size_t offset, size_t count)
    {
        buf.resize(count);
        const T * first = column.getData().data() + offset;
        for (size_t row = 0; row != count; ++row)
            buf[row].ptr = reinterpret_cast<const uint8_t *>(first + row);
        return buf.data();
    }

    /// Length of each value in bytes.
    size_t fixedStringSize() { return sizeof(T); }
};
/// Same idea as ConverterNumberAsFixedString, except that the bytes of every value are reversed,
/// i.e. converted to big-endian - the byte order Parquet uses for decimal types (and for nothing
/// else).
template <typename T>
struct ConverterDecimal
{
    using Statistics = StatisticsFixedStringCopy<sizeof(T)>;

    const ColumnDecimal<T> & column;
    /// Byte-reversed copies of the values of the most recent batch.
    PODArray<uint8_t> data_buf;
    /// One pointer into `data_buf` per value.
    PODArray<parquet::FixedLenByteArray> ptr_buf;

    explicit ConverterDecimal(const ColumnPtr & c) : column(assert_cast<const ColumnDecimal<T> &>(*c)) {}

    /// Copies `count` values starting at `offset` into `data_buf`, reverses each of them in place
    /// and returns pointers to the reversed copies.
    const parquet::FixedLenByteArray * getBatch(size_t offset, size_t count)
    {
        constexpr size_t width = sizeof(T);
        const size_t total_bytes = count * width;

        data_buf.resize(total_bytes);
        ptr_buf.resize(count);
        memcpy(data_buf.data(), reinterpret_cast<const char *>(column.getData().data() + offset), total_bytes);

        for (size_t row = 0; row != count; ++row)
        {
            uint8_t * value_begin = data_buf.data() + row * width;
            std::reverse(value_begin, value_begin + width);
            ptr_buf[row].ptr = value_begin;
        }
        return ptr_buf.data();
    }

    /// Length of each value in bytes.
    size_t fixedStringSize() { return sizeof(T); }
};
/// Compresses the contents of `source` with `method`.
/// Returns either `source` (for CompressionMethod::None, untouched) or `scratch` (holding the
/// compressed bytes).
/// Throws CANNOT_COMPRESS if the input is too big for the codec or if LZ4 reports a failure.
PODArray<char> & compress(PODArray<char> & source, PODArray<char> & scratch, CompressionMethod method)
{
    /// We could use wrapWriteBufferWithCompressionMethod() for everything, but I worry about the
    /// overhead of creating a bunch of WriteBuffers on each page (thousands of values).
    switch (method)
    {
        case CompressionMethod::None:
            return source;

        case CompressionMethod::Lz4:
        {
            #pragma clang diagnostic push
            #pragma clang diagnostic ignored "-Wold-style-cast"

            size_t max_dest_size = LZ4_COMPRESSBOUND(source.size());

            #pragma clang diagnostic pop

            /// LZ4_COMPRESSBOUND evaluates to 0 when the input is larger than LZ4 can handle.
            /// Without the `== 0` check we would go on with an empty destination buffer and
            /// end up returning an empty "compressed" page.
            if (max_dest_size == 0 || max_dest_size > std::numeric_limits<int>::max())
                throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column of size {}", formatReadableSizeWithBinarySuffix(source.size()));

            scratch.resize(max_dest_size);

            int compressed_size = LZ4_compress_default(
                source.data(),
                scratch.data(),
                static_cast<int>(source.size()),
                static_cast<int>(max_dest_size));

            /// LZ4_compress_default() returns 0 on failure. Don't let that turn into silently
            /// writing an empty page.
            if (compressed_size <= 0)
                throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column of size {}", formatReadableSizeWithBinarySuffix(source.size()));

            scratch.resize(static_cast<size_t>(compressed_size));
            return scratch;
        }

#if USE_SNAPPY
        case CompressionMethod::Snappy:
        {
            size_t max_dest_size = snappy::MaxCompressedLength(source.size());

            if (max_dest_size > std::numeric_limits<int>::max())
                throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column of size {}", formatReadableSizeWithBinarySuffix(source.size()));

            scratch.resize(max_dest_size);

            size_t compressed_size;
            snappy::RawCompress(source.data(), source.size(), scratch.data(), &compressed_size);

            scratch.resize(static_cast<size_t>(compressed_size));
            return scratch;
        }
#endif

        default:
        {
            /// Generic path: a compressing WriteBuffer that writes into `scratch`. `source`'s own
            /// memory is handed over as the buffer's working memory, so instead of copying the
            /// data in we just move the write position to the end of it and finalize.
            auto dest_buf = std::make_unique<WriteBufferFromVector<PODArray<char>>>(scratch);
            auto compressed_buf = wrapWriteBufferWithCompressionMethod(
                std::move(dest_buf),
                method,
                /*level*/ 3,
                source.size(),
                /*existing_memory*/ source.data());
            chassert(compressed_buf->position() == source.data());
            chassert(compressed_buf->available() == source.size());
            compressed_buf->position() += source.size();
            compressed_buf->finalize();
            return scratch;
        }
    }
}
void encodeRepDefLevelsRLE(const UInt8 * data, size_t size, UInt8 max_level, PODArray<char> & out)
{
using arrow::util::RleEncoder;
chassert(max_level > 0);
size_t offset = out.size();
size_t prefix_size = sizeof(Int32);
int bit_width = bitScanReverse(max_level) + 1;
int max_rle_size = RleEncoder::MaxBufferSize(bit_width, static_cast<int>(size)) +
RleEncoder::MinBufferSize(bit_width);
out.resize(offset + prefix_size + max_rle_size);
RleEncoder encoder(reinterpret_cast<uint8_t *>(out.data() + offset + prefix_size), max_rle_size, bit_width);
for (size_t i = 0; i < size; ++i)
encoder.Put(data[i]);
encoder.Flush();
Int32 len = encoder.len();
memcpy(out.data() + offset, &len, prefix_size);
out.resize(offset + prefix_size + len);
}
/// Records `e` in the column chunk's list of encodings, unless it is already listed.
void addToEncodingsUsed(ColumnChunkWriteState & s, parq::Encoding::type e)
{
    auto & used = s.column_chunk.meta_data.encodings;
    if (std::find(used.begin(), used.end(), e) == used.end())
        used.push_back(e);
}
/// Writes one page (thrift-serialized header followed by the already compressed body) to `out`
/// and updates the running totals and page offsets in the column chunk metadata.
/// Offsets recorded here are the chunk's compressed size so far, i.e. relative to the chunk start.
void writePage(const parq::PageHeader & header, const PODArray<char> & compressed, ColumnChunkWriteState & s, WriteBuffer & out)
{
    auto & meta = s.column_chunk.meta_data;

    const size_t header_size = serializeThriftStruct(header, out);
    out.write(compressed.data(), compressed.size());

    /// Only the first data page and the first dictionary page get their offsets recorded.
    /// For the data page, -1 serves as the "not recorded yet" marker.
    const bool first_data_page = header.__isset.data_page_header && meta.data_page_offset == -1;
    if (first_data_page)
        meta.__set_data_page_offset(meta.total_compressed_size);

    const bool first_dictionary_page = header.__isset.dictionary_page_header && !meta.__isset.dictionary_page_offset;
    if (first_dictionary_page)
        meta.__set_dictionary_page_offset(meta.total_compressed_size);

    meta.total_uncompressed_size += header.uncompressed_page_size + header_size;
    meta.total_compressed_size += header.compressed_page_size + header_size;
}
/// Encodes one primitive column chunk: cuts it into pages, encodes and compresses every page and
/// writes the pages to `out`, filling in `s.column_chunk.meta_data` (sizes, page offsets,
/// statistics, encodings used) along the way.
///
/// `ParquetDType` is the parquet physical type tag (e.g. parquet::Int32Type). `converter` hands
/// out batches of values already converted to `ParquetDType::c_type` and names the statistics
/// accumulator to use (`Converter::Statistics`).
///
/// With dictionary encoding, data pages are kept in memory until the dictionary is complete,
/// because the dictionary page has to be written first. If the dictionary reaches
/// `options.dictionary_size_limit`, everything encoded so far is thrown away and the chunk is
/// encoded again from the beginning using `options.encoding`.
///
/// Throws CANNOT_COMPRESS if a page turns out bigger than INT32_MAX bytes.
template <typename ParquetDType, typename Converter>
void writeColumnImpl(
    ColumnChunkWriteState & s, const WriteOptions & options, WriteBuffer & out, Converter && converter)
{
    size_t num_values = s.max_def > 0 ? s.def.size() : s.primitive_column->size();
    /// With a batch size of 0 the loops below would never advance through the column and would
    /// spin forever, so always take at least one value per batch.
    const size_t write_batch_size = std::max<size_t>(options.write_batch_size, 1);
    auto encoding = options.encoding;

    typename Converter::Statistics page_statistics;
    typename Converter::Statistics total_statistics;

    bool use_dictionary = options.use_dictionary_encoding && !s.is_bool;

    std::optional<parquet::ColumnDescriptor> fixed_string_descr;
    if constexpr (std::is_same<ParquetDType, parquet::FLBAType>::value)
    {
        /// This just communicates one number to MakeTypedEncoder(): the fixed string length.
        fixed_string_descr.emplace(parquet::schema::PrimitiveNode::Make(
            "", parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY,
            parquet::ConvertedType::NONE, static_cast<int>(converter.fixedStringSize())), 0, 0);

        if constexpr (std::is_same<typename Converter::Statistics, StatisticsFixedStringRef>::value)
            page_statistics.fixed_string_size = converter.fixedStringSize();
    }

    /// Could use an arena here (by passing a custom MemoryPool), to reuse memory across pages.
    /// Alternatively, we could avoid using arrow's dictionary encoding code and leverage
    /// ColumnLowCardinality instead. It would work basically the same way as what this function
    /// currently does: add values to the ColumnLowCardinality (instead of `encoder`) in batches,
    /// checking dictionary size after each batch. That might be faster.
    auto encoder = parquet::MakeTypedEncoder<ParquetDType>(
        // ignored if using dictionary
        static_cast<parquet::Encoding::type>(encoding),
        use_dictionary, fixed_string_descr ? &*fixed_string_descr : nullptr);

    struct PageData
    {
        parq::PageHeader header;
        PODArray<char> data;
    };
    std::vector<PageData> dict_encoded_pages; // can't write them out until we have full dictionary

    /// Reused across pages to reduce number of allocations and improve locality.
    PODArray<char> encoded;
    PODArray<char> compressed_maybe;

    /// Start of current page.
    size_t def_offset = 0; // index in def and rep
    size_t data_offset = 0; // index in primitive_column

    /// Finishes the current page: `def_count` levels and `data_count` values, both counted from
    /// the current page start. Either writes the page out or, with dictionary encoding, stashes
    /// it in `dict_encoded_pages`. Then moves the page start forward.
    auto flush_page = [&](size_t def_count, size_t data_count)
    {
        encoded.clear();

        /// Concatenate encoded rep, def, and data.

        if (s.max_rep > 0)
            encodeRepDefLevelsRLE(s.rep.data() + def_offset, def_count, s.max_rep, encoded);
        if (s.max_def > 0)
            encodeRepDefLevelsRLE(s.def.data() + def_offset, def_count, s.max_def, encoded);

        std::shared_ptr<parquet::Buffer> values = encoder->FlushValues(); // resets it for next page

        encoded.resize(encoded.size() + values->size());
        memcpy(encoded.data() + encoded.size() - values->size(), values->data(), values->size());
        values.reset();

        if (encoded.size() > INT32_MAX)
            throw Exception(ErrorCodes::CANNOT_COMPRESS, "Uncompressed page is too big: {}", encoded.size());

        size_t uncompressed_size = encoded.size();
        auto & compressed = compress(encoded, compressed_maybe, s.compression);

        if (compressed.size() > INT32_MAX)
            throw Exception(ErrorCodes::CANNOT_COMPRESS, "Compressed page is too big: {}", compressed.size());

        parq::PageHeader header;
        header.__set_type(parq::PageType::DATA_PAGE);
        header.__set_uncompressed_page_size(static_cast<int>(uncompressed_size));
        header.__set_compressed_page_size(static_cast<int>(compressed.size()));
        header.__isset.data_page_header = true;
        auto & d = header.data_page_header;
        d.__set_num_values(static_cast<Int32>(def_count));
        d.__set_encoding(use_dictionary ? parq::Encoding::RLE_DICTIONARY : encoding);
        d.__set_definition_level_encoding(parq::Encoding::RLE);
        d.__set_repetition_level_encoding(parq::Encoding::RLE);
        /// We could also put checksum in `header.crc`, but apparently no one uses it:
        /// https://issues.apache.org/jira/browse/PARQUET-594

        if (options.write_page_statistics)
        {
            d.__set_statistics(page_statistics.get(options));

            /// Null count is only filled in for the simple case handled here:
            /// max_def == 1 and max_rep == 0.
            if (s.max_def == 1 && s.max_rep == 0)
                d.statistics.__set_null_count(static_cast<Int64>(def_count - data_count));
        }

        total_statistics.merge(page_statistics);
        page_statistics.clear();

        if (use_dictionary)
        {
            dict_encoded_pages.push_back({.header = std::move(header)});
            /// `compressed` refers to one of the two reusable buffers; swapping hands its
            /// contents to the stored page and leaves the buffer empty.
            std::swap(dict_encoded_pages.back().data, compressed);
        }
        else
        {
            writePage(header, compressed, s, out);
        }

        def_offset += def_count;
        data_offset += data_count;
    };

    /// Writes the dictionary page followed by all data pages that were held back, then destroys
    /// the encoder.
    auto flush_dict = [&] -> bool
    {
        auto * dict_encoder = dynamic_cast<parquet::DictEncoder<ParquetDType> *>(encoder.get());
        int dict_size = dict_encoder->dict_encoded_size();

        encoded.resize(static_cast<size_t>(dict_size));
        dict_encoder->WriteDict(reinterpret_cast<uint8_t *>(encoded.data()));

        auto & compressed = compress(encoded, compressed_maybe, s.compression);

        if (compressed.size() > INT32_MAX)
            throw Exception(ErrorCodes::CANNOT_COMPRESS, "Compressed dictionary page is too big: {}", compressed.size());

        parq::PageHeader header;
        header.__set_type(parq::PageType::DICTIONARY_PAGE);
        header.__set_uncompressed_page_size(dict_size);
        header.__set_compressed_page_size(static_cast<int>(compressed.size()));
        header.__isset.dictionary_page_header = true;
        header.dictionary_page_header.__set_num_values(dict_encoder->num_entries());
        header.dictionary_page_header.__set_encoding(parq::Encoding::PLAIN);

        writePage(header, compressed, s, out);

        for (auto & p : dict_encoded_pages)
            writePage(p.header, p.data, s, out);

        dict_encoded_pages.clear();
        encoder.reset();

        return true;
    };

    auto is_dict_too_big = [&] {
        auto * dict_encoder = dynamic_cast<parquet::DictEncoder<ParquetDType> *>(encoder.get());
        int dict_size = dict_encoder->dict_encoded_size();
        return static_cast<size_t>(dict_size) >= options.dictionary_size_limit;
    };

    while (def_offset < num_values)
    {
        /// Pick enough data for a page.
        size_t next_def_offset = def_offset;
        size_t next_data_offset = data_offset;
        while (true)
        {
            /// Bite off a batch of defs and corresponding data values.
            size_t def_count = std::min(write_batch_size, num_values - next_def_offset);
            size_t data_count = 0;
            if (s.max_def == 0)
                data_count = def_count;
            else
                for (size_t i = 0; i < def_count; ++i)
                    data_count += s.def[next_def_offset + i] == s.max_def;

            /// Encode the data (but not the levels yet), so that we can estimate its encoded size.
            const typename ParquetDType::c_type * converted = converter.getBatch(next_data_offset, data_count);

            if (options.write_page_statistics || options.write_column_chunk_statistics)
/// Workaround for clang bug: https://github.com/llvm/llvm-project/issues/63630
#ifdef MEMORY_SANITIZER
#pragma clang loop vectorize(disable)
#endif
                for (size_t i = 0; i < data_count; ++i)
                    page_statistics.add(converted[i]);

            encoder->Put(converted, static_cast<int>(data_count));

            next_def_offset += def_count;
            next_data_offset += data_count;

            if (use_dictionary && is_dict_too_big())
            {
                /// Fallback to non-dictionary encoding.
                ///
                /// Discard encoded data and start over.
                /// This is different from what arrow does: arrow writes out the dictionary-encoded
                /// data, then uses non-dictionary encoding for later pages.
                /// Starting over seems better: it produces slightly smaller files (I saw 1-4%) in
                /// exchange for slight decrease in speed (I saw < 5%). This seems like a good
                /// trade because encoding speed is much less important than decoding (as evidenced
                /// by arrow not supporting parallel encoding, even though it's easy to support).

                def_offset = 0;
                data_offset = 0;
                dict_encoded_pages.clear();
                use_dictionary = false;

                /// The statistics gathered so far describe the discarded pages. Drop them too,
                /// otherwise the first re-encoded page would inherit bounds from values that end
                /// up in later pages.
                page_statistics.clear();
                total_statistics.clear();

#ifndef NDEBUG
                /// Arrow's DictEncoderImpl destructor asserts that FlushValues() was called, so we
                /// call it even though we don't need its output.
                encoder->FlushValues();
#endif

                encoder = parquet::MakeTypedEncoder<ParquetDType>(
                    static_cast<parquet::Encoding::type>(encoding), /* use_dictionary */ false,
                    fixed_string_descr ? &*fixed_string_descr : nullptr);
                break;
            }

            if (next_def_offset == num_values ||
                static_cast<size_t>(encoder->EstimatedDataEncodedSize()) >= options.data_page_size)
            {
                flush_page(next_def_offset - def_offset, next_data_offset - data_offset);
                break;
            }
        }
    }

    if (use_dictionary)
        flush_dict();

    chassert(data_offset == s.primitive_column->size());

    if (options.write_column_chunk_statistics)
    {
        s.column_chunk.meta_data.__set_statistics(total_statistics.get(options));

        if (s.max_def == 1 && s.max_rep == 0)
            s.column_chunk.meta_data.statistics.__set_null_count(static_cast<Int64>(def_offset - data_offset));
    }

    /// Report which encodings we've used.
    if (s.max_rep > 0 || s.max_def > 0)
        addToEncodingsUsed(s, parq::Encoding::RLE); // levels
    if (use_dictionary)
    {
        addToEncodingsUsed(s, parq::Encoding::PLAIN); // dictionary itself
        addToEncodingsUsed(s, parq::Encoding::RLE_DICTIONARY); // ids
    }
    else
    {
        addToEncodingsUsed(s, encoding);
    }
}
}
/// Encodes the data pages (and the dictionary page, if any) of one column chunk into `out`.
/// Initializes the counters in `s.column_chunk.meta_data`, picks the converter and Parquet
/// physical type matching the column's data type, and delegates to writeColumnImpl().
/// Throws LOGICAL_ERROR for column types without a mapping below.
/// On return, `s.primitive_column`, `s.def` and `s.rep` have been released.
void writeColumnChunkBody(ColumnChunkWriteState & s, const WriteOptions & options, WriteBuffer & out)
{
/// Number of values including nulls: one per definition level if there are any levels.
s.column_chunk.meta_data.__set_num_values(s.max_def > 0 ? s.def.size() : s.primitive_column->size());
/// We'll be updating these as we go.
s.column_chunk.meta_data.__set_encodings({});
s.column_chunk.meta_data.__set_total_compressed_size(0);
s.column_chunk.meta_data.__set_total_uncompressed_size(0);
/// -1 means "no data page written yet"; writePage() replaces it on the first data page.
s.column_chunk.meta_data.__set_data_page_offset(-1);
s.primitive_column = s.primitive_column->convertToFullColumnIfLowCardinality();
switch (s.primitive_column->getDataType())
{
/// Numeric conversion to Int32 or Int64.
/// N(source, dtype) writes a ColumnVector<source> using the given Parquet physical type.
#define N(source_type, parquet_dtype) \
writeColumnImpl<parquet::parquet_dtype>(s, options, out, \
ConverterNumeric<ColumnVector<source_type>, parquet::parquet_dtype::c_type>( \
s.primitive_column))
case TypeIndex::UInt8:
/// UInt8 columns flagged as bool are written with the Boolean physical type.
if (s.is_bool)
writeColumnImpl<parquet::BooleanType>(s, options, out,
ConverterNumeric<ColumnVector<UInt8>, bool, bool>(s.primitive_column));
else
N(UInt8 , Int32Type);
break;
case TypeIndex::UInt16 : N(UInt16, Int32Type); break;
case TypeIndex::UInt32 : N(UInt32, Int32Type); break;
case TypeIndex::UInt64 : N(UInt64, Int64Type); break;
case TypeIndex::Int8 : N(Int8 , Int32Type); break;
case TypeIndex::Int16 : N(Int16 , Int32Type); break;
case TypeIndex::Int32 : N(Int32 , Int32Type); break;
case TypeIndex::Int64 : N(Int64 , Int64Type); break;
/// Enums and date/time types are written via the integer column type named in each case.
case TypeIndex::Enum8: N(Int8 , Int32Type); break;
case TypeIndex::Enum16: N(Int16 , Int32Type); break;
case TypeIndex::Date: N(UInt16, Int32Type); break;
case TypeIndex::Date32: N(Int32 , Int32Type); break;
case TypeIndex::DateTime: N(UInt32, Int32Type); break;
#undef N
/// The cases below name the statistics type explicitly (third template argument) instead of
/// relying on ConverterNumeric's default.
case TypeIndex::Float32:
writeColumnImpl<parquet::FloatType>(
s, options, out, ConverterNumeric<ColumnVector<Float32>, Float32, Float32>(
s.primitive_column));
break;
case TypeIndex::Float64:
writeColumnImpl<parquet::DoubleType>(
s, options, out, ConverterNumeric<ColumnVector<Float64>, Float64, Float64>(
s.primitive_column));
break;
case TypeIndex::DateTime64:
writeColumnImpl<parquet::Int64Type>(
s, options, out, ConverterNumeric<ColumnDecimal<DateTime64>, Int64, Int64>(
s.primitive_column));
break;
case TypeIndex::IPv4:
writeColumnImpl<parquet::Int32Type>(
s, options, out, ConverterNumeric<ColumnVector<IPv4>, Int32, UInt32>(
s.primitive_column));
break;
case TypeIndex::String:
writeColumnImpl<parquet::ByteArrayType>(
s, options, out, ConverterString(s.primitive_column));
break;
case TypeIndex::FixedString:
/// Depending on the option, written either as fixed-length or as variable-length byte arrays.
if (options.output_fixed_string_as_fixed_byte_array)
writeColumnImpl<parquet::FLBAType>(
s, options, out, ConverterFixedString(s.primitive_column));
else
writeColumnImpl<parquet::ByteArrayType>(
s, options, out, ConverterFixedStringAsString(s.primitive_column));
break;
/// F(source) writes a ColumnVector<source> as fixed-length byte arrays of sizeof(source) bytes.
#define F(source_type) \
writeColumnImpl<parquet::FLBAType>( \
s, options, out, ConverterNumberAsFixedString<source_type>(s.primitive_column))
case TypeIndex::UInt128: F(UInt128); break;
case TypeIndex::UInt256: F(UInt256); break;
case TypeIndex::Int128: F(Int128); break;
case TypeIndex::Int256: F(Int256); break;
case TypeIndex::IPv6: F(IPv6); break;
#undef F
/// D(source) writes a ColumnDecimal<source> as fixed-length byte arrays via ConverterDecimal,
/// which reverses the byte order of each value.
#define D(source_type) \
writeColumnImpl<parquet::FLBAType>( \
s, options, out, ConverterDecimal<source_type>(s.primitive_column))
case TypeIndex::Decimal32: D(Decimal32); break;
case TypeIndex::Decimal64: D(Decimal64); break;
case TypeIndex::Decimal128: D(Decimal128); break;
case TypeIndex::Decimal256: D(Decimal256); break;
#undef D
default:
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected column type: {}", s.primitive_column->getFamilyName());
}
/// Free some memory.
s.primitive_column = {};
s.def = {};
s.rep = {};
}
/// Writes the 4 magic bytes that open the file; writeFileFooter() ends the file with the same bytes.
void writeFileHeader(WriteBuffer & out)
{
    static constexpr char magic[] = "PAR1";
    /// The terminating zero of the literal is not part of the magic.
    out.write(magic, sizeof(magic) - 1);
}
/// Turns the chunk-relative page offsets in `s` into absolute file offsets by adding
/// `offset_in_file`, serializes the ColumnChunk struct to `out` and returns it.
/// A data page offset of -1 (no data page was written) is left untouched.
parq::ColumnChunk finalizeColumnChunkAndWriteFooter(
    size_t offset_in_file, ColumnChunkWriteState s, const WriteOptions &, WriteBuffer & out)
{
    auto & chunk = s.column_chunk;
    auto & meta = chunk.meta_data;

    if (meta.data_page_offset != -1)
        meta.data_page_offset += offset_in_file;
    if (meta.__isset.dictionary_page_offset)
        meta.dictionary_page_offset += offset_in_file;

    /// Position right after the chunk's pages.
    chunk.file_offset = offset_in_file + meta.total_compressed_size;

    serializeThriftStruct(chunk, out);
    return chunk;
}
/// Assembles the RowGroup metadata struct from the metadata of its column chunks.
/// Sizes are summed over all chunks; the row group's file offset is the offset of the first page
/// (dictionary page if there is one, otherwise first data page) of the first column chunk.
parq::RowGroup makeRowGroup(std::vector<parq::ColumnChunk> column_chunks, size_t num_rows)
{
    parq::RowGroup r;
    r.__set_num_rows(num_rows);
    /// `column_chunks` is our own copy already, so move it in rather than copying every chunk's
    /// metadata once more. Assigned directly, the same way writeFileFooter() fills `schema` and
    /// `row_groups`.
    r.columns = std::move(column_chunks);
    r.__set_total_compressed_size(0);
    for (auto & c : r.columns)
    {
        r.total_byte_size += c.meta_data.total_uncompressed_size;
        r.total_compressed_size += c.meta_data.total_compressed_size;
    }
    if (!r.columns.empty())
    {
        auto & m = r.columns[0].meta_data;
        r.__set_file_offset(m.__isset.dictionary_page_offset ? m.dictionary_page_offset : m.data_page_offset);
    }
    return r;
}
/// Writes the end of the file: the thrift-serialized FileMetaData, then its size as a binary int,
/// then the "PAR1" magic.
/// Throws LIMIT_EXCEEDED if the serialized metadata is bigger than INT32_MAX bytes.
void writeFileFooter(std::vector<parq::RowGroup> row_groups, SchemaElements schema, const WriteOptions & options, WriteBuffer & out)
{
    parq::FileMetaData meta;
    meta.version = 2;
    meta.schema = std::move(schema);
    meta.row_groups = std::move(row_groups);
    for (const auto & group : meta.row_groups)
        meta.num_rows += group.num_rows;
    meta.__set_created_by(VERSION_NAME " " VERSION_DESCRIBE);

    const bool any_statistics = options.write_page_statistics || options.write_column_chunk_statistics;
    if (any_statistics)
    {
        /// One TYPE_ORDER entry per schema element that has no children (i.e. per leaf column).
        meta.__set_column_orders({});
        for (const auto & element : meta.schema)
        {
            if (element.__isset.num_children)
                continue;
            meta.column_orders.emplace_back().__set_TYPE_ORDER({});
        }
    }

    const size_t footer_size = serializeThriftStruct(meta, out);
    if (footer_size > INT32_MAX)
        throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Parquet file metadata too big: {}", footer_size);

    writeIntBinary(static_cast<int>(footer_size), out);
    out.write("PAR1", 4);
}
}

View File

@ -0,0 +1,136 @@
#pragma once
#include <Processors/Formats/Impl/Parquet/ThriftUtil.h>
#include <Columns/IColumn.h>
#include <DataTypes/IDataType.h>
#include <Common/PODArray.h>
#include <IO/CompressionMethod.h>
namespace DB::Parquet
{
/// A good resource for learning how Parquet format works is
/// contrib/arrow/cpp/src/parquet/parquet.thrift
/// Settings that control how columns are encoded into a Parquet file.
struct WriteOptions
{
/// NOTE(review): presumably selects the logical type written for String columns in
/// convertSchema()/prepareColumnForWrite() - not visible from here, confirm.
bool output_string_as_string = false;
/// If true, FixedString columns are written as fixed-length byte arrays; otherwise as
/// ordinary (variable-length) byte arrays.
bool output_fixed_string_as_fixed_byte_array = true;
/// Compression method for pages.
/// NOTE(review): pages are actually compressed with ColumnChunkWriteState::compression;
/// presumably that is initialized from this value - confirm.
CompressionMethod compression = CompressionMethod::Lz4;
/// A data page is finished once the encoder's estimated encoded size reaches this many bytes.
size_t data_page_size = 1024 * 1024;
/// How many values (definition levels, if the column has them) are converted and passed to
/// the encoder at a time.
size_t write_batch_size = 1024;
/// Try dictionary encoding first. Never applied to bool columns.
bool use_dictionary_encoding = true;
/// When the encoded dictionary reaches this many bytes, dictionary encoding is abandoned for
/// the column chunk and the chunk is re-encoded with `encoding`.
size_t dictionary_size_limit = 1024 * 1024;
/// If using dictionary, this encoding is used as a fallback when dictionary gets too big.
/// Otherwise, this is used for everything.
parquet::format::Encoding::type encoding = parquet::format::Encoding::PLAIN;
/// Write min/max (and, where supported, null count) into every data page header.
bool write_page_statistics = true;
/// Write min/max (and, where supported, null count) into the column chunk metadata.
bool write_column_chunk_statistics = true;
/// String and FixedString min/max values longer than this many bytes are left out of the
/// statistics.
size_t max_statistics_size = 4096;
};
/// Information about a primitive column (leaf of the schema tree) to write to Parquet file.
/// Carries the column data with its repetition/definition levels, plus the thrift ColumnChunk
/// struct that is filled in while the chunk is being written.
struct ColumnChunkWriteState
{
/// After writeColumnChunkBody(), offsets in this struct are relative to the start of column chunk.
/// Then finalizeColumnChunkAndWriteFooter() fixes them up before writing to file.
parquet::format::ColumnChunk column_chunk;
/// The values to write. Released by writeColumnChunkBody() once the chunk is encoded.
ColumnPtr primitive_column;
CompressionMethod compression; // must match what's inside column_chunk
/// If set, a UInt8 column is written with the Parquet Boolean type, and dictionary encoding
/// is not used for it.
bool is_bool = false;
/// Repetition and definition levels. Produced by prepareColumnForWrite().
/// def is empty iff max_def == 0, which means no arrays or nullables.
/// rep is empty iff max_rep == 0, which means no arrays.
PaddedPODArray<UInt8> def; // definition levels
PaddedPODArray<UInt8> rep; // repetition levels
/// Max possible levels, according to schema. Actual max in def/rep may be smaller.
UInt8 max_def = 0;
UInt8 max_rep = 0;
ColumnChunkWriteState() = default;
/// Prevent accidental copying.
/// (Declaring the move operations leaves the copy operations implicitly deleted.)
ColumnChunkWriteState(ColumnChunkWriteState &&) = default;
ColumnChunkWriteState & operator=(ColumnChunkWriteState &&) = default;
/// Estimated memory usage.
/// Sums the allocated bytes of `def`, `rep` and, if still held, `primitive_column`.
size_t allocatedBytes() const
{
size_t r = def.allocated_bytes() + rep.allocated_bytes();
if (primitive_column)
r += primitive_column->allocatedBytes();
return r;
}
};
using SchemaElements = std::vector<parquet::format::SchemaElement>;
using ColumnChunkWriteStates = std::vector<ColumnChunkWriteState>;
/// Parquet file consists of row groups, which consist of column chunks.
///
/// Column chunks can be encoded mostly independently of each other, in parallel.
/// But there are two small complications:
/// 1. One ClickHouse column can translate to multiple leaf columns in parquet.
/// E.g. tuples and maps.
/// If all primitive columns are in one big tuple, we'd like to encode them in parallel too,
/// even though they're one top-level ClickHouse column.
/// 2. At the end of each encoded column chunk there's a footer (struct ColumnMetaData) that
/// contains some absolute offsets in the file. We can't encode it until we know the exact
/// position in the file where the column chunk will go. So these footers have to be serialized
/// sequentially, after we know sizes of all previous column chunks.
///
/// With that in mind, here's how to write a parquet file:
///
/// (1) writeFileHeader()
/// (2) For each row group:
/// | (3) For each ClickHouse column:
/// | (4) Call prepareColumnForWrite().
/// | It'll produce one or more ColumnChunkWriteStates, corresponding to primitive columns that
/// | we need to write.
/// | It'll also produce SchemaElements as a byproduct, describing the logical types and
/// | groupings of the physical columns (e.g. tuples, arrays, maps).
/// | (5) For each ColumnChunkWriteState:
/// | (6) Call writeColumnChunkBody() to write the actual data to the given WriteBuffer.
/// | (7) Call finalizeColumnChunkAndWriteFooter() to write the footer of the column chunk.
/// | (8) Call makeRowGroup() using the ColumnChunk metadata structs from previous step.
/// (9) Call writeFileFooter() using the row groups from previous step and SchemaElements from
/// convertSchema().
///
/// Steps (4) and (6) can be parallelized, both within and across row groups.
/// Parquet schema is a tree of SchemaElements, flattened into a list in depth-first order.
/// Leaf nodes correspond to physical columns of primitive types. Inner nodes describe logical
/// groupings of those columns, e.g. tuples or structs.
SchemaElements convertSchema(const Block & sample, const WriteOptions & options);
void prepareColumnForWrite(
ColumnPtr column, DataTypePtr type, const std::string & name, const WriteOptions & options,
ColumnChunkWriteStates * out_columns_to_write, SchemaElements * out_schema = nullptr);
void writeFileHeader(WriteBuffer & out);
/// Encodes a column chunk, without the footer.
/// The ColumnChunkWriteState-s should then be passed to finalizeColumnChunkAndWriteFooter().
void writeColumnChunkBody(ColumnChunkWriteState & s, const WriteOptions & options, WriteBuffer & out);
/// Unlike most of the column chunk data, the footer (`ColumnMetaData`) needs to know its absolute
/// offset in the file. So we encode it separately, after all previous row groups and column chunks
/// have been encoded.
/// (If you're wondering if the 8-byte offset values can be patched inside the encoded blob - no,
/// they're varint-encoded and can't be padded to a fixed length.)
/// `offset_in_file` is the absolute position in the file where the writeColumnChunkBody()'s output
/// starts.
/// Returns a ColumnChunk to add to the RowGroup.
parquet::format::ColumnChunk finalizeColumnChunkAndWriteFooter(
size_t offset_in_file, ColumnChunkWriteState s, const WriteOptions & options, WriteBuffer & out);
parquet::format::RowGroup makeRowGroup(std::vector<parquet::format::ColumnChunk> column_chunks, size_t num_rows);
void writeFileFooter(std::vector<parquet::format::RowGroup> row_groups, SchemaElements schema, const WriteOptions & options, WriteBuffer & out);
}

View File

@ -59,7 +59,12 @@ ParquetBlockInputFormat::ParquetBlockInputFormat(
pool = std::make_unique<ThreadPool>(CurrentMetrics::ParquetDecoderThreads, CurrentMetrics::ParquetDecoderThreadsActive, max_decoding_threads);
}
ParquetBlockInputFormat::~ParquetBlockInputFormat() = default;
/// Sets the stop flag and then waits on the thread pool, if one was created.
/// NOTE(review): presumably the pool's tasks check `is_stopped` and use members of this object,
/// which is why they have to be waited for before the members are destroyed - confirm.
ParquetBlockInputFormat::~ParquetBlockInputFormat()
{
is_stopped = true;
if (pool)
pool->wait();
}
void ParquetBlockInputFormat::initializeIfNeeded()
{

View File

@ -3,14 +3,23 @@
#if USE_PARQUET
#include <Formats/FormatFactory.h>
#include <IO/WriteBufferFromVector.h>
#include <parquet/arrow/writer.h>
#include "ArrowBufferedStreams.h"
#include "CHColumnToArrowColumn.h"
namespace CurrentMetrics
{
extern const Metric ParquetEncoderThreads;
extern const Metric ParquetEncoderThreadsActive;
}
namespace DB
{
using namespace Parquet;
namespace ErrorCodes
{
extern const int UNKNOWN_EXCEPTION;
@ -59,19 +68,229 @@ namespace
if (method == FormatSettings::ParquetCompression::GZIP)
return parquet::Compression::type::GZIP;
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported compression method");
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported parquet compression method");
}
}
/// Builds the output format. When the custom encoder is enabled this also creates the encoder
/// thread pool (only if parallel encoding is on and more than one thread is allowed), translates
/// the format settings into Parquet::WriteOptions and converts the header into a parquet schema.
ParquetBlockOutputFormat::ParquetBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_)
    : IOutputFormat(header_, out_), format_settings{format_settings_}
{
    const auto & parquet_settings = format_settings.parquet;

    /// Everything below is only needed by the custom encoder.
    if (!parquet_settings.use_custom_encoder)
        return;

    const bool encode_in_parallel = parquet_settings.parallel_encoding && format_settings.max_threads > 1;
    if (encode_in_parallel)
    {
        pool = std::make_unique<ThreadPool>(
            CurrentMetrics::ParquetEncoderThreads,
            CurrentMetrics::ParquetEncoderThreadsActive,
            format_settings.max_threads);
    }

    /// Map the format-level compression setting onto the encoder's compression method.
    using Method = FormatSettings::ParquetCompression;
    switch (parquet_settings.output_compression_method)
    {
        case Method::NONE:
            options.compression = CompressionMethod::None;
            break;
        case Method::SNAPPY:
            options.compression = CompressionMethod::Snappy;
            break;
        case Method::ZSTD:
            options.compression = CompressionMethod::Zstd;
            break;
        case Method::LZ4:
            options.compression = CompressionMethod::Lz4;
            break;
        case Method::GZIP:
            options.compression = CompressionMethod::Gzip;
            break;
        case Method::BROTLI:
            options.compression = CompressionMethod::Brotli;
            break;
    }

    options.output_string_as_string = parquet_settings.output_string_as_string;
    options.output_fixed_string_as_fixed_byte_array = parquet_settings.output_fixed_string_as_fixed_byte_array;
    options.data_page_size = parquet_settings.data_page_size;
    options.write_batch_size = parquet_settings.write_batch_size;

    schema = convertSchema(header_, options);
}
void ParquetBlockOutputFormat::consumeStaged()
ParquetBlockOutputFormat::~ParquetBlockOutputFormat()
{
const size_t columns_num = staging_chunks.at(0).getNumColumns();
if (pool)
{
is_stopped = true;
pool->wait();
}
}
/// Accepts the next chunk of rows.
///
/// With a thread pool, first polls the background encoders: writes out row groups that are
/// complete, rethrows an exception captured by a worker, returns early if the format was
/// stopped, and waits on the condition variable while too much work is in flight.
///
/// The chunk is then staged. Once the staged rows or bytes reach the configured row group
/// thresholds, the staged data is handed to writeRowGroup(), split into several row groups if
/// it holds at least twice the target number of rows.
void ParquetBlockOutputFormat::consume(Chunk chunk)
{
    /// Poll background tasks.
    if (pool)
    {
        std::unique_lock lock(mutex);
        while (true)
        {
            /// If some row groups are ready to be written to the file, write them.
            reapCompletedRowGroups(lock);

            if (background_exception)
                std::rethrow_exception(background_exception);

            if (is_stopped)
                return;

            /// If there's too much work in flight, wait for some of it to complete.
            if (row_groups.size() < 2)
                break;
            if (bytes_in_flight <= format_settings.parquet.row_group_bytes * 4 &&
                task_queue.size() <= format_settings.max_threads * 4)
                break;

            condvar.wait(lock);
        }
    }

    /// Do something like SquashingTransform to produce big enough row groups.
    /// Because the real SquashingTransform is only used for INSERT, not for SELECT ... INTO OUTFILE.
    /// The latter doesn't even have a pipeline where a transform could be inserted, so it's more
    /// convenient to do the squashing here. It's also parallelized here.
    if (chunk.getNumRows() != 0)
    {
        staging_rows += chunk.getNumRows();
        staging_bytes += chunk.bytes();
        staging_chunks.push_back(std::move(chunk));
    }

    /// Nothing is staged (only empty chunks arrived so far). Without this guard a
    /// `row_group_bytes` setting of 0 makes the threshold check below fall through and
    /// writeRowGroup() receives an empty vector, whose first element is then accessed.
    if (staging_chunks.empty())
        return;

    const size_t target_rows = std::max(static_cast<UInt64>(1), format_settings.parquet.row_group_rows);

    if (staging_rows < target_rows &&
        staging_bytes < format_settings.parquet.row_group_bytes)
        return;

    /// In the rare case that more than `row_group_rows` rows arrived in one chunk, split the
    /// staging chunk into multiple row groups.
    /// `staging_rows / target_rows >= 2` is the same test as `staging_rows >= target_rows * 2`
    /// but cannot overflow when `row_group_rows` is very large.
    const size_t num_row_groups = staging_rows / target_rows;
    if (num_row_groups >= 2)
    {
        /// Increase row group size slightly (by < 2x) to avoid a small row group at the end.
        const size_t row_group_size = (staging_rows - 1) / num_row_groups + 1; // round up

        Chunk concatenated = std::move(staging_chunks[0]);
        for (size_t i = 1; i < staging_chunks.size(); ++i)
            concatenated.append(staging_chunks[i]);
        staging_chunks.clear();

        for (size_t offset = 0; offset < staging_rows; offset += row_group_size)
        {
            size_t count = std::min(row_group_size, staging_rows - offset);
            MutableColumns columns = concatenated.cloneEmptyColumns();
            for (size_t i = 0; i < columns.size(); ++i)
                columns[i]->insertRangeFrom(*concatenated.getColumns()[i], offset, count);

            Chunks piece;
            piece.emplace_back(std::move(columns), count, concatenated.getChunkInfo());
            writeRowGroup(std::move(piece));
        }
    }
    else
    {
        writeRowGroup(std::move(staging_chunks));
    }

    staging_chunks.clear();
    staging_rows = 0;
    staging_bytes = 0;
}
void ParquetBlockOutputFormat::finalizeImpl()
{
if (!staging_chunks.empty())
writeRowGroup(std::move(staging_chunks));
if (format_settings.parquet.use_custom_encoder)
{
if (pool)
{
std::unique_lock lock(mutex);
/// Wait for background work to complete.
while (true)
{
reapCompletedRowGroups(lock);
if (background_exception)
std::rethrow_exception(background_exception);
if (is_stopped)
return;
if (row_groups.empty())
break;
condvar.wait(lock);
}
}
if (row_groups_complete.empty())
{
base_offset = out.count();
writeFileHeader(out);
}
writeFileFooter(std::move(row_groups_complete), schema, options, out);
}
else
{
if (!file_writer)
{
Block header = materializeBlock(getPort(PortKind::Main).getHeader());
std::vector<Chunk> chunks;
chunks.push_back(Chunk(header.getColumns(), 0));
writeRowGroup(std::move(chunks));
}
if (file_writer)
{
auto status = file_writer->Close();
if (!status.ok())
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while closing a table: {}", status.ToString());
}
}
}
/// Returns the formatter to its initial state: waits on the thread pool (with the stop flag
/// raised for the duration of the wait) and then drops all accumulated state.
void ParquetBlockOutputFormat::resetFormatterImpl()
{
    if (pool)
    {
        /// The flag is raised only while waiting on the pool and lowered again afterwards.
        is_stopped = true;
        pool->wait();
        is_stopped = false;
    }

    /// State of the background encoding.
    background_exception = nullptr;
    threads_running = 0;
    task_queue.clear();
    row_groups.clear();

    /// State of the file being written.
    file_writer.reset();
    row_groups_complete.clear();

    /// Rows that were staged but not yet turned into a row group.
    staging_chunks.clear();
    staging_rows = 0;
    staging_bytes = 0;
}
/// Cancellation hook: only raises the stop flag, which consume(), finalizeImpl() and the
/// encoder threads check.
void ParquetBlockOutputFormat::onCancel()
{
    is_stopped.store(true);
}
void ParquetBlockOutputFormat::writeRowGroup(std::vector<Chunk> chunks)
{
if (pool)
writeRowGroupInParallel(std::move(chunks));
else if (!format_settings.parquet.use_custom_encoder)
writeUsingArrow(std::move(chunks));
else
{
Chunk concatenated = std::move(chunks[0]);
for (size_t i = 1; i < chunks.size(); ++i)
concatenated.append(chunks[i]);
chunks.clear();
writeRowGroupInOneThread(std::move(concatenated));
}
}
void ParquetBlockOutputFormat::writeUsingArrow(std::vector<Chunk> chunks)
{
const size_t columns_num = chunks.at(0).getNumColumns();
std::shared_ptr<arrow::Table> arrow_table;
if (!ch_column_to_arrow_column)
@ -85,7 +304,7 @@ void ParquetBlockOutputFormat::consumeStaged()
format_settings.parquet.output_fixed_string_as_fixed_byte_array);
}
ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, staging_chunks, columns_num);
ch_column_to_arrow_column->chChunkToArrowTable(arrow_table, chunks, columns_num);
if (!file_writer)
{
@ -112,64 +331,234 @@ void ParquetBlockOutputFormat::consumeStaged()
file_writer = std::move(result.ValueOrDie());
}
// TODO: calculate row_group_size depending on a number of rows and table size
// allow slightly bigger than row_group_size to avoid a very small tail row group
auto status = file_writer->WriteTable(*arrow_table, std::max<size_t>(format_settings.parquet.row_group_rows, staging_rows));
auto status = file_writer->WriteTable(*arrow_table, INT64_MAX);
if (!status.ok())
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while writing a table: {}", status.ToString());
}
void ParquetBlockOutputFormat::consume(Chunk chunk)
{
/// Do something like SquashingTransform to produce big enough row groups.
/// Because the real SquashingTransform is only used for INSERT, not for SELECT ... INTO OUTFILE.
/// The latter doesn't even have a pipeline where a transform could be inserted, so it's more
/// convenient to do the squashing here.
staging_rows += chunk.getNumRows();
staging_bytes += chunk.bytes();
staging_chunks.push_back(std::move(chunk));
chassert(staging_chunks.back().getNumColumns() == staging_chunks.front().getNumColumns());
if (staging_rows < format_settings.parquet.row_group_rows &&
staging_bytes < format_settings.parquet.row_group_bytes)
/// Encodes `chunk` as a single row group on the calling thread and writes it to `out`.
/// A chunk with no rows is ignored. The file header is written first if no row group has been
/// completed yet. The resulting row group metadata is appended to `row_groups_complete`.
void ParquetBlockOutputFormat::writeRowGroupInOneThread(Chunk chunk)
{
    if (chunk.getNumRows() == 0)
        return;

    const Block & header = getPort(PortKind::Main).getHeader();
    chassert(header.columns() == chunk.getNumColumns());

    /// One header column may produce several write states.
    Parquet::ColumnChunkWriteStates write_states;
    for (size_t pos = 0; pos < header.columns(); ++pos)
    {
        const auto & column_desc = header.getByPosition(pos);
        prepareColumnForWrite(chunk.getColumns()[pos], column_desc.type, column_desc.name, options, &write_states);
    }

    if (row_groups_complete.empty())
    {
        base_offset = out.count();
        writeFileHeader(out);
    }

    std::vector<parquet::format::ColumnChunk> chunk_metadata;
    for (auto & state : write_states)
    {
        /// Offsets are relative to where the file header started.
        const size_t body_offset = out.count() - base_offset;
        writeColumnChunkBody(state, options, out);
        chunk_metadata.push_back(finalizeColumnChunkAndWriteFooter(body_offset, std::move(state), options, out));
    }

    row_groups_complete.push_back(makeRowGroup(std::move(chunk_metadata), chunk.getNumRows()));
}
void ParquetBlockOutputFormat::writeRowGroupInParallel(std::vector<Chunk> chunks)
{
std::unique_lock lock(mutex);
const Block & header = getPort(PortKind::Main).getHeader();
RowGroupState & r = row_groups.emplace_back();
r.column_chunks.resize(header.columns());
r.tasks_in_flight = r.column_chunks.size();
std::vector<Columns> columnses;
for (auto & chunk : chunks)
{
chassert(header.columns() == chunk.getNumColumns());
r.num_rows += chunk.getNumRows();
columnses.push_back(chunk.detachColumns());
}
for (size_t i = 0; i < header.columns(); ++i)
{
Task & t = task_queue.emplace_back(&r, i, this);
t.column_type = header.getByPosition(i).type;
t.column_name = header.getByPosition(i).name;
/// Defer concatenating the columns to the threads.
size_t bytes = 0;
for (size_t j = 0; j < chunks.size(); ++j)
{
auto & col = columnses[j][i];
bytes += col->allocatedBytes();
t.column_pieces.push_back(std::move(col));
}
t.mem.set(bytes);
}
startMoreThreadsIfNeeded(lock);
}
void ParquetBlockOutputFormat::reapCompletedRowGroups(std::unique_lock<std::mutex> & lock)
{
while (!row_groups.empty() && row_groups.front().tasks_in_flight == 0 && !is_stopped)
{
RowGroupState & r = row_groups.front();
/// Write to the file.
lock.unlock();
if (row_groups_complete.empty())
{
base_offset = out.count();
writeFileHeader(out);
}
std::vector<parquet::format::ColumnChunk> metadata;
for (auto & cols : r.column_chunks)
{
for (ColumnChunk & col : cols)
{
size_t offset = out.count() - base_offset;
out.write(col.serialized.data(), col.serialized.size());
auto m = finalizeColumnChunkAndWriteFooter(offset, std::move(col.state), options, out);
metadata.push_back(std::move(m));
}
}
row_groups_complete.push_back(makeRowGroup(std::move(metadata), r.num_rows));
lock.lock();
row_groups.pop_front();
}
}
void ParquetBlockOutputFormat::startMoreThreadsIfNeeded(const std::unique_lock<std::mutex> &)
{
/// Speculate that all current threads are already working on tasks.
size_t to_add = std::min(task_queue.size(), format_settings.max_threads - threads_running);
for (size_t i = 0; i < to_add; ++i)
{
auto job = [this, thread_group = CurrentThread::getGroup()]()
{
if (thread_group)
CurrentThread::attachToGroupIfDetached(thread_group);
SCOPE_EXIT_SAFE(if (thread_group) CurrentThread::detachFromGroupIfNotDetached(););
try
{
setThreadName("ParquetEncoder");
threadFunction();
}
catch (...)
{
std::lock_guard lock(mutex);
background_exception = std::current_exception();
condvar.notify_all();
--threads_running;
}
};
if (threads_running == 0)
{
/// First thread. We need it to succeed; otherwise we may get stuck.
pool->scheduleOrThrowOnError(job);
++threads_running;
}
else
{
consumeStaged();
staging_chunks.clear();
staging_rows = 0;
staging_bytes = 0;
/// More threads. This may be called from inside the thread pool, so avoid waiting;
/// otherwise it may deadlock.
if (!pool->trySchedule(job))
break;
}
}
}
void ParquetBlockOutputFormat::finalizeImpl()
void ParquetBlockOutputFormat::threadFunction()
{
if (!file_writer && staging_chunks.empty())
{
Block header = materializeBlock(getPort(PortKind::Main).getHeader());
std::unique_lock lock(mutex);
consume(Chunk(header.getColumns(), 0)); // this will make staging_chunks non-empty
while (true)
{
if (task_queue.empty() || is_stopped)
{
/// The check and the decrement need to be in the same critical section, to make sure
/// we never get stuck with tasks but no threads.
--threads_running;
return;
}
if (!staging_chunks.empty())
auto task = std::move(task_queue.front());
task_queue.pop_front();
if (task.column_type)
{
consumeStaged();
staging_chunks.clear();
staging_rows = 0;
staging_bytes = 0;
lock.unlock();
IColumn::MutablePtr concatenated = IColumn::mutate(std::move(task.column_pieces[0]));
for (size_t i = 1; i < task.column_pieces.size(); ++i)
{
auto & c = task.column_pieces[i];
concatenated->insertRangeFrom(*c, 0, c->size());
c.reset();
}
task.column_pieces.clear();
std::vector<ColumnChunkWriteState> subcolumns;
prepareColumnForWrite(
std::move(concatenated), task.column_type, task.column_name, options, &subcolumns);
lock.lock();
for (size_t i = 0; i < subcolumns.size(); ++i)
{
task.row_group->column_chunks[task.column_idx].emplace_back(this);
task.row_group->tasks_in_flight += 1;
auto & t = task_queue.emplace_back(task.row_group, task.column_idx, this);
t.subcolumn_idx = i;
t.state = std::move(subcolumns[i]);
t.mem.set(t.state.allocatedBytes());
}
auto status = file_writer->Close();
if (!status.ok())
throw Exception(ErrorCodes::UNKNOWN_EXCEPTION, "Error while closing a table: {}", status.ToString());
startMoreThreadsIfNeeded(lock);
}
else
{
lock.unlock();
PODArray<char> serialized;
{
WriteBufferFromVector buf(serialized);
writeColumnChunkBody(task.state, options, buf);
}
void ParquetBlockOutputFormat::resetFormatterImpl()
{
file_writer.reset();
lock.lock();
auto & c = task.row_group->column_chunks[task.column_idx][task.subcolumn_idx];
c.state = std::move(task.state);
c.serialized = std::move(serialized);
c.mem.set(c.serialized.size() + c.state.allocatedBytes());
}
--task.row_group->tasks_in_flight;
condvar.notify_all();
}
}
void registerOutputFormatParquet(FormatFactory & factory)

View File

@ -2,8 +2,11 @@
#include "config.h"
#if USE_PARQUET
#include <Processors/Formats/IOutputFormat.h>
#include <Processors/Formats/Impl/Parquet/Write.h>
#include <Formats/FormatSettings.h>
#include <Common/ThreadPool.h>
namespace arrow
{
@ -28,25 +31,129 @@ class ParquetBlockOutputFormat : public IOutputFormat
{
public:
ParquetBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_);
~ParquetBlockOutputFormat() override;
String getName() const override { return "ParquetBlockOutputFormat"; }
String getContentType() const override { return "application/octet-stream"; }
private:
void consumeStaged();
/// Move-only token that keeps `bytes` accounted in the parent's `bytes_in_flight` counter for
/// as long as the token lives. The accounted amount is changed with set() and dropped to zero
/// by the destructor. A moved-from token has a null parent and zero bytes.
struct MemoryToken
{
    ParquetBlockOutputFormat * parent;
    size_t bytes = 0;

    explicit MemoryToken(ParquetBlockOutputFormat * p, size_t b = 0) : parent(p)
    {
        set(b);
    }

    MemoryToken(MemoryToken && t) noexcept
        : parent(std::exchange(t.parent, nullptr)), bytes(std::exchange(t.bytes, 0)) {}

    MemoryToken & operator=(MemoryToken && t) noexcept
    {
        if (this != &t)
        {
            /// Release what this token currently accounts for; otherwise overwriting `bytes`
            /// would leave that amount in the parent's counter forever.
            set(0);
            parent = std::exchange(t.parent, nullptr);
            bytes = std::exchange(t.bytes, 0);
        }
        return *this;
    }

    ~MemoryToken()
    {
        set(0);
    }

    /// Changes the accounted amount to `new_size` and adjusts the parent's counter by the
    /// difference. Does not touch `parent` when the amount is unchanged, which is what makes
    /// destroying a moved-from token (null parent, zero bytes) safe.
    void set(size_t new_size)
    {
        if (new_size == bytes)
            return;
        parent->bytes_in_flight += new_size - bytes; // overflow is fine
        bytes = new_size;
    }
};
/// One encoded column chunk of a row group: its write state, the serialized bytes and the
/// memory accounting token for both.
struct ColumnChunk
{
    Parquet::ColumnChunkWriteState state;
    PODArray<char> serialized;

    MemoryToken mem;

    /// `explicit` so that a bare ParquetBlockOutputFormat pointer never converts to a
    /// ColumnChunk implicitly; in-place construction such as emplace_back(ptr) is unaffected.
    explicit ColumnChunk(ParquetBlockOutputFormat * p) : mem(p) {}
};
/// Bookkeeping for one row group that is being encoded.
struct RowGroupState
{
    /// Number of queued or running tasks that belong to this row group.
    size_t tasks_in_flight{0};

    /// Outer index: column of the header; inner index: subcolumn produced from it.
    std::vector<std::vector<ColumnChunk>> column_chunks;

    size_t num_rows{0};
};
/// A unit of work for an encoder thread. It refers to a row group and a column in it and
/// carries either raw column pieces (to be prepared) or a write state (to be serialized).
struct Task
{
    RowGroupState * row_group;
    size_t column_idx;
    size_t subcolumn_idx = 0;

    MemoryToken mem;

    /// If not null, we need to call prepareColumnForWrite().
    /// Otherwise we need to call writeColumnChunkBody().
    DataTypePtr column_type;
    std::string column_name;
    std::vector<ColumnPtr> column_pieces;

    Parquet::ColumnChunkWriteState state;

    Task(RowGroupState * row_group_, size_t column_idx_, ParquetBlockOutputFormat * parent_)
        : row_group(row_group_)
        , column_idx(column_idx_)
        , mem(parent_)
    {
    }
};
void consume(Chunk) override;
void finalizeImpl() override;
void resetFormatterImpl() override;
void onCancel() override;
void writeRowGroup(std::vector<Chunk> chunks);
void writeUsingArrow(std::vector<Chunk> chunks);
void writeRowGroupInOneThread(Chunk chunk);
void writeRowGroupInParallel(std::vector<Chunk> chunks);
void threadFunction();
void startMoreThreadsIfNeeded(const std::unique_lock<std::mutex> & lock);
/// Called in single-threaded fashion. Writes to the file.
void reapCompletedRowGroups(std::unique_lock<std::mutex> & lock);
const FormatSettings format_settings;
/// Chunks to squash together to form a row group.
std::vector<Chunk> staging_chunks;
size_t staging_rows = 0;
size_t staging_bytes = 0;
const FormatSettings format_settings;
std::unique_ptr<parquet::arrow::FileWriter> file_writer;
std::unique_ptr<CHColumnToArrowColumn> ch_column_to_arrow_column;
Parquet::WriteOptions options;
Parquet::SchemaElements schema;
std::vector<parquet::format::RowGroup> row_groups_complete;
size_t base_offset = 0;
std::mutex mutex;
std::condition_variable condvar; // wakes up consume()
std::unique_ptr<ThreadPool> pool;
std::atomic_bool is_stopped{false};
std::exception_ptr background_exception = nullptr;
/// Invariant: if there's at least one task then there's at least one thread.
size_t threads_running = 0;
std::atomic<size_t> bytes_in_flight{0};
std::deque<Task> task_queue;
std::deque<RowGroupState> row_groups;
};
}

138
src/Server/ServerType.cpp Normal file
View File

@ -0,0 +1,138 @@
#include <Server/ServerType.h>
#include <vector>
#include <algorithm>
#include <base/types.h>
#include <magic_enum.hpp>
namespace DB
{
namespace
{
std::vector<std::string> getTypeIndexToTypeName()
{
constexpr std::size_t types_size = magic_enum::enum_count<ServerType::Type>();
std::vector<std::string> type_index_to_type_name;
type_index_to_type_name.resize(types_size);
auto entries = magic_enum::enum_entries<ServerType::Type>();
for (const auto & [entry, str] : entries)
{
auto str_copy = String(str);
std::replace(str_copy.begin(), str_copy.end(), '_', ' ');
type_index_to_type_name[static_cast<UInt64>(entry)] = std::move(str_copy);
}
return type_index_to_type_name;
}
}
const char * ServerType::serverTypeToString(ServerType::Type type)
{
    /** The returned pointer must refer to statically allocated storage: when a SystemQuery
      * fails to parse, this description is added to the Expected variants (see IParser.h).
      * Hence the table is a function-local static that is built once and never modified.
      */
    static std::vector<std::string> type_index_to_type_name = getTypeIndexToTypeName();

    const auto index = static_cast<UInt64>(type);
    return type_index_to_type_name[index].data();
}
/// Tells whether a server of kind `server_type` (with `custom_name_` for custom protocols) is
/// selected by this ServerType.
/// QUERIES_ALL selects everything, QUERIES_DEFAULT selects the built-in protocol kinds listed
/// below, QUERIES_CUSTOM selects only CUSTOM; any other value requires an exact match of both
/// the kind and the custom name.
bool ServerType::shouldStart(Type server_type, const std::string & custom_name_) const
{
    switch (type)
    {
        case Type::QUERIES_ALL:
            return true;

        case Type::QUERIES_DEFAULT:
        {
            switch (server_type)
            {
                case Type::TCP:
                case Type::TCP_WITH_PROXY:
                case Type::TCP_SECURE:
                case Type::HTTP:
                case Type::HTTPS:
                case Type::MYSQL:
                case Type::GRPC:
                case Type::POSTGRESQL:
                case Type::PROMETHEUS:
                case Type::INTERSERVER_HTTP:
                case Type::INTERSERVER_HTTPS:
                    return true;
                default:
                    return false;
            }
        }

        case Type::QUERIES_CUSTOM:
            return server_type == Type::CUSTOM;

        default:
            break;
    }

    const bool same_kind = type == server_type;
    return same_kind && custom_name == custom_name_;
}
/// Tells whether the server listening on the port configured under `port_name` is selected by
/// this ServerType.
///
/// `port_name` is a configuration key: either one of the fixed keys handled below
/// (e.g. "http_port", "prometheus.port") or "protocols.<name>.port" for a custom protocol,
/// in which case `<name>` is the custom name. Unrecognized keys are never selected.
bool ServerType::shouldStop(const std::string & port_name) const
{
    Type port_type;
    std::string port_custom_name;

    if (port_name == "http_port")
        port_type = Type::HTTP;
    else if (port_name == "https_port")
        port_type = Type::HTTPS;
    else if (port_name == "tcp_port")
        port_type = Type::TCP;
    else if (port_name == "tcp_with_proxy_port")
        port_type = Type::TCP_WITH_PROXY;
    else if (port_name == "tcp_port_secure")
        port_type = Type::TCP_SECURE;
    else if (port_name == "mysql_port")
        port_type = Type::MYSQL;
    else if (port_name == "postgresql_port")
        port_type = Type::POSTGRESQL;
    else if (port_name == "grpc_port")
        port_type = Type::GRPC;
    else if (port_name == "prometheus.port")
        port_type = Type::PROMETHEUS;
    else if (port_name == "interserver_http_port")
        port_type = Type::INTERSERVER_HTTP;
    else if (port_name == "interserver_https_port")
        port_type = Type::INTERSERVER_HTTPS;
    else if (port_name.starts_with("protocols.") && port_name.ends_with(".port"))
    {
        constexpr size_t protocols_size = std::string_view("protocols.").size();
        constexpr size_t port_size = std::string_view(".port").size();

        if (port_name.size() < protocols_size + port_size)
        {
            /// Prefix and suffix overlap (the key is "protocols.port"): there is no protocol
            /// name in between, and the subtraction below would wrap around.
            port_type = Type::UNKNOWN;
        }
        else
        {
            port_type = Type::CUSTOM;
            /// Keep only what lies between the "protocols." prefix and the ".port" suffix.
            port_custom_name = port_name.substr(protocols_size, port_name.size() - protocols_size - port_size);
        }
    }
    else
        port_type = Type::UNKNOWN;

    if (port_type == Type::UNKNOWN)
        return false;

    /// Match against the kind of the port, not against our own `type`.
    return shouldStart(port_type, port_custom_name);
}
}

44
src/Server/ServerType.h Normal file
View File

@ -0,0 +1,44 @@
#pragma once
#include <base/types.h>
namespace DB
{
/// Describes a kind of server endpoint, or a group of them, optionally narrowed to one
/// custom protocol by name.
class ServerType
{
public:
    enum Type
    {
        UNKNOWN,
        TCP,
        TCP_WITH_PROXY,
        TCP_SECURE,
        HTTP,
        HTTPS,
        MYSQL,
        GRPC,
        POSTGRESQL,
        PROMETHEUS,
        CUSTOM,
        INTERSERVER_HTTP,
        INTERSERVER_HTTPS,
        QUERIES_ALL,
        QUERIES_DEFAULT,
        QUERIES_CUSTOM,
        END
    };

    /// A default-constructed ServerType is UNKNOWN with an empty custom name.
    ServerType() = default;
    explicit ServerType(Type type_, const std::string & custom_name_ = "") : type(type_), custom_name(custom_name_) {}

    /// Returns a description of `type`.
    static const char * serverTypeToString(Type type);

    /// Whether a server of kind `server_type` (and custom name `custom_name_`) is selected by this object.
    bool shouldStart(Type server_type, const std::string & custom_name_ = "") const;

    /// Whether the server configured under the port key `port_name` is selected by this object.
    bool shouldStop(const std::string & port_name) const;

    /// Initialized so that the defaulted constructor does not leave it indeterminate.
    Type type = Type::UNKNOWN;
    std::string custom_name;
};
}

View File

@ -243,6 +243,15 @@ void GinIndexStore::finalize()
{
if (!current_postings.empty())
writeSegment();
if (metadata_file_stream)
metadata_file_stream->finalize();
if (dict_file_stream)
dict_file_stream->finalize();
if (postings_file_stream)
postings_file_stream->finalize();
}
void GinIndexStore::initFileStreams()
@ -319,13 +328,8 @@ void GinIndexStore::writeSegment()
current_segment.segment_id = getNextSegmentID();
metadata_file_stream->sync();
metadata_file_stream->finalize();
dict_file_stream->sync();
dict_file_stream->finalize();
postings_file_stream->sync();
postings_file_stream->finalize();
}
GinIndexStoreDeserializer::GinIndexStoreDeserializer(const GinIndexStorePtr & store_)

View File

@ -328,7 +328,10 @@ MergeTreePrefetchedReadPool::PartsInfos MergeTreePrefetchedReadPool::getPartsInf
for (const auto & range : part.ranges)
part_info->sum_marks += range.end - range.begin;
part_info->approx_size_of_mark = getApproximateSizeOfGranule(*part_info->data_part, column_names);
const auto & columns = settings.merge_tree_determine_task_size_by_prewhere_columns && prewhere_info
? prewhere_info->prewhere_actions->getRequiredColumnsNames()
: column_names;
part_info->approx_size_of_mark = getApproximateSizeOfGranule(*part_info->data_part, columns);
const auto task_columns = getReadTaskColumns(
part_reader_info,
@ -369,9 +372,9 @@ MergeTreePrefetchedReadPool::PartsInfos MergeTreePrefetchedReadPool::getPartsInf
}
if (prewhere_info)
{
for (const auto & columns : task_columns.pre_columns)
for (const auto & cols : task_columns.pre_columns)
{
for (const auto & col : columns)
for (const auto & col : cols)
{
const size_t col_size = part.data_part->getColumnSize(col.name).data_compressed;
part_info->estimated_memory_usage_for_single_prefetch += std::min<size_t>(col_size, settings.prefetch_buffer_size);

View File

@ -73,8 +73,10 @@ MergeTreeReadPool::MergeTreeReadPool(
size_t total_marks = 0;
for (const auto & part : parts_ranges)
{
total_compressed_bytes += getApproxSizeOfPart(
*part.data_part, prewhere_info ? prewhere_info->prewhere_actions->getRequiredColumnsNames() : column_names_);
const auto & columns = settings.merge_tree_determine_task_size_by_prewhere_columns && prewhere_info
? prewhere_info->prewhere_actions->getRequiredColumnsNames()
: column_names_;
total_compressed_bytes += getApproxSizeOfPart(*part.data_part, columns);
total_marks += part.getMarksCount();
}

View File

@ -156,7 +156,7 @@ public:
void checkTableCanBeDropped() const override {}
private:
mutable std::mutex nested_mutex;
mutable std::recursive_mutex nested_mutex;
mutable GetNestedStorageFunc get_nested;
mutable StoragePtr nested;
const bool add_conversion;

View File

@ -128,6 +128,7 @@
02581_share_big_sets_between_mutation_tasks_long
02581_share_big_sets_between_multiple_mutations_tasks_long
00992_system_parts_race_condition_zookeeper_long
02818_parameterized_view_with_cte_multiple_usage
02790_optimize_skip_unused_shards_join
01940_custom_tld_sharding_key
02815_range_dict_no_direct_join

View File

@ -0,0 +1,16 @@
<clickhouse>
<remote_servers>
<default>
<shard>
<replica>
<host>node1</host>
<port>9000</port>
</replica>
<replica>
<host>node2</host>
<port>9000</port>
</replica>
</shard>
</default>
</remote_servers>
</clickhouse>

View File

@ -0,0 +1,40 @@
#!/usr/bin/env python3
import pytest
import time
from helpers.cluster import ClickHouseCluster
from helpers.network import PartitionManager
from helpers.test_tools import assert_eq_with_retry
import random
import string
import json
cluster = ClickHouseCluster(__file__)
node1 = cluster.add_instance(
"node1", main_configs=["configs/cluster.xml"], with_zookeeper=True
)
node2 = cluster.add_instance(
"node2", main_configs=["configs/cluster.xml"], with_zookeeper=True
)
@pytest.fixture(scope="module")
def started_cluster():
    """Start the cluster once per module, yield it to the tests, and always shut it down."""
    try:
        cluster.start()

        yield cluster

    finally:
        cluster.shutdown()
def test_system_start_stop_listen_queries(started_cluster):
    """Stop all query listeners on node1, check it refuses connections, then bring them
    back with an ON CLUSTER query sent through node2."""
    node1.query("SYSTEM STOP LISTEN QUERIES ALL")

    refused_error = node1.query_and_get_error("SELECT 1", timeout=3)
    assert "Connection refused" in refused_error

    node2.query("SYSTEM START LISTEN ON CLUSTER default QUERIES ALL")

    # node1 must accept queries again.
    node1.query("SELECT 1")

View File

@ -7,7 +7,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
$CLICKHOUSE_CLIENT -q "drop table if exists mt"
$CLICKHOUSE_CLIENT -q "create table mt (n int) engine=MergeTree order by n settings parts_to_throw_insert=1000"
$CLICKHOUSE_CLIENT -q "create table mt (n int) engine=MergeTree order by n settings parts_to_throw_insert=5000"
$CLICKHOUSE_CLIENT -q "insert into mt values (1)"
$CLICKHOUSE_CLIENT -q "insert into mt values (2)"
$CLICKHOUSE_CLIENT -q "insert into mt values (3)"

View File

@ -138,6 +138,7 @@ SYSTEM FLUSH [] \N SYSTEM
SYSTEM THREAD FUZZER ['SYSTEM START THREAD FUZZER','SYSTEM STOP THREAD FUZZER','START THREAD FUZZER','STOP THREAD FUZZER'] GLOBAL SYSTEM
SYSTEM UNFREEZE ['SYSTEM UNFREEZE'] GLOBAL SYSTEM
SYSTEM FAILPOINT ['SYSTEM ENABLE FAILPOINT','SYSTEM DISABLE FAILPOINT'] GLOBAL SYSTEM
SYSTEM LISTEN ['SYSTEM START LISTEN','SYSTEM STOP LISTEN'] GLOBAL SYSTEM
SYSTEM [] \N ALL
dictGet ['dictHas','dictGetHierarchy','dictIsIn'] DICTIONARY ALL
displaySecretsInShowAndSelect [] GLOBAL ALL

View File

@ -2,5 +2,7 @@ CreatedReadBufferMMap
CreatedReadBufferMMapFailed
MMappedFileCacheHits
MMappedFileCacheMisses
MMappedAllocBytes
MMappedAllocs
MMappedFileBytes
MMappedFiles

View File

@ -17,4 +17,3 @@ with client(name="client1>", log=log) as client1:
client1.send("SELECT number FROM numbers(1000) FORMAT Null")
client1.expect("Progress: 1\.00 thousand rows, 8\.00 KB .*" + end_of_block)
client1.expect("0 rows in set. Elapsed: [\\w]{1}\.[\\w]{3} sec.")
client1.expect("Peak memory usage \(for query\) .*B" + end_of_block)

View File

@ -9,7 +9,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
${CLICKHOUSE_CLIENT} --multiquery --query "DROP TABLE IF EXISTS t; CREATE TABLE t (x UInt64) ENGINE = Memory;"
# Rate limit is chosen for operation to spent more than one second.
seq 1 1000 | pv --quiet --rate-limit 1000 | ${CLICKHOUSE_CLIENT} --query "INSERT INTO t FORMAT TSV"
seq 1 1000 | pv --quiet --rate-limit 500 | ${CLICKHOUSE_CLIENT} --query "INSERT INTO t FORMAT TSV"
# We check that the value of NetworkReceiveElapsedMicroseconds correctly includes the time spent waiting data from the client.
${CLICKHOUSE_CLIENT} --multiquery --query "SYSTEM FLUSH LOGS;

View File

@ -297,7 +297,7 @@ CREATE TABLE system.grants
(
`user_name` Nullable(String),
`role_name` Nullable(String),
`access_type` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 
'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION' = 96, 'NAMED COLLECTION ADMIN' = 97, 'SYSTEM SHUTDOWN' = 98, 'SYSTEM DROP DNS CACHE' = 99, 'SYSTEM DROP MARK CACHE' = 100, 'SYSTEM DROP UNCOMPRESSED CACHE' = 101, 'SYSTEM DROP MMAP CACHE' = 102, 'SYSTEM DROP QUERY CACHE' = 103, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 104, 'SYSTEM DROP FILESYSTEM CACHE' = 105, 'SYSTEM DROP SCHEMA CACHE' = 106, 'SYSTEM DROP S3 CLIENT CACHE' = 107, 'SYSTEM DROP CACHE' = 108, 'SYSTEM RELOAD CONFIG' = 109, 'SYSTEM RELOAD USERS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH ASYNC INSERT QUEUE' = 135, 'SYSTEM FLUSH' = 136, 'SYSTEM THREAD FUZZER' = 137, 'SYSTEM UNFREEZE' = 138, 'SYSTEM FAILPOINT' = 139, 'SYSTEM' = 140, 'dictGet' = 141, 'displaySecretsInShowAndSelect' = 142, 'addressToLine' = 143, 'addressToLineWithInlines' = 144, 'addressToSymbol' = 145, 'demangle' = 146, 'INTROSPECTION' = 147, 'FILE' = 148, 'URL' = 149, 'REMOTE' = 150, 'MONGO' = 151, 'REDIS' = 
152, 'MEILISEARCH' = 153, 'MYSQL' = 154, 'POSTGRES' = 155, 'SQLITE' = 156, 'ODBC' = 157, 'JDBC' = 158, 'HDFS' = 159, 'S3' = 160, 'HIVE' = 161, 'AZURE' = 162, 'SOURCES' = 163, 'CLUSTER' = 164, 'ALL' = 165, 'NONE' = 166),
`access_type` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 
'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION' = 96, 'NAMED COLLECTION ADMIN' = 97, 'SYSTEM SHUTDOWN' = 98, 'SYSTEM DROP DNS CACHE' = 99, 'SYSTEM DROP MARK CACHE' = 100, 'SYSTEM DROP UNCOMPRESSED CACHE' = 101, 'SYSTEM DROP MMAP CACHE' = 102, 'SYSTEM DROP QUERY CACHE' = 103, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 104, 'SYSTEM DROP FILESYSTEM CACHE' = 105, 'SYSTEM DROP SCHEMA CACHE' = 106, 'SYSTEM DROP S3 CLIENT CACHE' = 107, 'SYSTEM DROP CACHE' = 108, 'SYSTEM RELOAD CONFIG' = 109, 'SYSTEM RELOAD USERS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH ASYNC INSERT QUEUE' = 135, 'SYSTEM FLUSH' = 136, 'SYSTEM THREAD FUZZER' = 137, 'SYSTEM UNFREEZE' = 138, 'SYSTEM FAILPOINT' = 139, 'SYSTEM LISTEN' = 140, 'SYSTEM' = 141, 'dictGet' = 142, 'displaySecretsInShowAndSelect' = 143, 'addressToLine' = 144, 'addressToLineWithInlines' = 145, 'addressToSymbol' = 146, 'demangle' = 147, 'INTROSPECTION' = 148, 'FILE' = 149, 'URL' = 150, 'REMOTE' = 151, 
'MONGO' = 152, 'REDIS' = 153, 'MEILISEARCH' = 154, 'MYSQL' = 155, 'POSTGRES' = 156, 'SQLITE' = 157, 'ODBC' = 158, 'JDBC' = 159, 'HDFS' = 160, 'S3' = 161, 'HIVE' = 162, 'AZURE' = 163, 'SOURCES' = 164, 'CLUSTER' = 165, 'ALL' = 166, 'NONE' = 167),
`database` Nullable(String),
`table` Nullable(String),
`column` Nullable(String),
@ -584,10 +584,10 @@ ENGINE = SystemPartsColumns
COMMENT 'SYSTEM TABLE is built on the fly.'
CREATE TABLE system.privileges
(
`privilege` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER 
QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION' = 96, 'NAMED COLLECTION ADMIN' = 97, 'SYSTEM SHUTDOWN' = 98, 'SYSTEM DROP DNS CACHE' = 99, 'SYSTEM DROP MARK CACHE' = 100, 'SYSTEM DROP UNCOMPRESSED CACHE' = 101, 'SYSTEM DROP MMAP CACHE' = 102, 'SYSTEM DROP QUERY CACHE' = 103, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 104, 'SYSTEM DROP FILESYSTEM CACHE' = 105, 'SYSTEM DROP SCHEMA CACHE' = 106, 'SYSTEM DROP S3 CLIENT CACHE' = 107, 'SYSTEM DROP CACHE' = 108, 'SYSTEM RELOAD CONFIG' = 109, 'SYSTEM RELOAD USERS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH ASYNC INSERT QUEUE' = 135, 'SYSTEM FLUSH' = 136, 'SYSTEM THREAD FUZZER' = 137, 'SYSTEM UNFREEZE' = 138, 'SYSTEM FAILPOINT' = 139, 'SYSTEM' = 140, 'dictGet' = 141, 'displaySecretsInShowAndSelect' = 142, 'addressToLine' = 143, 'addressToLineWithInlines' = 144, 'addressToSymbol' = 145, 'demangle' = 146, 'INTROSPECTION' = 147, 'FILE' = 148, 'URL' = 149, 'REMOTE' = 150, 'MONGO' = 151, 'REDIS' = 152, 
'MEILISEARCH' = 153, 'MYSQL' = 154, 'POSTGRES' = 155, 'SQLITE' = 156, 'ODBC' = 157, 'JDBC' = 158, 'HDFS' = 159, 'S3' = 160, 'HIVE' = 161, 'AZURE' = 162, 'SOURCES' = 163, 'CLUSTER' = 164, 'ALL' = 165, 'NONE' = 166),
`privilege` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' = 81, 'ALTER 
QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION' = 96, 'NAMED COLLECTION ADMIN' = 97, 'SYSTEM SHUTDOWN' = 98, 'SYSTEM DROP DNS CACHE' = 99, 'SYSTEM DROP MARK CACHE' = 100, 'SYSTEM DROP UNCOMPRESSED CACHE' = 101, 'SYSTEM DROP MMAP CACHE' = 102, 'SYSTEM DROP QUERY CACHE' = 103, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 104, 'SYSTEM DROP FILESYSTEM CACHE' = 105, 'SYSTEM DROP SCHEMA CACHE' = 106, 'SYSTEM DROP S3 CLIENT CACHE' = 107, 'SYSTEM DROP CACHE' = 108, 'SYSTEM RELOAD CONFIG' = 109, 'SYSTEM RELOAD USERS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH ASYNC INSERT QUEUE' = 135, 'SYSTEM FLUSH' = 136, 'SYSTEM THREAD FUZZER' = 137, 'SYSTEM UNFREEZE' = 138, 'SYSTEM FAILPOINT' = 139, 'SYSTEM LISTEN' = 140, 'SYSTEM' = 141, 'dictGet' = 142, 'displaySecretsInShowAndSelect' = 143, 'addressToLine' = 144, 'addressToLineWithInlines' = 145, 'addressToSymbol' = 146, 'demangle' = 147, 'INTROSPECTION' = 148, 'FILE' = 149, 'URL' = 150, 'REMOTE' = 151, 'MONGO' = 
152, 'REDIS' = 153, 'MEILISEARCH' = 154, 'MYSQL' = 155, 'POSTGRES' = 156, 'SQLITE' = 157, 'ODBC' = 158, 'JDBC' = 159, 'HDFS' = 160, 'S3' = 161, 'HIVE' = 162, 'AZURE' = 163, 'SOURCES' = 164, 'CLUSTER' = 165, 'ALL' = 166, 'NONE' = 167),
`aliases` Array(String),
`level` Nullable(Enum8('GLOBAL' = 0, 'DATABASE' = 1, 'TABLE' = 2, 'DICTIONARY' = 3, 'VIEW' = 4, 'COLUMN' = 5, 'NAMED_COLLECTION' = 6)),
`parent_group` Nullable(Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' 
= 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION' = 96, 'NAMED COLLECTION ADMIN' = 97, 'SYSTEM SHUTDOWN' = 98, 'SYSTEM DROP DNS CACHE' = 99, 'SYSTEM DROP MARK CACHE' = 100, 'SYSTEM DROP UNCOMPRESSED CACHE' = 101, 'SYSTEM DROP MMAP CACHE' = 102, 'SYSTEM DROP QUERY CACHE' = 103, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 104, 'SYSTEM DROP FILESYSTEM CACHE' = 105, 'SYSTEM DROP SCHEMA CACHE' = 106, 'SYSTEM DROP S3 CLIENT CACHE' = 107, 'SYSTEM DROP CACHE' = 108, 'SYSTEM RELOAD CONFIG' = 109, 'SYSTEM RELOAD USERS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH ASYNC INSERT QUEUE' = 135, 'SYSTEM FLUSH' = 136, 'SYSTEM THREAD FUZZER' = 137, 'SYSTEM UNFREEZE' = 138, 'SYSTEM FAILPOINT' = 139, 'SYSTEM' = 140, 'dictGet' = 141, 'displaySecretsInShowAndSelect' = 142, 'addressToLine' = 143, 'addressToLineWithInlines' = 144, 'addressToSymbol' = 145, 'demangle' = 146, 'INTROSPECTION' = 147, 'FILE' = 148, 'URL' = 149, 'REMOTE' = 150, 'MONGO' = 151, 'REDIS' 
= 152, 'MEILISEARCH' = 153, 'MYSQL' = 154, 'POSTGRES' = 155, 'SQLITE' = 156, 'ODBC' = 157, 'JDBC' = 158, 'HDFS' = 159, 'S3' = 160, 'HIVE' = 161, 'AZURE' = 162, 'SOURCES' = 163, 'CLUSTER' = 164, 'ALL' = 165, 'NONE' = 166))
`parent_group` Nullable(Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER NAMED COLLECTION' = 41, 'ALTER TABLE' = 42, 'ALTER DATABASE' = 43, 'ALTER VIEW REFRESH' = 44, 'ALTER VIEW MODIFY QUERY' = 45, 'ALTER VIEW' = 46, 'ALTER' = 47, 'CREATE DATABASE' = 48, 'CREATE TABLE' = 49, 'CREATE VIEW' = 50, 'CREATE DICTIONARY' = 51, 'CREATE TEMPORARY TABLE' = 52, 'CREATE ARBITRARY TEMPORARY TABLE' = 53, 'CREATE FUNCTION' = 54, 'CREATE NAMED COLLECTION' = 55, 'CREATE' = 56, 'DROP DATABASE' = 57, 'DROP TABLE' = 58, 'DROP VIEW' = 59, 'DROP DICTIONARY' = 60, 'DROP FUNCTION' = 61, 'DROP NAMED COLLECTION' = 62, 'DROP' = 63, 'UNDROP TABLE' = 64, 'TRUNCATE' = 65, 'OPTIMIZE' = 66, 'BACKUP' = 67, 'KILL QUERY' = 68, 'KILL TRANSACTION' = 69, 'MOVE PARTITION BETWEEN SHARDS' = 70, 'CREATE USER' = 71, 'ALTER USER' = 72, 'DROP USER' = 73, 'CREATE ROLE' = 74, 'ALTER ROLE' = 75, 'DROP ROLE' = 76, 'ROLE ADMIN' = 77, 'CREATE ROW POLICY' = 78, 'ALTER ROW POLICY' = 79, 'DROP ROW POLICY' = 80, 'CREATE QUOTA' 
= 81, 'ALTER QUOTA' = 82, 'DROP QUOTA' = 83, 'CREATE SETTINGS PROFILE' = 84, 'ALTER SETTINGS PROFILE' = 85, 'DROP SETTINGS PROFILE' = 86, 'SHOW USERS' = 87, 'SHOW ROLES' = 88, 'SHOW ROW POLICIES' = 89, 'SHOW QUOTAS' = 90, 'SHOW SETTINGS PROFILES' = 91, 'SHOW ACCESS' = 92, 'ACCESS MANAGEMENT' = 93, 'SHOW NAMED COLLECTIONS' = 94, 'SHOW NAMED COLLECTIONS SECRETS' = 95, 'NAMED COLLECTION' = 96, 'NAMED COLLECTION ADMIN' = 97, 'SYSTEM SHUTDOWN' = 98, 'SYSTEM DROP DNS CACHE' = 99, 'SYSTEM DROP MARK CACHE' = 100, 'SYSTEM DROP UNCOMPRESSED CACHE' = 101, 'SYSTEM DROP MMAP CACHE' = 102, 'SYSTEM DROP QUERY CACHE' = 103, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 104, 'SYSTEM DROP FILESYSTEM CACHE' = 105, 'SYSTEM DROP SCHEMA CACHE' = 106, 'SYSTEM DROP S3 CLIENT CACHE' = 107, 'SYSTEM DROP CACHE' = 108, 'SYSTEM RELOAD CONFIG' = 109, 'SYSTEM RELOAD USERS' = 110, 'SYSTEM RELOAD DICTIONARY' = 111, 'SYSTEM RELOAD MODEL' = 112, 'SYSTEM RELOAD FUNCTION' = 113, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 114, 'SYSTEM RELOAD' = 115, 'SYSTEM RESTART DISK' = 116, 'SYSTEM MERGES' = 117, 'SYSTEM TTL MERGES' = 118, 'SYSTEM FETCHES' = 119, 'SYSTEM MOVES' = 120, 'SYSTEM DISTRIBUTED SENDS' = 121, 'SYSTEM REPLICATED SENDS' = 122, 'SYSTEM SENDS' = 123, 'SYSTEM REPLICATION QUEUES' = 124, 'SYSTEM DROP REPLICA' = 125, 'SYSTEM SYNC REPLICA' = 126, 'SYSTEM RESTART REPLICA' = 127, 'SYSTEM RESTORE REPLICA' = 128, 'SYSTEM WAIT LOADING PARTS' = 129, 'SYSTEM SYNC DATABASE REPLICA' = 130, 'SYSTEM SYNC TRANSACTION LOG' = 131, 'SYSTEM SYNC FILE CACHE' = 132, 'SYSTEM FLUSH DISTRIBUTED' = 133, 'SYSTEM FLUSH LOGS' = 134, 'SYSTEM FLUSH ASYNC INSERT QUEUE' = 135, 'SYSTEM FLUSH' = 136, 'SYSTEM THREAD FUZZER' = 137, 'SYSTEM UNFREEZE' = 138, 'SYSTEM FAILPOINT' = 139, 'SYSTEM LISTEN' = 140, 'SYSTEM' = 141, 'dictGet' = 142, 'displaySecretsInShowAndSelect' = 143, 'addressToLine' = 144, 'addressToLineWithInlines' = 145, 'addressToSymbol' = 146, 'demangle' = 147, 'INTROSPECTION' = 148, 'FILE' = 149, 'URL' = 150, 'REMOTE' = 151, 
'MONGO' = 152, 'REDIS' = 153, 'MEILISEARCH' = 154, 'MYSQL' = 155, 'POSTGRES' = 156, 'SQLITE' = 157, 'ODBC' = 158, 'JDBC' = 159, 'HDFS' = 160, 'S3' = 161, 'HIVE' = 162, 'AZURE' = 163, 'SOURCES' = 164, 'CLUSTER' = 165, 'ALL' = 166, 'NONE' = 167))
)
ENGINE = SystemPrivileges
COMMENT 'SYSTEM TABLE is built on the fly.'

View File

@ -5,6 +5,8 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
# Make a pipeline report failure when any command in it fails, not only the last one.
set -o pipefail
# For each compression method: write 10 rows as Parquet, pipe them into a second
# clickhouse-local instance that parses the stream as Parquet, and print the row count.
$CLICKHOUSE_LOCAL -q "select * from numbers(10) format Parquet settings output_format_parquet_compression_method='none'" | $CLICKHOUSE_LOCAL --input-format=Parquet -q "select count() from table"
$CLICKHOUSE_LOCAL -q "select * from numbers(10) format Parquet settings output_format_parquet_compression_method='lz4'" | $CLICKHOUSE_LOCAL --input-format=Parquet -q "select count() from table"
$CLICKHOUSE_LOCAL -q "select * from numbers(10) format Parquet settings output_format_parquet_compression_method='snappy'" | $CLICKHOUSE_LOCAL --input-format=Parquet -q "select count() from table"

View File

@ -0,0 +1,21 @@
2.00000000000000000000000000000000000000000000000000000000000000000000000000000
2.12
-2.00000000000000000000000000000000000000000000000000000000000000000000000000000
-2.12
2.987600000000000033395508580724708735942840576171875000000000
2.15
-2.987600000000000033395508580724708735942840576171875000000000
-2.15
64.1230010986
64.2340000000
-64.1230010986
-64.2340000000
-32.345
32.34500000000000000000000000000000000000000000000000000000000000000000000000000
32.46
-64.5671232345
128.78932312332132985464
-128.78932312332132985464
128.78932312332132985464000000000000000000000000000000000000000000000000000000000
128.7893231233
-128.78932312332132985464123123789323123321329854600000000000000000000000000000000

View File

@ -0,0 +1,41 @@
-- Tests for toDecimalString(number, scale): renders `number` as a String with `scale` fractional digits.
-- Regular types
SELECT toDecimalString(2, 77); -- more digits required than exist
SELECT toDecimalString(2.123456, 2); -- rounding
SELECT toDecimalString(-2, 77); -- more digits required than exist
SELECT toDecimalString(-2.123456, 2); -- rounding
SELECT toDecimalString(2.9876, 60); -- more digits required than exist (60 is used because the literal is a float and 60 is the float limit)
SELECT toDecimalString(2.1456, 2); -- rounding
SELECT toDecimalString(-2.9876, 60); -- more digits required than exist
SELECT toDecimalString(-2.1456, 2); -- rounding
-- Float32 and Float64 tests. There is no point in testing large float precision -- the result would be a mess anyway.
SELECT toDecimalString(64.123::Float32, 10);
SELECT toDecimalString(64.234::Float64, 10);
SELECT toDecimalString(-64.123::Float32, 10);
SELECT toDecimalString(-64.234::Float64, 10);
-- Decimals
SELECT toDecimalString(-32.345::Decimal32(3), 3);
SELECT toDecimalString(32.345::Decimal32(3), 77); -- more digits required than exist
SELECT toDecimalString(32.456::Decimal32(3), 2); -- rounding
SELECT toDecimalString('-64.5671232345'::Decimal64(10), 10);
SELECT toDecimalString('128.78932312332132985464'::Decimal128(20), 20);
SELECT toDecimalString('-128.78932312332132985464123123'::Decimal128(26), 20); -- rounding
SELECT toDecimalString('128.78932312332132985464'::Decimal128(20), 77); -- more digits required than exist
SELECT toDecimalString('128.789323123321329854641231237893231233213298546'::Decimal256(45), 10); -- rounding
SELECT toDecimalString('-128.789323123321329854641231237893231233213298546'::Decimal256(45), 77); -- more digits required than exist
-- The maximum number of fractional digits is 77 for Int/UInt/Decimal and 60 for Float.
-- The limits themselves are exercised above and work; one digit past the limit must be rejected.
SELECT toDecimalString('32.32'::Float32, 61); -- {serverError CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER}
SELECT toDecimalString('64.64'::Float64, 61); -- {serverError CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER}
SELECT toDecimalString('88'::UInt8, 78); -- {serverError CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER}
SELECT toDecimalString('646464'::Int256, 78); -- {serverError CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER}
SELECT toDecimalString('-128.789323123321329854641231237893231233213298546'::Decimal256(45), 78); -- {serverError CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER}
-- Wrong argument types or wrong number of arguments: #52407 and similar
SELECT toDecimalString('256.256'::Decimal256(45), *); -- {serverError ILLEGAL_COLUMN}
SELECT toDecimalString('128.128'::Decimal128(30), 'str'); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT}
SELECT toDecimalString('64.64'::Decimal64(10)); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH}
SELECT toDecimalString('64.64'::Decimal64(10), 3, 3); -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH}

View File

@ -0,0 +1,55 @@
u8 Nullable(UInt8)
u16 Nullable(UInt16)
u32 Nullable(UInt32)
u64 Nullable(UInt64)
i8 Nullable(Int8)
i16 Nullable(Int16)
i32 Nullable(Int32)
i64 Nullable(Int64)
date Nullable(UInt16)
date32 Nullable(Date32)
datetime Nullable(UInt32)
datetime64 Nullable(DateTime64(3, \'UTC\'))
enum8 Nullable(Int8)
enum16 Nullable(Int16)
float32 Nullable(Float32)
float64 Nullable(Float64)
str Nullable(String)
fstr Nullable(FixedString(12))
u128 Nullable(FixedString(16))
u256 Nullable(FixedString(32))
i128 Nullable(FixedString(16))
i256 Nullable(FixedString(32))
decimal32 Nullable(Decimal(9, 3))
decimal64 Nullable(Decimal(18, 10))
decimal128 Nullable(Decimal(38, 20))
decimal256 Nullable(Decimal(76, 40))
ipv4 Nullable(UInt32)
ipv6 Nullable(FixedString(16))
0
0
0
0
1 2 1
1 2 2
1 3 3
1 1000000 1
3914219105369203805
4 1000000 1
(1000000,0,NULL,'100','299')
(1000000,0,NULL,'0','-1294970296')
(1000000,0,NULL,'-2147483296','2147481000')
(100000,900000,NULL,'100009','999999')
[(2,0,NULL,'','[]')]
1 1
0 1
16159458007063698496
16159458007063698496
BYTE_ARRAY String
FIXED_LEN_BYTE_ARRAY None
BYTE_ARRAY None
BYTE_ARRAY None
BYTE_ARRAY String
never gonna
give you
up

View File

@ -0,0 +1,168 @@
-- Tags: no-fasttest, no-parallel
-- Session settings shared by every statement in this test.
-- Turn on the custom Parquet encoder setting for all writes below.
set output_format_parquet_use_custom_encoder = 1;
-- NOTE(review): these small row-group/page/batch limits presumably force multi-page,
-- multi-row-group files even for modest inputs -- confirm.
set output_format_parquet_row_group_size = 1000;
set output_format_parquet_data_page_size = 800;
set output_format_parquet_batch_size = 100;
-- NOTE(review): presumably high enough that the byte limit never triggers unless a
-- statement overrides it (one later statement sets it to 5) -- confirm.
set output_format_parquet_row_group_size_bytes = 1000000000;
-- NOTE(review): presumably lets a repeated insert into an existing file overwrite it -- confirm.
set engine_file_truncate_on_insert=1;
-- Write random data to parquet file, then read from it and check that it matches what we wrote.
-- Do this for all kinds of data types: primitive, Nullable(primitive), Array(primitive),
-- Array(Nullable(primitive)), Array(Array(primitive)), Map(primitive, primitive), etc.
-- Basic (non-Nullable) scalar types: generate 10101 random rows, write them to Parquet, read them back.
drop table if exists basic_types_02735;
create temporary table basic_types_02735 as select * from generateRandom('
u8 UInt8,
u16 UInt16,
u32 UInt32,
u64 UInt64,
i8 Int8,
i16 Int16,
i32 Int32,
i64 Int64,
date Date,
date32 Date32,
datetime DateTime,
datetime64 DateTime64,
enum8 Enum8(''x'' = 1, ''y'' = 2, ''z'' = 3),
enum16 Enum16(''xx'' = 1000, ''yy'' = 2000, ''zz'' = 3000),
float32 Float32,
float64 Float64,
str String,
fstr FixedString(12),
u128 UInt128,
u256 UInt256,
i128 Int128,
i256 Int256,
decimal32 Decimal32(3),
decimal64 Decimal64(10),
decimal128 Decimal128(20),
decimal256 Decimal256(40),
ipv4 IPv4,
ipv6 IPv6') limit 10101;
insert into function file(basic_types_02735.parquet) select * from basic_types_02735;
-- Print the column names and types inferred when the file is read back.
desc file(basic_types_02735.parquet);
-- Difference between the per-row hash sums of the source table and of the file; 0 is expected when the data matches.
select (select sum(cityHash64(*)) from basic_types_02735) - (select sum(cityHash64(*)) from file(basic_types_02735.parquet));
drop table basic_types_02735;
-- Nullable scalar types: generate 10000 random rows, write them to Parquet, read them back.
drop table if exists nullables_02735;
create temporary table nullables_02735 as select * from generateRandom('
u16 Nullable(UInt16),
i64 Nullable(Int64),
datetime64 Nullable(DateTime64),
enum8 Nullable(Enum8(''x'' = 1, ''y'' = 2, ''z'' = 3)),
float64 Nullable(Float64),
str Nullable(String),
fstr Nullable(FixedString(12)),
i256 Nullable(Int256),
decimal256 Nullable(Decimal256(40)),
ipv6 Nullable(IPv6)') limit 10000;
insert into function file(nullables_02735.parquet) select * from nullables_02735;
-- Difference between the per-row hash sums of the source table and of the file; 0 is expected when the data matches.
select (select sum(cityHash64(*)) from nullables_02735) - (select sum(cityHash64(*)) from file(nullables_02735.parquet));
drop table nullables_02735;
-- TODO: When cityHash64() fully supports Nullable: https://github.com/ClickHouse/ClickHouse/pull/48625
-- the next two blocks can be simplified: arrays_out_02735 intermediate table is not needed,
-- a.csv and b.csv are not needed.
-- Arrays, a Map and a Tuple: generate 10000 random rows, write them to Parquet, read them back.
drop table if exists arrays_02735;
drop table if exists arrays_out_02735;
create table arrays_02735 engine = Memory as select * from generateRandom('
u32 Array(UInt32),
i8 Array(Int8),
datetime Array(DateTime),
enum16 Array(Enum16(''xx'' = 1000, ''yy'' = 2000, ''zz'' = 3000)),
float32 Array(Float32),
str Array(String),
fstr Array(FixedString(12)),
u128 Array(UInt128),
decimal64 Array(Decimal64(10)),
ipv4 Array(IPv4),
msi Map(String, Int16),
tup Tuple(FixedString(3), Array(String), Map(Int8, Date))') limit 10000;
insert into function file(arrays_02735.parquet) select * from arrays_02735;
-- Load the file into a table with the same structure as the source.
-- NOTE(review): presumably so the re-read data regains the original column types before hashing (see the TODO) -- confirm.
create temporary table arrays_out_02735 as arrays_02735;
insert into arrays_out_02735 select * from file(arrays_02735.parquet);
-- Difference between the per-row hash sums of the source and the re-loaded table; 0 is expected when the data matches.
select (select sum(cityHash64(*)) from arrays_02735) - (select sum(cityHash64(*)) from arrays_out_02735);
-- Direct comparison against the file, kept commented out until the TODO is resolved:
--select (select sum(cityHash64(*)) from arrays_02735) -
-- (select sum(cityHash64(u32, i8, datetime, enum16, float32, str, fstr, arrayMap(x->reinterpret(x, 'UInt128'), u128), decimal64, ipv4, msi, tup)) from file(arrays_02735.parquet));
drop table arrays_02735;
drop table arrays_out_02735;
-- Deeply nested types (nested arrays, LowCardinality, Map, Tuple, Nested): generate 10000 random rows,
-- write them to Parquet, then compare CSV dumps of the source table and of the file.
drop table if exists madness_02735;
create temporary table madness_02735 as select * from generateRandom('
aa Array(Array(UInt32)),
aaa Array(Array(Array(UInt32))),
an Array(Nullable(String)),
aan Array(Array(Nullable(FixedString(10)))),
l LowCardinality(String),
ln LowCardinality(Nullable(FixedString(11))),
al Array(LowCardinality(UInt128)),
aaln Array(Array(LowCardinality(Nullable(String)))),
mln Map(LowCardinality(String), Nullable(Int8)),
t Tuple(Map(FixedString(5), Tuple(Array(UInt16), Nullable(UInt16), Array(Tuple(Int8, Decimal64(10))))), Tuple(kitchen UInt64, sink String)),
n Nested(hello UInt64, world Tuple(first String, second FixedString(1)))
') limit 10000;
insert into function file(madness_02735.parquet) select * from madness_02735;
-- a.csv: the source rows in a deterministic order.
insert into function file(a.csv) select * from madness_02735 order by tuple(*);
-- b.csv: the rows read back from Parquet in the same order; elements of `al` are reinterpreted as UInt128 first.
insert into function file(b.csv) select aa, aaa, an, aan, l, ln, arrayMap(x->reinterpret(x, 'UInt128'), al) as al_, aaln, mln, t, n.hello, n.world from file(madness_02735.parquet) order by tuple(aa, aaa, an, aan, l, ln, al_, aaln, mln, t, n.hello, n.world);
-- Difference between the line-hash sums of the two CSV files; 0 is expected when they contain the same lines.
select (select sum(cityHash64(*)) from file(a.csv, LineAsString)) - (select sum(cityHash64(*)) from file(b.csv, LineAsString));
-- Direct comparison against the file, kept commented out:
--select (select sum(cityHash64(*)) from madness_02735) -
-- (select sum(cityHash64(aa, aaa, an, aan, l, ln, map(x->reinterpret(x, 'UInt128'), al), aaln, mln, t, n.hello, n.world)) from file(madness_02735.parquet));
drop table madness_02735;
-- Merging input blocks into bigger row groups.
insert into function file(squash_02735.parquet) select '012345' union all select '543210' settings max_block_size = 1;
select num_columns, num_rows, num_row_groups from file(squash_02735.parquet, ParquetMetadata);
-- Row group size limit in bytes.
insert into function file(row_group_bytes_02735.parquet) select '012345' union all select '543210' settings max_block_size = 1, output_format_parquet_row_group_size_bytes = 5;
select num_columns, num_rows, num_row_groups from file(row_group_bytes_02735.parquet, ParquetMetadata);
-- Row group size limit in rows.
insert into function file(tiny_row_groups_02735.parquet) select * from numbers(3) settings output_format_parquet_row_group_size = 1;
select num_columns, num_rows, num_row_groups from file(tiny_row_groups_02735.parquet, ParquetMetadata);
-- 1M unique 8-byte values should exceed dictionary_size_limit (1 MB).
insert into function file(big_column_chunk_02735.parquet) select number from numbers(1000000) settings output_format_parquet_row_group_size = 1000000;
select num_columns, num_rows, num_row_groups from file(big_column_chunk_02735.parquet, ParquetMetadata);
select sum(cityHash64(number)) from file(big_column_chunk_02735.parquet);
-- Check statistics: signed vs unsigned, null count. Use enough rows to produce multiple pages.
insert into function file(statistics_02735.parquet) select 100 + number%200 as a, toUInt32(number * 3000) as u, toInt32(number * 3000) as i, if(number % 10 == 9, toString(number), null) as s from numbers(1000000) settings output_format_parquet_row_group_size = 1000000;
select num_columns, num_rows, num_row_groups from file(statistics_02735.parquet, ParquetMetadata);
select tupleElement(c, 'statistics') from file(statistics_02735.parquet, ParquetMetadata) array join tupleElement(row_groups[1], 'columns') as c;
-- Statistics string length limit (max_statistics_size).
insert into function file(long_string_02735.parquet) select toString(range(number * 2000)) from numbers(2);
select tupleElement(tupleElement(row_groups[1], 'columns'), 'statistics') from file(long_string_02735.parquet, ParquetMetadata);
-- Compression setting.
insert into function file(compressed_02735.parquet) select concat('aaaaaaaaaaaaaaaa', toString(number)) as s from numbers(1000) settings output_format_parquet_row_group_size = 10000, output_format_parquet_compression_method='zstd';
select total_compressed_size < 10000, total_uncompressed_size > 15000 from file(compressed_02735.parquet, ParquetMetadata);
insert into function file(compressed_02735.parquet) select concat('aaaaaaaaaaaaaaaa', toString(number)) as s from numbers(1000) settings output_format_parquet_row_group_size = 10000, output_format_parquet_compression_method='none';
select total_compressed_size < 10000, total_uncompressed_size > 15000 from file(compressed_02735.parquet, ParquetMetadata);
-- Single-threaded encoding and Arrow encoder.
drop table if exists other_encoders_02735;
create temporary table other_encoders_02735 as select number, number*2 from numbers(10000);
insert into function file(single_thread_02735.parquet) select * from other_encoders_02735 settings max_threads = 1;
select sum(cityHash64(*)) from file(single_thread_02735.parquet);
insert into function file(arrow_02735.parquet) select * from other_encoders_02735 settings output_format_parquet_use_custom_encoder = 0;
select sum(cityHash64(*)) from file(arrow_02735.parquet);
-- String -> binary vs string; FixedString -> fixed-length-binary vs binary vs string.
insert into function file(strings1_02735.parquet) select 'never', toFixedString('gonna', 5) settings output_format_parquet_string_as_string = 1, output_format_parquet_fixed_string_as_fixed_byte_array = 1;
select columns.5, columns.6 from file(strings1_02735.parquet, ParquetMetadata) array join columns;
insert into function file(strings2_02735.parquet) select 'give', toFixedString('you', 3) settings output_format_parquet_string_as_string = 0, output_format_parquet_fixed_string_as_fixed_byte_array = 0;
select columns.5, columns.6 from file(strings2_02735.parquet, ParquetMetadata) array join columns;
insert into function file(strings3_02735.parquet) select toFixedString('up', 2) settings output_format_parquet_string_as_string = 1, output_format_parquet_fixed_string_as_fixed_byte_array = 0;
select columns.5, columns.6 from file(strings3_02735.parquet, ParquetMetadata) array join columns;
select * from file(strings1_02735.parquet);
select * from file(strings2_02735.parquet);
select * from file(strings3_02735.parquet);

View File

@ -0,0 +1,17 @@
-- Based on https://github.com/ClickHouse/ClickHouse/issues/52436
-- Test that inserts performed via Buffer table engine land into destination table.
-- { echoOn }
DROP TABLE IF EXISTS null_table;
DROP TABLE IF EXISTS null_table_buffer;
DROP TABLE IF EXISTS null_mv;
DROP VIEW IF EXISTS number_view;
CREATE TABLE null_table (number UInt64) ENGINE = Null;
CREATE VIEW number_view as SELECT * FROM numbers(10) as tb;
CREATE MATERIALIZED VIEW null_mv Engine = Log AS SELECT * FROM null_table LEFT JOIN number_view as tb USING number;
CREATE TABLE null_table_buffer (number UInt64) ENGINE = Buffer(currentDatabase(), null_table, 1, 1, 1, 100, 200, 10000, 20000);
INSERT INTO null_table_buffer VALUES (1);
SELECT sleep(3) FORMAT Null;
-- Insert about should've landed into `null_mv`
SELECT count() FROM null_mv;
1

View File

@ -0,0 +1,19 @@
-- Based on https://github.com/ClickHouse/ClickHouse/issues/52436
-- Test that inserts performed via Buffer table engine land into destination table.
-- { echoOn }
DROP TABLE IF EXISTS null_table;
DROP TABLE IF EXISTS null_table_buffer;
DROP TABLE IF EXISTS null_mv;
DROP VIEW IF EXISTS number_view;
CREATE TABLE null_table (number UInt64) ENGINE = Null;
CREATE VIEW number_view as SELECT * FROM numbers(10) as tb;
CREATE MATERIALIZED VIEW null_mv Engine = Log AS SELECT * FROM null_table LEFT JOIN number_view as tb USING number;
CREATE TABLE null_table_buffer (number UInt64) ENGINE = Buffer(currentDatabase(), null_table, 1, 1, 1, 100, 200, 10000, 20000);
INSERT INTO null_table_buffer VALUES (1);
SELECT sleep(3) FORMAT Null;
-- Insert about should've landed into `null_mv`
SELECT count() FROM null_mv;

View File

@ -0,0 +1,16 @@
-- Parameterized view: the query parameter is aliased in a WITH clause and the
-- alias is then referenced from inside an arrayCount lambda.
create view test_param_view as
with {param_test_val:UInt8} as param_test_val
select param_test_val,
arrayCount((a)->(a < param_test_val), t.arr) as cnt1
from (select [1,2,3,4,5] as arr) t;
-- With param_test_val = 3, only 1 and 2 are below the parameter, so cnt1 should be 2.
select * from test_param_view(param_test_val = 3);
-- Same shape, but the aliased parameter is referenced from two separate lambdas,
-- the second time as part of the expression param_test_val+1.
create view test_param_view2 as
with {param_test_val:UInt8} as param_test_val
select param_test_val,
arrayCount((a)->(a < param_test_val), t.arr) as cnt1,
arrayCount((a)->(a < param_test_val+1), t.arr) as cnt2
from (select [1,2,3,4,5] as arr) t;
-- With param_test_val = 3: cnt1 counts elements below 3, cnt2 counts elements below 4.
select * from test_param_view2(param_test_val = 3);

View File

@ -0,0 +1,7 @@
-- Checks that a table created with `as remote(...)` can be renamed, queried
-- under its new name, and renamed back.
drop table if exists t1;
-- Clear the rename target as well: a leftover t2 would make the rename below fail.
drop table if exists t2;
create table t1 as remote('localhost', 'system.one');
rename table t1 to t2;
-- The table must still be readable under its new name.
select * from t2;
rename table t2 to t1;
drop table t1;