Merge branch 'master' into removing-data-streams-folder

This commit is contained in:
Nikolai Kochetov 2021-10-17 10:42:37 +03:00
commit bfcbf5abe0
75 changed files with 2376 additions and 146 deletions

3
.gitmodules vendored
View File

@ -250,6 +250,9 @@
[submodule "contrib/magic_enum"]
path = contrib/magic_enum
url = https://github.com/Neargye/magic_enum
[submodule "contrib/libprotobuf-mutator"]
path = contrib/libprotobuf-mutator
url = https://github.com/google/libprotobuf-mutator
[submodule "contrib/sysroot"]
path = contrib/sysroot
url = https://github.com/ClickHouse-Extras/sysroot.git

View File

@ -136,6 +136,21 @@ if (ENABLE_FUZZING)
message (STATUS "Fuzzing instrumentation enabled")
set (FUZZER "libfuzzer")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -nostdlib++")
set (ENABLE_CLICKHOUSE_ODBC_BRIDGE OFF)
set (ENABLE_LIBRARIES 0)
set (ENABLE_SSL 1)
set (USE_INTERNAL_SSL_LIBRARY 1)
set (USE_UNWIND ON)
set (ENABLE_EMBEDDED_COMPILER 0)
set (ENABLE_EXAMPLES 0)
set (ENABLE_UTILS 0)
set (ENABLE_THINLTO 0)
set (ENABLE_TCMALLOC 0)
set (ENABLE_JEMALLOC 0)
set (ENABLE_CHECK_HEAVY_BUILDS 1)
set (GLIBC_COMPATIBILITY OFF)
set (ENABLE_PROTOBUF ON)
set (USE_INTERNAL_PROTOBUF_LIBRARY ON)
endif()
# Global libraries
@ -188,7 +203,7 @@ endif ()
option(ENABLE_TESTS "Provide unit_test_dbms target with Google.Test unit tests" ON)
option(ENABLE_EXAMPLES "Build all example programs in 'examples' subdirectories" OFF)
if (OS_LINUX AND (ARCH_AMD64 OR ARCH_AARCH64) AND NOT UNBUNDLED AND MAKE_STATIC_LIBRARIES AND NOT SPLIT_SHARED_LIBRARIES AND CMAKE_VERSION VERSION_GREATER "3.9.0")
if (OS_LINUX AND (ARCH_AMD64 OR ARCH_AARCH64) AND NOT UNBUNDLED AND MAKE_STATIC_LIBRARIES AND NOT SPLIT_SHARED_LIBRARIES AND NOT USE_MUSL)
# Only for Linux, x86_64 or aarch64.
option(GLIBC_COMPATIBILITY "Enable compatibility with older glibc libraries." ON)
elseif(GLIBC_COMPATIBILITY)
@ -203,10 +218,6 @@ if (GLIBC_COMPATIBILITY)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -include ${CMAKE_CURRENT_SOURCE_DIR}/base/glibc-compatibility/glibc-compat-2.32.h")
endif()
if (NOT CMAKE_VERSION VERSION_GREATER "3.9.0")
message (WARNING "CMake version must be greater than 3.9.0 for production builds.")
endif ()
# Make sure the final executable has symbols exported
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -rdynamic")
@ -582,6 +593,7 @@ include (cmake/find/cassandra.cmake)
include (cmake/find/sentry.cmake)
include (cmake/find/stats.cmake)
include (cmake/find/datasketches.cmake)
include (cmake/find/libprotobuf-mutator.cmake)
set (USE_INTERNAL_CITYHASH_LIBRARY ON CACHE INTERNAL "")
find_contrib_lib(cityhash)

View File

@ -5,6 +5,10 @@
#include <string.h>
#include <unistd.h>
#include <sys/select.h>
#include <sys/time.h>
#include <sys/types.h>
#ifdef OS_LINUX
/// We can detect if the code is linked with one or another readline variant, or open the library dynamically.

View File

@ -6,7 +6,7 @@
#include <base/defines.h>
#if defined(__linux__) && !defined(THREAD_SANITIZER)
#if defined(__linux__) && !defined(THREAD_SANITIZER) && !defined(USE_MUSL)
#define USE_PHDR_CACHE 1
#endif

View File

@ -0,0 +1,11 @@
option(USE_LIBPROTOBUF_MUTATOR "Enable libprotobuf-mutator" ${ENABLE_FUZZING})
if (NOT USE_LIBPROTOBUF_MUTATOR)
return()
endif()
set(LibProtobufMutator_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/libprotobuf-mutator")
if (NOT EXISTS "${LibProtobufMutator_SOURCE_DIR}/README.md")
message (ERROR "submodule contrib/libprotobuf-mutator is missing. to fix try run: \n git submodule update --init --recursive")
endif()

View File

@ -14,6 +14,8 @@ endif ()
if (OS_ANDROID)
# pthread and rt are included in libc
set (DEFAULT_LIBS "${DEFAULT_LIBS} ${BUILTINS_LIBRARY} ${COVERAGE_OPTION} -lc -lm -ldl")
elseif (USE_MUSL)
set (DEFAULT_LIBS "${DEFAULT_LIBS} ${BUILTINS_LIBRARY} ${COVERAGE_OPTION} -static -lc")
else ()
set (DEFAULT_LIBS "${DEFAULT_LIBS} ${BUILTINS_LIBRARY} ${COVERAGE_OPTION} -lc -lm -lrt -lpthread -ldl")
endif ()
@ -26,7 +28,7 @@ set(CMAKE_C_STANDARD_LIBRARIES ${DEFAULT_LIBS})
# glibc-compatibility library relies on a constant version of libc headers
# (because minor changes in function attributes between different glibc versions will introduce incompatibilities)
# This is for x86_64. For other architectures we have separate toolchains.
if (ARCH_AMD64 AND NOT_UNBUNDLED)
if (ARCH_AMD64 AND NOT_UNBUNDLED AND NOT CMAKE_CROSSCOMPILING)
set(CMAKE_C_STANDARD_INCLUDE_DIRECTORIES ${ClickHouse_SOURCE_DIR}/contrib/libc-headers/x86_64-linux-gnu ${ClickHouse_SOURCE_DIR}/contrib/libc-headers)
set(CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES ${ClickHouse_SOURCE_DIR}/contrib/libc-headers/x86_64-linux-gnu ${ClickHouse_SOURCE_DIR}/contrib/libc-headers)
endif ()
@ -37,8 +39,10 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
if (NOT OS_ANDROID)
# Our compatibility layer doesn't build under Android, many errors in musl.
add_subdirectory(base/glibc-compatibility)
if (NOT USE_MUSL)
# Our compatibility layer doesn't build under Android, many errors in musl.
add_subdirectory(base/glibc-compatibility)
endif ()
add_subdirectory(base/harmful)
endif ()

View File

@ -0,0 +1,35 @@
set (CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
set (CMAKE_SYSTEM_NAME "Linux")
set (CMAKE_SYSTEM_PROCESSOR "x86_64")
set (CMAKE_C_COMPILER_TARGET "x86_64-linux-musl")
set (CMAKE_CXX_COMPILER_TARGET "x86_64-linux-musl")
set (CMAKE_ASM_COMPILER_TARGET "x86_64-linux-musl")
set (TOOLCHAIN_PATH "${CMAKE_CURRENT_LIST_DIR}/../../contrib/sysroot/linux-x86_64-musl")
set (CMAKE_SYSROOT "${TOOLCHAIN_PATH}")
find_program (LLVM_AR_PATH NAMES "llvm-ar" "llvm-ar-13" "llvm-ar-12" "llvm-ar-11" "llvm-ar-10" "llvm-ar-9" "llvm-ar-8")
find_program (LLVM_RANLIB_PATH NAMES "llvm-ranlib" "llvm-ranlib-13" "llvm-ranlib-12" "llvm-ranlib-11" "llvm-ranlib-10" "llvm-ranlib-9")
set (CMAKE_AR "${LLVM_AR_PATH}" CACHE FILEPATH "" FORCE)
set (CMAKE_RANLIB "${LLVM_RANLIB_PATH}" CACHE FILEPATH "" FORCE)
set (CMAKE_C_FLAGS_INIT "${CMAKE_C_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}")
set (CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}")
set (CMAKE_ASM_FLAGS_INIT "${CMAKE_ASM_FLAGS} --gcc-toolchain=${TOOLCHAIN_PATH}")
set (LINKER_NAME "ld.lld" CACHE STRING "" FORCE)
set (CMAKE_EXE_LINKER_FLAGS_INIT "-fuse-ld=lld")
set (CMAKE_SHARED_LINKER_FLAGS_INIT "-fuse-ld=lld")
set (HAS_PRE_1970_EXITCODE "0" CACHE STRING "Result from TRY_RUN" FORCE)
set (HAS_PRE_1970_EXITCODE__TRYRUN_OUTPUT "" CACHE STRING "Output from TRY_RUN" FORCE)
set (HAS_POST_2038_EXITCODE "0" CACHE STRING "Result from TRY_RUN" FORCE)
set (HAS_POST_2038_EXITCODE__TRYRUN_OUTPUT "" CACHE STRING "Output from TRY_RUN" FORCE)
set (USE_MUSL 1)
add_definitions(-DUSE_MUSL=1)

View File

@ -49,6 +49,10 @@ add_subdirectory (replxx-cmake)
add_subdirectory (unixodbc-cmake)
add_subdirectory (nanodbc-cmake)
if (ENABLE_FUZZING)
add_subdirectory (libprotobuf-mutator-cmake)
endif()
if (USE_YAML_CPP)
add_subdirectory (yaml-cpp-cmake)
endif()

2
contrib/fastops vendored

@ -1 +1 @@
Subproject commit 012b777df9e2d145a24800a6c8c3d4a0249bb09e
Subproject commit 1460583af7d13c0e980ce46aec8ee9400314669a

View File

@ -18,8 +18,10 @@
* Define overrides for non-standard allocator-related functions if they are
* present on the system.
*/
#define JEMALLOC_OVERRIDE_MEMALIGN
#define JEMALLOC_OVERRIDE_VALLOC
#if !defined(USE_MUSL)
#define JEMALLOC_OVERRIDE_MEMALIGN
#define JEMALLOC_OVERRIDE_VALLOC
#endif
/*
* At least Linux omits the "const" in:

View File

@ -1,6 +1,6 @@
// OSX does not have this for system alloc functions, so you will get
// "exception specification in declaration" error.
#if defined(__APPLE__) || defined(__FreeBSD__)
#if defined(__APPLE__) || defined(__FreeBSD__) || defined(USE_MUSL)
# undef JEMALLOC_NOTHROW
# define JEMALLOC_NOTHROW

View File

@ -13,12 +13,14 @@
* Define overrides for non-standard allocator-related functions if they are
* present on the system.
*/
#define JEMALLOC_OVERRIDE___LIBC_CALLOC
#define JEMALLOC_OVERRIDE___LIBC_FREE
#define JEMALLOC_OVERRIDE___LIBC_MALLOC
#define JEMALLOC_OVERRIDE___LIBC_MEMALIGN
#define JEMALLOC_OVERRIDE___LIBC_REALLOC
#define JEMALLOC_OVERRIDE___LIBC_VALLOC
#if !defined(USE_MUSL)
#define JEMALLOC_OVERRIDE___LIBC_CALLOC
#define JEMALLOC_OVERRIDE___LIBC_FREE
#define JEMALLOC_OVERRIDE___LIBC_MALLOC
#define JEMALLOC_OVERRIDE___LIBC_MEMALIGN
#define JEMALLOC_OVERRIDE___LIBC_REALLOC
#define JEMALLOC_OVERRIDE___LIBC_VALLOC
#endif
/* #undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN */
/*

View File

@ -56,6 +56,10 @@ if (USE_UNWIND)
target_compile_definitions(cxx PUBLIC -DSTD_EXCEPTION_HAS_STACK_TRACE=1)
endif ()
if (USE_MUSL)
target_compile_definitions(cxx PUBLIC -D_LIBCPP_HAS_MUSL_LIBC=1)
endif ()
# Override the deduced attribute support that causes error.
if (OS_DARWIN AND COMPILER_GCC)
add_compile_definitions(_LIBCPP_INIT_PRIORITY_MAX)

1
contrib/libprotobuf-mutator vendored Submodule

@ -0,0 +1 @@
Subproject commit ffd86a32874e5c08a143019aad1aaf0907294c9f

View File

@ -0,0 +1,14 @@
set(LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/libprotobuf-mutator)
add_library(protobuf-mutator
${LIBRARY_DIR}/src/libfuzzer/libfuzzer_macro.cc
${LIBRARY_DIR}/src/libfuzzer/libfuzzer_mutator.cc
${LIBRARY_DIR}/src/binary_format.cc
${LIBRARY_DIR}/src/mutator.cc
${LIBRARY_DIR}/src/text_format.cc
${LIBRARY_DIR}/src/utf8_fix.cc)
target_include_directories(protobuf-mutator BEFORE PRIVATE "${LIBRARY_DIR}")
target_include_directories(protobuf-mutator BEFORE PRIVATE "${ClickHouse_SOURCE_DIR}/contrib/protobuf/src")
target_link_libraries(protobuf-mutator ${Protobuf_LIBRARY})

View File

@ -98,7 +98,9 @@
#define HAVE_BCOPY 1
/* Define to 1 if you have the <bits/types.h> header file. */
#define HAVE_BITS_TYPES_H 1
#if !defined(USE_MUSL)
#define HAVE_BITS_TYPES_H 1
#endif
/* Define to 1 if you have the `chroot' function. */
#define HAVE_CHROOT 1

2
contrib/sysroot vendored

@ -1 +1 @@
Subproject commit 002415524b5d14124bb8a61a3ce7ac65774f5479
Subproject commit e4663925b73beb57dd29154844c8d50441146753

View File

@ -47,13 +47,17 @@ then
fi
URL="https://builds.clickhouse.com/master/${DIR}/clickhouse"
echo
echo "Will download ${URL}"
echo
curl -O "${URL}" && chmod a+x clickhouse &&
echo
echo "Successfully downloaded the ClickHouse binary, you can run it as:
./clickhouse"
if [ "${OS}" = "Linux" ]
then
echo
echo "You can also install it:
sudo ./clickhouse install"
fi

View File

@ -10,7 +10,7 @@ Columns:
- `[]` — All users share the same quota.
- `['user_name']` — Connections with the same user name share the same quota.
- `['ip_address']` — Connections from the same IP share the same quota.
- `['client_key']` — Connections with the same key share the same quota. A key must be explicitly provided by a client. When using [clickhouse-client](../../interfaces/cli.md), pass a key value in the `--quota-key` parameter, or use the `quota_key` parameter in the client configuration file. When using HTTP interface, use the `X-ClickHouse-Quota` header.
- `['client_key']` — Connections with the same key share the same quota. A key must be explicitly provided by a client. When using [clickhouse-client](../../interfaces/cli.md), pass a key value in the `--quota_key` parameter, or use the `quota_key` parameter in the client configuration file. When using HTTP interface, use the `X-ClickHouse-Quota` header.
- `['user_name', 'client_key']` — Connections with the same `client_key` share the same quota. If a key isn't provided by a client, the quota is tracked for `user_name`.
- `['client_key', 'ip_address']` — Connections with the same `client_key` share the same quota. If a key isn't provided by a client, the quota is tracked for `ip_address`.
- `durations` ([Array](../../sql-reference/data-types/array.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Time interval lengths in seconds.
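For illustration, a quota keyed by `client_key` could be created with DDL roughly like the sketch below; the quota name, limit, interval, and target role are hypothetical and not taken from this page.
``` sql
-- Hypothetical example: all connections presenting the same quota key
-- share this limit of 100 queries per hour.
CREATE QUOTA shared_by_client_key
    KEYED BY client_key
    FOR INTERVAL 1 hour MAX queries = 100
    TO default;
```
A client then supplies the key, for example via `clickhouse-client --quota_key <key>` or the `X-ClickHouse-Quota` HTTP header, and all connections presenting the same key draw from the same quota.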

View File

@ -155,6 +155,60 @@ Configuration example:
LAYOUT(COMPLEX_KEY_HASHED())
```
### complex_key_sparse_hashed {#complex-key-sparse-hashed}
This type of storage is for use with composite [keys](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md). Similar to `sparse_hashed`.
Configuration example:
``` xml
<layout>
<complex_key_sparse_hashed />
</layout>
```
``` sql
LAYOUT(COMPLEX_KEY_SPARSE_HASHED())
```
### hashed_array {#dicts-external_dicts_dict_layout-hashed-array}
The dictionary is completely stored in memory. Each attribute is stored in an array. The key attribute is stored in the form of a hash table, where the value is an index into the attributes array. The dictionary can contain any number of elements with any identifiers. In practice, the number of keys can reach tens of millions of items.
All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety.
Configuration example:
``` xml
<layout>
<hashed_array>
</hashed_array>
</layout>
```
or
``` sql
LAYOUT(HASHED_ARRAY())
```
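For illustration, a complete dictionary DDL using this layout might look like the sketch below; the dictionary name, columns, source table, and lifetime values are hypothetical.
``` sql
-- Hypothetical example of a dictionary stored with the hashed_array layout.
CREATE DICTIONARY hashed_array_dict_example
(
    id UInt64,
    value String
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(TABLE 'source_table'))
LAYOUT(HASHED_ARRAY())
LIFETIME(MIN 300 MAX 360);
```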
### complex_key_hashed_array {#complex-key-hashed-array}
This type of storage is for use with composite [keys](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md). Similar to `hashed_array`.
Configuration example:
``` xml
<layout>
<complex_key_hashed_array />
</layout>
```
``` sql
LAYOUT(COMPLEX_KEY_HASHED_ARRAY())
```
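A composite-key variant could be declared similarly; in this hedged sketch the multi-column `PRIMARY KEY` is what makes the key complex, and all names are hypothetical.
``` sql
-- Hypothetical example: a composite (String, UInt64) key with the
-- complex_key_hashed_array layout.
CREATE DICTIONARY complex_key_hashed_array_dict_example
(
    country String,
    id UInt64,
    value String
)
PRIMARY KEY country, id
SOURCE(CLICKHOUSE(TABLE 'source_table'))
LAYOUT(COMPLEX_KEY_HASHED_ARRAY())
LIFETIME(MIN 300 MAX 360);
```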
### range_hashed {#range-hashed}
The dictionary is stored in memory in the form of a hash table with an ordered array of ranges and their corresponding values.

View File

@ -11,7 +11,7 @@
- `[]` — All users share the same quota.
- `['user_name']` — Connections with the same user name share the same quota.
- `['ip_address']` — Connections from the same IP address share the same quota.
- `['client_key']` — Connections with the same key share the same quota. The key must be explicitly provided by the client. When using [clickhouse-client](../../interfaces/cli.md), pass the key value in the `--quota-key` parameter, or use the `quota_key` parameter in the client configuration file. When using the HTTP interface, use the `X-ClickHouse-Quota` header.
- `['client_key']` — Connections with the same key share the same quota. The key must be explicitly provided by the client. When using [clickhouse-client](../../interfaces/cli.md), pass the key value in the `--quota_key` parameter, or use the `quota_key` parameter in the client configuration file. When using the HTTP interface, use the `X-ClickHouse-Quota` header.
- `['user_name', 'client_key']` — Connections with the same key share the same quota. If the key is not provided by the client, the quota is tracked for `user_name`.
- `['client_key', 'ip_address']` — Connections with the same key share the same quota. If the key is not provided by the client, the quota is tracked for `ip_address`.
- `durations` ([Array](../../sql-reference/data-types/array.md)([UInt64](../../sql-reference/data-types/int-uint.md))) — Lengths of the time intervals used to calculate resource consumption, in seconds.

View File

@ -21,8 +21,6 @@
- [`sumMap`](../../sql-reference/aggregate-functions/reference/summap.md#agg_functions-summap)
- [`minMap`](../../sql-reference/aggregate-functions/reference/minmap.md#agg_functions-minmap)
- [`maxMap`](../../sql-reference/aggregate-functions/reference/maxmap.md#agg_functions-maxmap)
- [`argMin`](../../sql-reference/aggregate-functions/reference/argmin.md)
- [`argMax`](../../sql-reference/aggregate-functions/reference/argmax.md)
!!! note "Note"
Values of `SimpleAggregateFunction(func, Type)` are displayed and stored the same way as `Type`, so the [-Merge](../../sql-reference/aggregate-functions/combinators.md#aggregate_functions_combinators-merge) and [-State](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-state) combinators are not required.
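For example, a column of this type is declared and queried like a plain `Type` column, as in the sketch below; the table and column names are hypothetical.
``` sql
-- Hypothetical example: `value` is stored as a plain Double and summed
-- for rows with the same ORDER BY key when parts are merged.
CREATE TABLE simple_example
(
    id UInt64,
    value SimpleAggregateFunction(sum, Double)
)
ENGINE = AggregatingMergeTree
ORDER BY id;
```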

View File

@ -516,6 +516,7 @@ void ClientBase::receiveResult(ASTPtr parsed_query)
const size_t poll_interval
= std::max(min_poll_interval, std::min<size_t>(receive_timeout.totalMicroseconds(), default_poll_interval));
bool break_on_timeout = connection->getConnectionType() != IServerConnection::Type::LOCAL;
while (true)
{
Stopwatch receive_watch(CLOCK_MONOTONIC_COARSE);
@ -546,7 +547,7 @@ void ClientBase::receiveResult(ASTPtr parsed_query)
else
{
double elapsed = receive_watch.elapsedSeconds();
if (elapsed > receive_timeout.totalSeconds())
if (break_on_timeout && elapsed > receive_timeout.totalSeconds())
{
std::cout << "Timeout exceeded while receiving data from server."
<< " Waited for " << static_cast<size_t>(elapsed) << " seconds,"

View File

@ -60,6 +60,8 @@ public:
~Connection() override;
IServerConnection::Type getConnectionType() const override { return IServerConnection::Type::SERVER; }
static ServerConnectionPtr createConnection(const ConnectionParameters & parameters, ContextPtr context);
/// Set throttler of network traffic. One throttler could be used for multiple connections to limit total traffic.

View File

@ -56,6 +56,14 @@ class IServerConnection : boost::noncopyable
public:
virtual ~IServerConnection() = default;
enum class Type
{
SERVER,
LOCAL
};
virtual Type getConnectionType() const = 0;
virtual void setDefaultDatabase(const String & database) = 0;
virtual void getServerVersion(

View File

@ -60,15 +60,15 @@ void LocalConnection::updateProgress(const Progress & value)
void LocalConnection::sendQuery(
const ConnectionTimeouts &,
const String & query_,
const String & query_id_,
UInt64,
const String & query,
const String & query_id,
UInt64 stage,
const Settings *,
const ClientInfo *,
bool)
{
query_context = session.makeQueryContext();
query_context->setCurrentQueryId(query_id_);
query_context->setCurrentQueryId(query_id);
if (send_progress)
query_context->setProgressCallback([this] (const Progress & value) { return this->updateProgress(value); });
@ -77,8 +77,9 @@ void LocalConnection::sendQuery(
state.reset();
state.emplace();
state->query_id = query_id_;
state->query = query_;
state->query_id = query_id;
state->query = query;
state->stage = QueryProcessingStage::Enum(stage);
if (send_progress)
state->after_send_progress.restart();

View File

@ -56,6 +56,8 @@ public:
~LocalConnection() override;
IServerConnection::Type getConnectionType() const override { return IServerConnection::Type::LOCAL; }
static ServerConnectionPtr createConnection(const ConnectionParameters & connection_parameters, ContextPtr current_context, bool send_progress = false);
void setDefaultDatabase(const String & database) override;
@ -76,7 +78,7 @@ public:
void sendQuery(
const ConnectionTimeouts & timeouts,
const String & query,
const String & query_id_/* = "" */,
const String & query_id/* = "" */,
UInt64 stage/* = QueryProcessingStage::Complete */,
const Settings * settings/* = nullptr */,
const ClientInfo * client_info/* = nullptr */,

View File

@ -124,11 +124,13 @@ QueryProfilerBase<ProfilerImpl>::QueryProfilerBase(const UInt64 thread_id, const
sev.sigev_notify = SIGEV_THREAD_ID;
sev.sigev_signo = pause_signal;
# if defined(OS_FREEBSD)
#if defined(OS_FREEBSD)
sev._sigev_un._threadid = thread_id;
# else
#elif defined(USE_MUSL)
sev.sigev_notify_thread_id = thread_id;
#else
sev._sigev_un._tid = thread_id;
# endif
#endif
if (timer_create(clock_type, &sev, &timer_id))
{
/// In Google Cloud Run, the function "timer_create" is implemented incorrectly as of 2020-01-25.

View File

@ -17,7 +17,9 @@ extern "C"
void *aligned_alloc(size_t alignment, size_t size);
void *valloc(size_t size);
void *memalign(size_t alignment, size_t size);
#if !defined(USE_MUSL)
void *pvalloc(size_t size);
#endif
}
#pragma GCC diagnostic pop
@ -39,6 +41,8 @@ static void dummyFunctionForInterposing()
ignore(aligned_alloc(0, 0)); // -V575 NOLINT
ignore(valloc(0)); // -V575 NOLINT
ignore(memalign(0, 0)); // -V575 NOLINT
#if !defined(USE_MUSL)
ignore(pvalloc(0)); // -V575 NOLINT
#endif
}
#endif

View File

@ -10,6 +10,7 @@ if (CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE" OR CMAKE_BUILD_TYPE_UC STREQUAL "RELW
set_source_files_properties(
FlatDictionary.cpp
HashedDictionary.cpp
HashedArrayDictionary.cpp
CacheDictionary.cpp
RangeHashedDictionary.cpp
DirectDictionary.cpp

View File

@ -0,0 +1,691 @@
#include "HashedArrayDictionary.h"
#include <Core/Defines.h>
#include <DataTypes/DataTypesDecimal.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnNullable.h>
#include <Functions/FunctionHelpers.h>
#include <Dictionaries/DictionarySource.h>
#include <Dictionaries/DictionaryFactory.h>
#include <Dictionaries/HierarchyDictionariesUtils.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int DICTIONARY_IS_EMPTY;
extern const int UNSUPPORTED_METHOD;
}
template <DictionaryKeyType dictionary_key_type>
HashedArrayDictionary<dictionary_key_type>::HashedArrayDictionary(
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
const HashedArrayDictionaryStorageConfiguration & configuration_,
BlockPtr update_field_loaded_block_)
: IDictionary(dict_id_)
, dict_struct(dict_struct_)
, source_ptr(std::move(source_ptr_))
, configuration(configuration_)
, update_field_loaded_block(std::move(update_field_loaded_block_))
{
createAttributes();
loadData();
calculateBytesAllocated();
}
template <DictionaryKeyType dictionary_key_type>
ColumnPtr HashedArrayDictionary<dictionary_key_type>::getColumn(
const std::string & attribute_name,
const DataTypePtr & result_type,
const Columns & key_columns,
const DataTypes & key_types [[maybe_unused]],
const ColumnPtr & default_values_column) const
{
if (dictionary_key_type == DictionaryKeyType::Complex)
dict_struct.validateKeyTypes(key_types);
ColumnPtr result;
DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
DictionaryKeysExtractor<dictionary_key_type> extractor(key_columns, arena_holder.getComplexKeyArena());
const size_t size = extractor.getKeysSize();
const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type);
const size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second;
auto & attribute = attributes[attribute_index];
bool is_attribute_nullable = attribute.is_index_null.has_value();
ColumnUInt8::MutablePtr col_null_map_to;
ColumnUInt8::Container * vec_null_map_to = nullptr;
if (attribute.is_index_null)
{
col_null_map_to = ColumnUInt8::create(size, false);
vec_null_map_to = &col_null_map_to->getData();
}
auto type_call = [&](const auto & dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
using ValueType = DictionaryValueType<AttributeType>;
using ColumnProvider = DictionaryAttributeColumnProvider<AttributeType>;
DictionaryDefaultValueExtractor<AttributeType> default_value_extractor(dictionary_attribute.null_value, default_values_column);
auto column = ColumnProvider::getColumn(dictionary_attribute, size);
if constexpr (std::is_same_v<ValueType, Array>)
{
auto * out = column.get();
getItemsImpl<ValueType, false>(
attribute,
extractor,
[&](const size_t, const Array & value, bool) { out->insert(value); },
default_value_extractor);
}
else if constexpr (std::is_same_v<ValueType, StringRef>)
{
auto * out = column.get();
if (is_attribute_nullable)
getItemsImpl<ValueType, true>(
attribute,
extractor,
[&](size_t row, const StringRef value, bool is_null)
{
(*vec_null_map_to)[row] = is_null;
out->insertData(value.data, value.size);
},
default_value_extractor);
else
getItemsImpl<ValueType, false>(
attribute,
extractor,
[&](size_t, const StringRef value, bool) { out->insertData(value.data, value.size); },
default_value_extractor);
}
else
{
auto & out = column->getData();
if (is_attribute_nullable)
getItemsImpl<ValueType, true>(
attribute,
extractor,
[&](size_t row, const auto value, bool is_null)
{
(*vec_null_map_to)[row] = is_null;
out[row] = value;
},
default_value_extractor);
else
getItemsImpl<ValueType, false>(
attribute,
extractor,
[&](size_t row, const auto value, bool) { out[row] = value; },
default_value_extractor);
}
result = std::move(column);
};
callOnDictionaryAttributeType(attribute.type, type_call);
if (is_attribute_nullable)
result = ColumnNullable::create(std::move(result), std::move(col_null_map_to));
return result;
}
template <DictionaryKeyType dictionary_key_type>
ColumnUInt8::Ptr HashedArrayDictionary<dictionary_key_type>::hasKeys(const Columns & key_columns, const DataTypes & key_types) const
{
if (dictionary_key_type == DictionaryKeyType::Complex)
dict_struct.validateKeyTypes(key_types);
DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
DictionaryKeysExtractor<dictionary_key_type> extractor(key_columns, arena_holder.getComplexKeyArena());
size_t keys_size = extractor.getKeysSize();
auto result = ColumnUInt8::create(keys_size, false);
auto & out = result->getData();
if (attributes.empty())
{
query_count.fetch_add(keys_size, std::memory_order_relaxed);
return result;
}
size_t keys_found = 0;
for (size_t requested_key_index = 0; requested_key_index < keys_size; ++requested_key_index)
{
auto requested_key = extractor.extractCurrentKey();
out[requested_key_index] = key_attribute.container.find(requested_key) != key_attribute.container.end();
keys_found += out[requested_key_index];
extractor.rollbackCurrentKey();
}
query_count.fetch_add(keys_size, std::memory_order_relaxed);
found_count.fetch_add(keys_found, std::memory_order_relaxed);
return result;
}
template <DictionaryKeyType dictionary_key_type>
ColumnPtr HashedArrayDictionary<dictionary_key_type>::getHierarchy(ColumnPtr key_column [[maybe_unused]], const DataTypePtr &) const
{
if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
{
PaddedPODArray<UInt64> keys_backup_storage;
const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage);
size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
const auto & dictionary_attribute = dict_struct.attributes[hierarchical_attribute_index];
const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
const auto & key_attribute_container = key_attribute.container;
const UInt64 null_value = dictionary_attribute.null_value.template get<UInt64>();
const AttributeContainerType<UInt64> & parent_keys_container = std::get<AttributeContainerType<UInt64>>(hierarchical_attribute.container);
auto is_key_valid_func = [&](auto & key) { return key_attribute_container.find(key) != key_attribute_container.end(); };
size_t keys_found = 0;
auto get_parent_func = [&](auto & hierarchy_key)
{
std::optional<UInt64> result;
auto it = key_attribute_container.find(hierarchy_key);
if (it != key_attribute_container.end())
result = parent_keys_container[it->getMapped()];
keys_found += result.has_value();
return result;
};
auto dictionary_hierarchy_array = getKeysHierarchyArray(keys, null_value, is_key_valid_func, get_parent_func);
query_count.fetch_add(keys.size(), std::memory_order_relaxed);
found_count.fetch_add(keys_found, std::memory_order_relaxed);
return dictionary_hierarchy_array;
}
else
{
return nullptr;
}
}
template <DictionaryKeyType dictionary_key_type>
ColumnUInt8::Ptr HashedArrayDictionary<dictionary_key_type>::isInHierarchy(
ColumnPtr key_column [[maybe_unused]],
ColumnPtr in_key_column [[maybe_unused]],
const DataTypePtr &) const
{
if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
{
PaddedPODArray<UInt64> keys_backup_storage;
const auto & keys = getColumnVectorData(this, key_column, keys_backup_storage);
PaddedPODArray<UInt64> keys_in_backup_storage;
const auto & keys_in = getColumnVectorData(this, in_key_column, keys_in_backup_storage);
size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
const auto & dictionary_attribute = dict_struct.attributes[hierarchical_attribute_index];
auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
const auto & key_attribute_container = key_attribute.container;
const UInt64 null_value = dictionary_attribute.null_value.template get<UInt64>();
const AttributeContainerType<UInt64> & parent_keys_container = std::get<AttributeContainerType<UInt64>>(hierarchical_attribute.container);
auto is_key_valid_func = [&](auto & key) { return key_attribute_container.find(key) != key_attribute_container.end(); };
size_t keys_found = 0;
auto get_parent_func = [&](auto & hierarchy_key)
{
std::optional<UInt64> result;
auto it = key_attribute_container.find(hierarchy_key);
if (it != key_attribute_container.end())
result = parent_keys_container[it->getMapped()];
keys_found += result.has_value();
return result;
};
auto result = getKeysIsInHierarchyColumn(keys, keys_in, null_value, is_key_valid_func, get_parent_func);
query_count.fetch_add(keys.size(), std::memory_order_relaxed);
found_count.fetch_add(keys_found, std::memory_order_relaxed);
return result;
}
else
{
return nullptr;
}
}
template <DictionaryKeyType dictionary_key_type>
ColumnPtr HashedArrayDictionary<dictionary_key_type>::getDescendants(
ColumnPtr key_column [[maybe_unused]],
const DataTypePtr &,
size_t level [[maybe_unused]]) const
{
if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
{
PaddedPODArray<UInt64> keys_backup;
const auto & keys = getColumnVectorData(this, key_column, keys_backup);
size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
const AttributeContainerType<UInt64> & parent_keys_container = std::get<AttributeContainerType<UInt64>>(hierarchical_attribute.container);
const auto & key_attribute_container = key_attribute.container;
HashMap<size_t, UInt64> index_to_key;
index_to_key.reserve(key_attribute.container.size());
for (auto & [key, value] : key_attribute_container)
index_to_key[value] = key;
HashMap<UInt64, PaddedPODArray<UInt64>> parent_to_child;
for (size_t i = 0; i < parent_keys_container.size(); ++i)
{
const auto * it = index_to_key.find(i);
if (it == index_to_key.end())
continue;
auto parent_key = it->getMapped();
auto child_key = parent_keys_container[i];
parent_to_child[parent_key].emplace_back(child_key);
}
size_t keys_found = 0;
auto result = getKeysDescendantsArray(keys, parent_to_child, level, keys_found);
query_count.fetch_add(keys.size(), std::memory_order_relaxed);
found_count.fetch_add(keys_found, std::memory_order_relaxed);
return result;
}
else
{
return nullptr;
}
}
template <DictionaryKeyType dictionary_key_type>
void HashedArrayDictionary<dictionary_key_type>::createAttributes()
{
const auto size = dict_struct.attributes.size();
attributes.reserve(size);
for (const auto & dictionary_attribute : dict_struct.attributes)
{
auto type_call = [&, this](const auto & dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
using ValueType = DictionaryValueType<AttributeType>;
auto is_index_null = dictionary_attribute.is_nullable ? std::make_optional<std::vector<bool>>() : std::optional<std::vector<bool>>{};
std::unique_ptr<Arena> string_arena = std::is_same_v<AttributeType, String> ? std::make_unique<Arena>() : nullptr;
Attribute attribute{dictionary_attribute.underlying_type, AttributeContainerType<ValueType>(), std::move(is_index_null), std::move(string_arena)};
attributes.emplace_back(std::move(attribute));
};
callOnDictionaryAttributeType(dictionary_attribute.underlying_type, type_call);
}
}
template <DictionaryKeyType dictionary_key_type>
void HashedArrayDictionary<dictionary_key_type>::updateData()
{
if (!update_field_loaded_block || update_field_loaded_block->rows() == 0)
{
QueryPipeline pipeline(source_ptr->loadUpdatedAll());
PullingPipelineExecutor executor(pipeline);
Block block;
while (executor.pull(block))
{
/// We are using this to keep saved data if input stream consists of multiple blocks
if (!update_field_loaded_block)
update_field_loaded_block = std::make_shared<DB::Block>(block.cloneEmpty());
for (size_t attribute_index = 0; attribute_index < block.columns(); ++attribute_index)
{
const IColumn & update_column = *block.getByPosition(attribute_index).column.get();
MutableColumnPtr saved_column = update_field_loaded_block->getByPosition(attribute_index).column->assumeMutable();
saved_column->insertRangeFrom(update_column, 0, update_column.size());
}
}
}
else
{
auto pipe = source_ptr->loadUpdatedAll();
mergeBlockWithPipe<dictionary_key_type>(
dict_struct.getKeysSize(),
*update_field_loaded_block,
std::move(pipe));
}
if (update_field_loaded_block)
{
resize(update_field_loaded_block->rows());
blockToAttributes(*update_field_loaded_block.get());
}
}
template <DictionaryKeyType dictionary_key_type>
void HashedArrayDictionary<dictionary_key_type>::blockToAttributes(const Block & block [[maybe_unused]])
{
size_t skip_keys_size_offset = dict_struct.getKeysSize();
Columns key_columns;
key_columns.reserve(skip_keys_size_offset);
/// Split into keys columns and attribute columns
for (size_t i = 0; i < skip_keys_size_offset; ++i)
key_columns.emplace_back(block.safeGetByPosition(i).column);
DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
DictionaryKeysExtractor<dictionary_key_type> keys_extractor(key_columns, arena_holder.getComplexKeyArena());
const size_t keys_size = keys_extractor.getKeysSize();
Field column_value_to_insert;
for (size_t key_index = 0; key_index < keys_size; ++key_index)
{
auto key = keys_extractor.extractCurrentKey();
auto it = key_attribute.container.find(key);
if (it != key_attribute.container.end())
{
keys_extractor.rollbackCurrentKey();
continue;
}
if constexpr (std::is_same_v<KeyType, StringRef>)
key = copyKeyInArena(key);
key_attribute.container.insert({key, element_count});
for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index)
{
const IColumn & attribute_column = *block.safeGetByPosition(skip_keys_size_offset + attribute_index).column;
auto & attribute = attributes[attribute_index];
bool attribute_is_nullable = attribute.is_index_null.has_value();
attribute_column.get(key_index, column_value_to_insert);
auto type_call = [&](const auto & dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
using AttributeValueType = DictionaryValueType<AttributeType>;
auto & attribute_container = std::get<AttributeContainerType<AttributeValueType>>(attribute.container);
attribute_container.emplace_back();
if (attribute_is_nullable)
{
attribute.is_index_null->emplace_back();
if (column_value_to_insert.isNull())
{
(*attribute.is_index_null).back() = true;
return;
}
}
if constexpr (std::is_same_v<AttributeValueType, StringRef>)
{
String & value_to_insert = column_value_to_insert.get<String>();
size_t value_to_insert_size = value_to_insert.size();
const char * string_in_arena = attribute.string_arena->insert(value_to_insert.data(), value_to_insert_size);
StringRef string_in_arena_reference = StringRef{string_in_arena, value_to_insert_size};
attribute_container.back() = string_in_arena_reference;
}
else
{
auto value_to_insert = column_value_to_insert.get<NearestFieldType<AttributeValueType>>();
attribute_container.back() = value_to_insert;
}
};
callOnDictionaryAttributeType(attribute.type, type_call);
}
++element_count;
keys_extractor.rollbackCurrentKey();
}
}
template <DictionaryKeyType dictionary_key_type>
void HashedArrayDictionary<dictionary_key_type>::resize(size_t added_rows)
{
if (unlikely(!added_rows))
return;
key_attribute.container.reserve(added_rows);
}
template <DictionaryKeyType dictionary_key_type>
template <typename AttributeType, bool is_nullable, typename ValueSetter, typename DefaultValueExtractor>
void HashedArrayDictionary<dictionary_key_type>::getItemsImpl(
const Attribute & attribute,
DictionaryKeysExtractor<dictionary_key_type> & keys_extractor,
ValueSetter && set_value [[maybe_unused]],
DefaultValueExtractor & default_value_extractor) const
{
const auto & key_attribute_container = key_attribute.container;
const auto & attribute_container = std::get<AttributeContainerType<AttributeType>>(attribute.container);
const size_t keys_size = keys_extractor.getKeysSize();
size_t keys_found = 0;
for (size_t key_index = 0; key_index < keys_size; ++key_index)
{
auto key = keys_extractor.extractCurrentKey();
const auto it = key_attribute_container.find(key);
if (it != key_attribute_container.end())
{
size_t element_index = it->getMapped();
const auto & element = attribute_container[element_index];
if constexpr (is_nullable)
set_value(key_index, element, (*attribute.is_index_null)[element_index]);
else
set_value(key_index, element, false);
++keys_found;
}
else
{
if constexpr (is_nullable)
set_value(key_index, default_value_extractor[key_index], default_value_extractor.isNullAt(key_index));
else
set_value(key_index, default_value_extractor[key_index], false);
}
keys_extractor.rollbackCurrentKey();
}
query_count.fetch_add(keys_size, std::memory_order_relaxed);
found_count.fetch_add(keys_found, std::memory_order_relaxed);
}
template <DictionaryKeyType dictionary_key_type>
StringRef HashedArrayDictionary<dictionary_key_type>::copyKeyInArena(StringRef key)
{
size_t key_size = key.size;
char * place_for_key = complex_key_arena.alloc(key_size);
memcpy(reinterpret_cast<void *>(place_for_key), reinterpret_cast<const void *>(key.data), key_size);
StringRef updated_key{place_for_key, key_size};
return updated_key;
}
template <DictionaryKeyType dictionary_key_type>
void HashedArrayDictionary<dictionary_key_type>::loadData()
{
if (!source_ptr->hasUpdateField())
{
QueryPipeline pipeline;
pipeline = QueryPipeline(source_ptr->loadAll());
PullingPipelineExecutor executor(pipeline);
Block block;
while (executor.pull(block))
{
resize(block.rows());
blockToAttributes(block);
}
}
else
{
updateData();
}
if (configuration.require_nonempty && 0 == element_count)
throw Exception(ErrorCodes::DICTIONARY_IS_EMPTY,
"{}: dictionary source is empty and 'require_nonempty' property is set.",
full_name);
}
template <DictionaryKeyType dictionary_key_type>
void HashedArrayDictionary<dictionary_key_type>::calculateBytesAllocated()
{
bytes_allocated += attributes.size() * sizeof(attributes.front());
bytes_allocated += key_attribute.container.size();
for (auto & attribute : attributes)
{
auto type_call = [&](const auto & dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
using ValueType = DictionaryValueType<AttributeType>;
const auto & container = std::get<AttributeContainerType<ValueType>>(attribute.container);
bytes_allocated += sizeof(AttributeContainerType<ValueType>);
if constexpr (std::is_same_v<ValueType, Array>)
{
/// This is not an accurate calculation.
bytes_allocated += sizeof(Array) * container.size();
}
else
{
bytes_allocated += container.allocated_bytes();
}
bucket_count = container.capacity();
if constexpr (std::is_same_v<ValueType, StringRef>)
bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
};
callOnDictionaryAttributeType(attribute.type, type_call);
if (attribute.string_arena)
bytes_allocated += attribute.string_arena->size();
if (attribute.is_index_null.has_value())
bytes_allocated += (*attribute.is_index_null).size();
}
bytes_allocated += complex_key_arena.size();
if (update_field_loaded_block)
bytes_allocated += update_field_loaded_block->allocatedBytes();
}
template <DictionaryKeyType dictionary_key_type>
Pipe HashedArrayDictionary<dictionary_key_type>::read(const Names & column_names, size_t max_block_size) const
{
PaddedPODArray<HashedArrayDictionary::KeyType> keys;
keys.reserve(key_attribute.container.size());
for (auto & [key, _] : key_attribute.container)
keys.emplace_back(key);
return Pipe(std::make_shared<DictionarySource>(DictionarySourceData(shared_from_this(), std::move(keys), column_names), max_block_size));
}
template class HashedArrayDictionary<DictionaryKeyType::Simple>;
template class HashedArrayDictionary<DictionaryKeyType::Complex>;
void registerDictionaryArrayHashed(DictionaryFactory & factory)
{
auto create_layout = [](const std::string & full_name,
const DictionaryStructure & dict_struct,
const Poco::Util::AbstractConfiguration & config,
const std::string & config_prefix,
DictionarySourcePtr source_ptr,
DictionaryKeyType dictionary_key_type) -> DictionaryPtr
{
if (dictionary_key_type == DictionaryKeyType::Simple && dict_struct.key)
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'key' is not supported for simple key hashed array dictionary");
else if (dictionary_key_type == DictionaryKeyType::Complex && dict_struct.id)
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'id' is not supported for complex key hashed array dictionary");
if (dict_struct.range_min || dict_struct.range_max)
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"{}: elements .structure.range_min and .structure.range_max should be defined only "
"for a dictionary of layout 'range_hashed'",
full_name);
const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix);
const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"};
const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false);
HashedArrayDictionaryStorageConfiguration configuration{require_nonempty, dict_lifetime};
if (dictionary_key_type == DictionaryKeyType::Simple)
return std::make_unique<HashedArrayDictionary<DictionaryKeyType::Simple>>(dict_id, dict_struct, std::move(source_ptr), configuration);
else
return std::make_unique<HashedArrayDictionary<DictionaryKeyType::Complex>>(dict_id, dict_struct, std::move(source_ptr), configuration);
};
using namespace std::placeholders;
factory.registerLayout("hashed_array",
[=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e, ContextPtr /* global_context */, bool /*created_from_ddl*/){ return create_layout(a, b, c, d, std::move(e), DictionaryKeyType::Simple); }, false);
factory.registerLayout("complex_key_hashed_array",
[=](auto && a, auto && b, auto && c, auto && d, DictionarySourcePtr e, ContextPtr /* global_context */, bool /*created_from_ddl*/){ return create_layout(a, b, c, d, std::move(e), DictionaryKeyType::Complex); }, true);
}
}

View File

@ -0,0 +1,211 @@
#pragma once
#include <atomic>
#include <memory>
#include <variant>
#include <optional>
#include <Common/SparseHashMap.h>
#include <Common/HashTable/HashMap.h>
#include <Common/HashTable/HashSet.h>
#include <Core/Block.h>
#include <Dictionaries/DictionaryStructure.h>
#include <Dictionaries/IDictionary.h>
#include <Dictionaries/IDictionarySource.h>
#include <Dictionaries/DictionaryHelpers.h>
/** This dictionary stores all attributes in arrays.
* The key is stored in a hash table whose mapped value is an index into the attribute arrays.
*/
namespace DB
{
struct HashedArrayDictionaryStorageConfiguration
{
const bool require_nonempty;
const DictionaryLifetime lifetime;
};
template <DictionaryKeyType dictionary_key_type>
class HashedArrayDictionary final : public IDictionary
{
public:
using KeyType = std::conditional_t<dictionary_key_type == DictionaryKeyType::Simple, UInt64, StringRef>;
HashedArrayDictionary(
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
const HashedArrayDictionaryStorageConfiguration & configuration_,
BlockPtr update_field_loaded_block_ = nullptr);
std::string getTypeName() const override
{
if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
return "HashedArray";
else
return "ComplexHashedArray";
}
size_t getBytesAllocated() const override { return bytes_allocated; }
size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); }
double getFoundRate() const override
{
size_t queries = query_count.load(std::memory_order_relaxed);
if (!queries)
return 0;
return static_cast<double>(found_count.load(std::memory_order_relaxed)) / queries;
}
double getHitRate() const override { return 1.0; }
size_t getElementCount() const override { return element_count; }
double getLoadFactor() const override { return static_cast<double>(element_count) / bucket_count; }
std::shared_ptr<const IExternalLoadable> clone() const override
{
return std::make_shared<HashedArrayDictionary<dictionary_key_type>>(getDictionaryID(), dict_struct, source_ptr->clone(), configuration, update_field_loaded_block);
}
const IDictionarySource * getSource() const override { return source_ptr.get(); }
const DictionaryLifetime & getLifetime() const override { return configuration.lifetime; }
const DictionaryStructure & getStructure() const override { return dict_struct; }
bool isInjective(const std::string & attribute_name) const override
{
return dict_struct.getAttribute(attribute_name).injective;
}
DictionaryKeyType getKeyType() const override { return dictionary_key_type; }
ColumnPtr getColumn(
const std::string& attribute_name,
const DataTypePtr & result_type,
const Columns & key_columns,
const DataTypes & key_types,
const ColumnPtr & default_values_column) const override;
ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override;
bool hasHierarchy() const override { return dictionary_key_type == DictionaryKeyType::Simple && dict_struct.hierarchical_attribute_index.has_value(); }
ColumnPtr getHierarchy(ColumnPtr key_column, const DataTypePtr & hierarchy_attribute_type) const override;
ColumnUInt8::Ptr isInHierarchy(
ColumnPtr key_column,
ColumnPtr in_key_column,
const DataTypePtr & key_type) const override;
ColumnPtr getDescendants(
ColumnPtr key_column,
const DataTypePtr & key_type,
size_t level) const override;
Pipe read(const Names & column_names, size_t max_block_size) const override;
private:
using KeyContainerType = std::conditional_t<
dictionary_key_type == DictionaryKeyType::Simple,
HashMap<UInt64, size_t>,
HashMapWithSavedHash<StringRef, size_t, DefaultHash<StringRef>>>;
template <typename Value>
using AttributeContainerType = std::conditional_t<std::is_same_v<Value, Array>, std::vector<Value>, PaddedPODArray<Value>>;
struct Attribute final
{
AttributeUnderlyingType type;
std::variant<
AttributeContainerType<UInt8>,
AttributeContainerType<UInt16>,
AttributeContainerType<UInt32>,
AttributeContainerType<UInt64>,
AttributeContainerType<UInt128>,
AttributeContainerType<UInt256>,
AttributeContainerType<Int8>,
AttributeContainerType<Int16>,
AttributeContainerType<Int32>,
AttributeContainerType<Int64>,
AttributeContainerType<Int128>,
AttributeContainerType<Int256>,
AttributeContainerType<Decimal32>,
AttributeContainerType<Decimal64>,
AttributeContainerType<Decimal128>,
AttributeContainerType<Decimal256>,
AttributeContainerType<Float32>,
AttributeContainerType<Float64>,
AttributeContainerType<UUID>,
AttributeContainerType<StringRef>,
AttributeContainerType<Array>>
container;
std::optional<std::vector<bool>> is_index_null;
std::unique_ptr<Arena> string_arena;
};
struct KeyAttribute final
{
KeyContainerType container;
};
void createAttributes();
void blockToAttributes(const Block & block);
void updateData();
void loadData();
void calculateBytesAllocated();
template <typename AttributeType, bool is_nullable, typename ValueSetter, typename DefaultValueExtractor>
void getItemsImpl(
const Attribute & attribute,
DictionaryKeysExtractor<dictionary_key_type> & keys_extractor,
ValueSetter && set_value,
DefaultValueExtractor & default_value_extractor) const;
template <typename GetContainerFunc>
void getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func);
template <typename GetContainerFunc>
void getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func) const;
void resize(size_t added_rows);
StringRef copyKeyInArena(StringRef key);
const DictionaryStructure dict_struct;
const DictionarySourcePtr source_ptr;
const HashedArrayDictionaryStorageConfiguration configuration;
std::vector<Attribute> attributes;
KeyAttribute key_attribute;
size_t bytes_allocated = 0;
size_t element_count = 0;
size_t bucket_count = 0;
mutable std::atomic<size_t> query_count{0};
mutable std::atomic<size_t> found_count{0};
BlockPtr update_field_loaded_block;
Arena complex_key_arena;
};
extern template class HashedArrayDictionary<DictionaryKeyType::Simple>;
extern template class HashedArrayDictionary<DictionaryKeyType::Complex>;
}

View File

@ -733,8 +733,18 @@ void registerDictionaryHashed(DictionaryFactory & factory)
const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"};
const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false);
const std::string & layout_prefix = sparse ? ".layout.sparse_hashed" : ".layout.hashed";
const bool preallocate = config.getBool(config_prefix + layout_prefix + ".preallocate", false);
std::string dictionary_layout_name;
if (dictionary_key_type == DictionaryKeyType::Simple)
dictionary_layout_name = "hashed";
else
dictionary_layout_name = "complex_key_hashed";
if (sparse)
dictionary_layout_name = "sparse_" + dictionary_layout_name;
const std::string dictionary_layout_prefix = ".layout." + dictionary_layout_name;
const bool preallocate = config.getBool(config_prefix + dictionary_layout_prefix + ".preallocate", false);
HashedDictionaryStorageConfiguration configuration{preallocate, require_nonempty, dict_lifetime};

View File

@ -18,6 +18,7 @@
#include <Common/MemorySanitizer.h>
#include <Common/HashTable/HashMap.h>
#include <IO/AIO.h>
#include <IO/BufferWithOwnMemory.h>
#include <Dictionaries/DictionaryStructure.h>
#include <Dictionaries/ICacheDictionaryStorage.h>
#include <Dictionaries/DictionaryHelpers.h>

View File

@ -28,6 +28,7 @@ void registerDictionaryComplexKeyHashed(DictionaryFactory & factory);
void registerDictionaryTrie(DictionaryFactory & factory);
void registerDictionaryFlat(DictionaryFactory & factory);
void registerDictionaryHashed(DictionaryFactory & factory);
void registerDictionaryArrayHashed(DictionaryFactory & factory);
void registerDictionaryCache(DictionaryFactory & factory);
void registerDictionaryPolygon(DictionaryFactory & factory);
void registerDictionaryDirect(DictionaryFactory & factory);
@ -60,6 +61,7 @@ void registerDictionaries()
registerDictionaryTrie(factory);
registerDictionaryFlat(factory);
registerDictionaryHashed(factory);
registerDictionaryArrayHashed(factory);
registerDictionaryCache(factory);
registerDictionaryPolygon(factory);
registerDictionaryDirect(factory);

View File

@ -13,6 +13,7 @@
#include <Processors/Formats/Impl/ParallelFormattingOutputFormat.h>
#include <Poco/URI.h>
#include <IO/BufferWithOwnMemory.h>
#include <IO/ReadHelpers.h>
namespace DB

View File

@ -1,9 +1,9 @@
#pragma once
#include <Common/Allocator.h>
#include <Columns/IColumn.h>
#include <Formats/FormatSettings.h>
#include <Interpreters/Context_fwd.h>
#include <IO/BufferWithOwnMemory.h>
#include <base/types.h>
#include <boost/noncopyable.hpp>
@ -34,6 +34,9 @@ struct RowOutputFormatParams;
using InputFormatPtr = std::shared_ptr<IInputFormat>;
using OutputFormatPtr = std::shared_ptr<IOutputFormat>;
template <typename Allocator>
struct Memory;
FormatSettings getFormatSettings(ContextPtr context);
template <typename T>
@ -55,7 +58,7 @@ public:
*/
using FileSegmentationEngine = std::function<std::pair<bool, size_t>(
ReadBuffer & buf,
DB::Memory<> & memory,
DB::Memory<Allocator<false>> & memory,
size_t min_chunk_bytes)>;
/// This callback allows to perform some additional actions after writing a single row.

View File

@ -1,4 +1,5 @@
#include <IO/ReadHelpers.h>
#include <Formats/JSONEachRowUtils.h>
#include <base/find_symbols.h>
namespace DB

View File

@ -1,5 +1,9 @@
#pragma once
#include <IO/BufferWithOwnMemory.h>
#include <IO/ReadBuffer.h>
#include <utility>
namespace DB
{

View File

@ -124,8 +124,8 @@ public:
*/
struct Instruction
{
const IColumn * condition = nullptr;
const IColumn * source = nullptr;
IColumn::Ptr condition = nullptr;
IColumn::Ptr source = nullptr;
bool condition_always_true = false;
bool condition_is_nullable = false;
@ -160,15 +160,15 @@ public:
}
else
{
const ColumnWithTypeAndName & cond_col = arguments[i];
IColumn::Ptr cond_col = arguments[i].column->convertToFullColumnIfLowCardinality();
/// We skip branches that are always false.
/// If we encounter a branch that is always true, we can finish.
if (cond_col.column->onlyNull())
if (cond_col->onlyNull())
continue;
if (const auto * column_const = checkAndGetColumn<ColumnConst>(*cond_col.column))
if (const auto * column_const = checkAndGetColumn<ColumnConst>(*cond_col))
{
Field value = column_const->getField();
@ -181,26 +181,24 @@ public:
}
else
{
if (isColumnNullable(*cond_col.column))
instruction.condition_is_nullable = true;
instruction.condition = cond_col.column.get();
instruction.condition = cond_col;
instruction.condition_is_nullable = instruction.condition->isNullable();
}
instruction.condition_is_short = cond_col.column->size() < arguments[0].column->size();
instruction.condition_is_short = cond_col->size() < arguments[0].column->size();
}
const ColumnWithTypeAndName & source_col = arguments[source_idx];
instruction.source_is_short = source_col.column->size() < arguments[0].column->size();
if (source_col.type->equals(*return_type))
{
instruction.source = source_col.column.get();
instruction.source = source_col.column;
}
else
{
/// Cast all columns to result type.
converted_columns_holder.emplace_back(castColumn(source_col, return_type));
instruction.source = converted_columns_holder.back().get();
instruction.source = converted_columns_holder.back();
}
if (instruction.source && isColumnConst(*instruction.source))

View File

@ -7,17 +7,6 @@
#include <cstring>
#include <cassert>
#if defined(__OpenBSD__) || defined(__FreeBSD__) || defined (__ANDROID__)
# include <sys/endian.h>
#elif defined(__sun)
# include <endian.h>
#elif defined(__APPLE__)
# include <libkern/OSByteOrder.h>
# define htobe64(x) OSSwapHostToBigInt64(x)
# define be64toh(x) OSSwapBigToHostInt64(x)
#endif
namespace DB
{
@ -152,7 +141,7 @@ private:
memcpy(&tmp_buffer, source_current, bytes_to_read);
source_current += bytes_to_read;
tmp_buffer = be64toh(tmp_buffer);
tmp_buffer = __builtin_bswap64(tmp_buffer);
bits_buffer |= BufferType(tmp_buffer) << ((sizeof(BufferType) - sizeof(tmp_buffer)) * 8 - bits_count);
bits_count += static_cast<UInt8>(bytes_to_read) * 8;
@ -200,7 +189,7 @@ public:
capacity = BIT_BUFFER_SIZE - bits_count;
}
// write low bits of value as high bits of bits_buffer
// write low bits of value as high bits of bits_buffer
const UInt64 mask = maskLowBits<UInt64>(bits_to_write);
BufferType v = value & mask;
v <<= capacity - bits_to_write;
@ -212,7 +201,7 @@ public:
// flush contents of bits_buffer to the dest_current, partial bytes are completed with zeroes.
inline void flush()
{
bits_count = (bits_count + 8 - 1) & ~(8 - 1); // align UP to 8-bytes, so doFlush will write ALL data from bits_buffer
bits_count = (bits_count + 8 - 1) & ~(8 - 1); // align up to 8-bytes, so doFlush will write all data from bits_buffer
while (bits_count != 0)
doFlush();
}
@ -231,13 +220,12 @@ private:
if (available < to_write)
{
throw Exception("Can not write past end of buffer. Space available "
+ std::to_string(available) + " bytes, required to write: "
+ std::to_string(to_write) + ".",
ErrorCodes::CANNOT_WRITE_AFTER_END_OF_BUFFER);
throw Exception(ErrorCodes::CANNOT_WRITE_AFTER_END_OF_BUFFER,
"Can not write past end of buffer. Space available {} bytes, required to write {} bytes.",
available, to_write);
}
const auto tmp_buffer = htobe64(static_cast<UInt64>(bits_buffer >> (sizeof(bits_buffer) - sizeof(UInt64)) * 8));
const auto tmp_buffer = __builtin_bswap64(static_cast<UInt64>(bits_buffer >> (sizeof(bits_buffer) - sizeof(UInt64)) * 8));
memcpy(dest_current, &tmp_buffer, to_write);
dest_current += to_write;

View File

@ -6,6 +6,7 @@
#include <Formats/FormatSettings.h>
#include <IO/WriteHelpers.h>
#include <IO/WriteBufferFromString.h>
#include <IO/BufferWithOwnMemory.h>
#include <IO/readFloatText.h>
#include <IO/Operators.h>
#include <base/find_symbols.h>
@ -1120,7 +1121,7 @@ void skipToUnescapedNextLineOrEOF(ReadBuffer & buf)
}
}
void saveUpToPosition(ReadBuffer & in, DB::Memory<> & memory, char * current)
void saveUpToPosition(ReadBuffer & in, Memory<> & memory, char * current)
{
assert(current >= in.position());
assert(current <= in.buffer().end());
@ -1140,7 +1141,7 @@ void saveUpToPosition(ReadBuffer & in, DB::Memory<> & memory, char * current)
in.position() = current;
}
bool loadAtPosition(ReadBuffer & in, DB::Memory<> & memory, char * & current)
bool loadAtPosition(ReadBuffer & in, Memory<> & memory, char * & current)
{
assert(current <= in.buffer().end());

View File

@ -19,6 +19,7 @@
#include <Core/DecimalFunctions.h>
#include <Core/UUID.h>
#include <Common/Allocator.h>
#include <Common/Exception.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/Arena.h>
@ -29,7 +30,6 @@
#include <IO/CompressionMethod.h>
#include <IO/ReadBuffer.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/BufferWithOwnMemory.h>
#include <IO/VarInt.h>
#include <DataTypes/DataTypeDateTime.h>
@ -41,6 +41,9 @@ static constexpr auto DEFAULT_MAX_STRING_SIZE = 1_GiB;
namespace DB
{
template <typename Allocator>
struct Memory;
namespace ErrorCodes
{
extern const int CANNOT_PARSE_DATE;
@ -1290,7 +1293,7 @@ void skipToUnescapedNextLineOrEOF(ReadBuffer & buf);
/** This function just copies the data from buffer's internal position (in.position())
* to current position (from arguments) into memory.
*/
void saveUpToPosition(ReadBuffer & in, Memory<> & memory, char * current);
void saveUpToPosition(ReadBuffer & in, Memory<Allocator<false>> & memory, char * current);
/** This function is the opposite of eof().
* In fact it returns whether the data was loaded into the ReadBuffer's internal buffer or not.
@ -1299,7 +1302,7 @@ void saveUpToPosition(ReadBuffer & in, Memory<> & memory, char * current);
* of our buffer and the current cursor in the end of the buffer. When we call eof() it calls next().
* And this function can fill the buffer with new data, so we will lose the data from previous buffer state.
*/
bool loadAtPosition(ReadBuffer & in, Memory<> & memory, char * & current);
bool loadAtPosition(ReadBuffer & in, Memory<Allocator<false>> & memory, char * & current);
struct PcgDeserializer

View File

@ -6,3 +6,6 @@ target_link_libraries(select_parser_fuzzer PRIVATE clickhouse_parsers ${LIB_FUZZ
add_executable(create_parser_fuzzer create_parser_fuzzer.cpp ${SRCS})
target_link_libraries(create_parser_fuzzer PRIVATE clickhouse_parsers ${LIB_FUZZING_ENGINE})
add_subdirectory(codegen_fuzzer)

View File

@ -0,0 +1,48 @@
find_package(Protobuf REQUIRED)
set (CURRENT_DIR_IN_SOURCES "${ClickHouse_SOURCE_DIR}/src/Parsers/fuzzers/codegen_fuzzer")
set (CURRENT_DIR_IN_BINARY "${ClickHouse_BINARY_DIR}/src/Parsers/fuzzers/codegen_fuzzer")
# Copy scripts and template file to build directory to generate .proto and .cpp file from them
configure_file(
"${CURRENT_DIR_IN_SOURCES}/gen.py"
"${CURRENT_DIR_IN_BINARY}/gen.py"
COPYONLY)
configure_file(
"${CURRENT_DIR_IN_SOURCES}/update.sh"
"${CURRENT_DIR_IN_BINARY}/update.sh"
COPYONLY)
configure_file(
"${CURRENT_DIR_IN_SOURCES}/clickhouse-template.g"
"${CURRENT_DIR_IN_BINARY}/clickhouse-template.g"
COPYONLY)
# Note that it depends on all.dict file!
add_custom_command(
OUTPUT
"${CURRENT_DIR_IN_BINARY}/clickhouse.g"
COMMAND ./update.sh "${ClickHouse_SOURCE_DIR}/tests/fuzz/all.dict"
)
add_custom_command(
OUTPUT
"${CURRENT_DIR_IN_BINARY}/out.cpp"
"${CURRENT_DIR_IN_BINARY}/out.proto"
COMMAND python3 gen.py clickhouse.g out.cpp out.proto
DEPENDS "${CURRENT_DIR_IN_BINARY}/clickhouse.g"
)
PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS "${CURRENT_DIR_IN_BINARY}/out.proto")
set(FUZZER_SRCS codegen_select_fuzzer.cpp "${CURRENT_DIR_IN_BINARY}/out.cpp" ${PROTO_SRCS} ${PROTO_HDRS})
set(CMAKE_INCLUDE_CURRENT_DIR TRUE)
add_executable(codegen_select_fuzzer ${FUZZER_SRCS})
set_source_files_properties("${PROTO_SRCS}" "out.cpp" PROPERTIES COMPILE_FLAGS "-Wno-reserved-identifier")
target_include_directories(codegen_select_fuzzer BEFORE PRIVATE "${Protobuf_INCLUDE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}")
target_include_directories(codegen_select_fuzzer BEFORE PRIVATE "${LibProtobufMutator_SOURCE_DIR}")
target_include_directories(codegen_select_fuzzer BEFORE PRIVATE "${LibProtobufMutator_SOURCE_DIR}/src")
target_link_libraries(codegen_select_fuzzer PRIVATE protobuf-mutator dbms ${LIB_FUZZING_ENGINE})

View File

@ -0,0 +1,121 @@
" ";
" ";
" ";
";";
"(" $1 ")";
"(" $1 ", " $2 ")";
"(" $1 ", " $2 ", " $3 ")";
$1 ", " $2 ;
$1 ", " $2 ", " $3 ;
$1 ", " $2 ", " $3 ", " $4 ;
$1 ", " $2 ", " $3 ", " $4 ", " $5 ;
"[" $1 ", " $2 "]";
"[" $1 ", " $2 ", " $3 "]";
"[" $1 ", " $2 ", " $3 ", " $4 "]";
"[" $1 ", " $2 ", " $3 ", " $4 ", " $5 "]";
$0 "(" $1 ")";
$0 "(" $1 ", " $2 ")";
$0 "(" $1 ", " $2 ", " $3 ")";
$1 " as " $2 ;
// TODO: add more ClickHouse-specific stuff
"SELECT " $1 " FROM " $2 " WHERE " $3 ;
"SELECT " $1 " FROM " $2 " GROUP BY " $3 ;
"SELECT " $1 " FROM " $2 " SORT BY " $3 ;
"SELECT " $1 " FROM " $2 " LIMIT " $3 ;
"SELECT " $1 " FROM " $2 " JOIN " $3 ;
"SELECT " $1 " FROM " $2 " ARRAY JOIN " $3 ;
"SELECT " $1 " FROM " $2 " JOIN " $3 " ON " $4 ;
"SELECT " $1 " FROM " $2 " JOIN " $3 " USING " $5 ;
"SELECT " $1 " INTO OUTFILE " $2 ;
"WITH " $1 " AS " $2 ;
"{" $1 ":" $2 "}";
"[" $1 "," $2 "]";
"[]";
" x ";
"x";
" `x` ";
"`x`";
" \"value\" ";
"\"value\"";
" 0 ";
"0";
"1";
"2";
"123123123123123123";
"182374019873401982734091873420923123123123123123";
"1e-1";
"1.1";
"\"\"";
" '../../../../../../../../../etc/passwd' ";
"/";
"=";
"==";
"!=";
"<>";
"<";
"<=";
">";
">=";
"<<";
"|<<";
"&";
"|";
"||";
"<|";
"|>";
"+";
"-";
"~";
"*";
"/";
"\\";
"%";
"";
".";
",";
",";
",";
",";
",";
",";
"(";
")";
"(";
")";
"(";
")";
"(";
")";
"(";
")";
"(";
")";
"?";
":";
"@";
"@@";
"$";
"\"";
"`";
"{";
"}";
"^";
"::";
"->";
"]";
"[";

View File

@ -0,0 +1,40 @@
#include <iostream>
#include <string>
#include <IO/WriteBufferFromOStream.h>
#include <Parsers/ParserQueryWithOutput.h>
#include <Parsers/parseQuery.h>
#include <Parsers/formatAST.h>
#include <libfuzzer/libfuzzer_macro.h>
#include "out.pb.h"
void GenerateSentence(const Sentence&, std::string &, int);
DEFINE_BINARY_PROTO_FUZZER(const Sentence& main)
{
static std::string input;
input.reserve(4096);
GenerateSentence(main, input, 0);
if (input.size())
{
std::cout << input << std::endl;
DB::ParserQueryWithOutput parser(input.data() + input.size());
try
{
DB::ASTPtr ast = parseQuery(parser, input.data(), input.data() + input.size(), "", 0, 0);
DB::WriteBufferFromOStream out(std::cerr, 4096);
DB::formatAST(*ast, out);
std::cerr << std::endl;
}
catch (...) {}
input.clear();
}
}

View File

@ -0,0 +1,248 @@
#!/usr/bin/env python3
import sys
import string
TOKEN_TEXT = 1
TOKEN_VAR = 2
TOKEN_COLON = ':'
TOKEN_SEMI = ';'
TOKEN_OR = '|'
TOKEN_QUESTIONMARK = '?'
TOKEN_ROUND_BRACKET_OPEN = '('
TOKEN_ROUND_BRACKET_CLOSE = ')'
TOKEN_ASTERISK = '*'
TOKEN_SLASH = '/'
class TextValue:
def __init__(self, t):
self.t = t
self.slug = None
def get_slug(self):
if self.slug is not None:
return self.slug
slug = ''
for c in self.t:
slug += c if c in string.ascii_letters else '_'
self.slug = slug
return slug
def get_name(self):
return f"TextValue_{self.get_slug()}"
def __repr__(self):
return f"TextValue(\"{self.t}\")"
class Var:
def __init__(self, id_):
self.id_ = id_
def __repr__(self):
return f"Var({self.id_})"
class Parser:
def __init__(self):
self.chains = []
self.text = None
self.col = 0
self.line = 1
self.t = None
self.var_id = -1
self.cur_tok = None
self.includes = []
self.proto = ''
self.cpp = ''
def parse_file(self, filename):
with open(filename) as f:
self.text = f.read()
while self.parse_statement() is not None:
pass
def add_include(self, filename):
self.includes.append(filename)
def get_next_token(self):
self.skip_ws()
if not len(self.text):
return None
if self.text[0] == '"':
return self.parse_txt_value()
if self.text[0] == '$':
return self.parse_var_value()
c, self.text = self.text[0], self.text[1:]
self.cur_tok = c
return c
def parse_var_value(self):
i = self.text.find(' ')
id_, self.text = self.text[1:i], self.text[i+1:]
self.var_id = int(id_)
self.cur_tok = TOKEN_VAR
return TOKEN_VAR
def parse_txt_value(self):
if self.text[0] != '"':
raise Exception("parse_txt_value: expected quote at the start")
self.t = ''
self.text = self.text[1:]
while self.text[0] != '"':
if self.text[0] == '\\':
if self.text[1] == 'x':
self.t += self.text[:4]
self.text = self.text[4:]
elif self.text[1] in 'nt\\"':
self.t += self.text[:2]
self.text = self.text[2:]
else:
raise Exception(f"parse_txt_value: unknown symbol {self.text[0]}")
else:
c, self.text = self.text[0], self.text[1:]
self.t += c
self.text = self.text[1:]
self.cur_tok = TOKEN_TEXT
return TOKEN_TEXT
def skip_ws(self):
while self.text and self.text[0] in string.whitespace:
if self.text[0] == '\n':
self.line += 1
self.col = 0
self.text = self.text[1:]
self.col += 1
if not self.text:
return None
return True
def skip_line(self):
self.line += 1
index = self.text.find('\n')
self.text = self.text[index:]
def parse_statement(self):
if self.skip_ws() is None:
return None
self.get_next_token()
if self.cur_tok == TOKEN_SLASH:
self.skip_line()
return TOKEN_SLASH
chain = []
while self.cur_tok != TOKEN_SEMI:
if self.cur_tok == TOKEN_TEXT:
chain.append(TextValue(self.t))
elif self.cur_tok == TOKEN_VAR:
chain.append(Var(self.var_id))
else:
self.fatal_parsing_error(f"unexpected token {self.cur_tok}")
self.get_next_token()
if not chain:
self.fatal_parsing_error("empty chains are not allowed")
self.chains.append(chain)
return True
def generate(self):
self.proto = 'syntax = "proto3";\n\n'
self.cpp = '#include <iostream>\n#include <string>\n#include <vector>\n\n#include <libfuzzer/libfuzzer_macro.h>\n\n'
for incl_file in self.includes:
self.cpp += f'#include "{incl_file}"\n'
self.cpp += '\n'
self.proto += 'message Word {\n'
self.proto += '\tenum Value {\n'
self.cpp += 'void GenerateWord(const Word&, std::string&, int);\n\n'
self.cpp += 'void GenerateSentence(const Sentence& stc, std::string &s, int depth) {\n'
self.cpp += '\tfor (int i = 0; i < stc.words_size(); i++ ) {\n'
self.cpp += '\t\tGenerateWord(stc.words(i), s, ++depth);\n'
self.cpp += '\t}\n'
self.cpp += '}\n'
self.cpp += 'void GenerateWord(const Word& word, std::string &s, int depth) {\n'
self.cpp += '\tif (depth > 5) return;\n\n'
self.cpp += '\tswitch (word.value()) {\n'
for idx, chain in enumerate(self.chains):
self.proto += f'\t\tvalue_{idx} = {idx};\n'
self.cpp += f'\t\tcase {idx}: {{\n'
num_var = 0
for item in chain:
if isinstance(item, TextValue):
self.cpp += f'\t\t\ts += "{item.t}";\n'
elif isinstance(item, Var):
self.cpp += f'\t\t\tif (word.inner().words_size() > {num_var})\t\t\t\tGenerateWord(word.inner().words({num_var}), s, ++depth);\n'
num_var += 1
else:
raise Exception("unknown token met during generation")
self.cpp += '\t\t\tbreak;\n\t\t}\n'
self.cpp += '\t\tdefault: break;\n'
self.cpp += '\t}\n'
self.proto += '\t}\n'
self.proto += '\tValue value = 1;\n'
self.proto += '\tSentence inner = 2;\n'
self.proto += '}\nmessage Sentence {\n\trepeated Word words = 1;\n}'
self.cpp += '}\n'
return self.cpp, self.proto
def fatal_parsing_error(self, msg):
print(f"Line: {self.line}, Col: {self.col}")
raise Exception(f"fatal error during parsing. {msg}")
def main(args):
input_file, outfile_cpp, outfile_proto = args
if not outfile_proto.endswith('.proto'):
raise Exception("outfile_proto (argv[3]) should end with `.proto`")
include_filename = outfile_proto[:-6] + ".pb.h"
p = Parser()
p.add_include(include_filename)
p.parse_file(input_file)
cpp, proto = p.generate()
proto = proto.replace('\t', ' ' * 4)
cpp = cpp.replace('\t', ' ' * 4)
with open(outfile_cpp, 'w') as f:
f.write(cpp)
with open(outfile_proto, 'w') as f:
f.write(proto)
if __name__ == '__main__':
if len(sys.argv) != 4:
print(f"Usage {sys.argv[0]} <input_file> <outfile.cpp> <outfile.proto>")
sys.exit(1)
main(sys.argv[1:])
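
As a worked illustration of what the generator above emits (the index and whitespace are illustrative), a chain such as "SELECT " $1 " FROM " $2 " WHERE " $3 ; from clickhouse-template.g becomes one case of the generated GenerateWord switch in out.cpp:

// Hypothetical excerpt of the generated out.cpp for the chain above.
case 42: {
    s += "SELECT ";
    if (word.inner().words_size() > 0)
        GenerateWord(word.inner().words(0), s, ++depth);
    s += " FROM ";
    if (word.inner().words_size() > 1)
        GenerateWord(word.inner().words(1), s, ++depth);
    s += " WHERE ";
    if (word.inner().words_size() > 2)
        GenerateWord(word.inner().words(2), s, ++depth);
    break;
}

The accompanying out.proto gets one enum value per chain inside message Word, plus "Value value = 1;" and "Sentence inner = 2;", and a message Sentence with "repeated Word words = 1;", as produced by generate() above.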

View File

@ -0,0 +1,30 @@
#!/bin/bash
_main() {
local dict_filename="${1}"
if [[ $# -ne 1 ]];
then
echo "Usage: $0 <dict_filename>";
exit 1;
fi
if [[ ! -f $dict_filename ]];
then
echo "File $dict_filename doesn't exist";
exit 1
fi
cat clickhouse-template.g > clickhouse.g
while read line;
do
[[ -z "$line" ]] && continue
echo $line | sed -e '/^#/d' -e 's/"\(.*\)"/" \1 ";/g'
done < $dict_filename >> clickhouse.g
}
_main "$@"
# Sample run: ./update.sh ${CLICKHOUSE_SOURCE_DIR}/tests/fuzz/all.dict
# then run `python ./gen.py clickhouse.g out.cpp out.proto` to generate new files with tokens. Rebuild fuzzer

View File

@ -1,4 +1,5 @@
#include <IO/ReadHelpers.h>
#include <IO/BufferWithOwnMemory.h>
#include <IO/Operators.h>
#include <Formats/verbosePrintString.h>

View File

@ -1,5 +1,6 @@
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromString.h>
#include <IO/BufferWithOwnMemory.h>
#include <IO/Operators.h>
#include <Processors/Formats/Impl/TabSeparatedRowInputFormat.h>

View File

@ -10,6 +10,7 @@
#include <Storages/MergeTree/IMergeTreeDataPart.h>
#include <Storages/MergeTree/MergeTreeSequentialSource.h>
#include <Storages/MergeTree/FutureMergedMutatedPart.h>
#include <Storages/MergeTree/MergeTreeDataMergerMutator.h>
#include <Processors/Transforms/ExpressionTransform.h>
#include <Processors/Transforms/MaterializingTransform.h>
#include <Processors/Merges/MergingSortedTransform.h>
@ -116,11 +117,23 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
}
ctx->disk = global_ctx->space_reservation->getDisk();
auto local_new_part_relative_tmp_path_name = local_tmp_prefix + global_ctx->future_part->name + local_tmp_suffix;
auto local_new_part_tmp_path = global_ctx->data->relative_data_path + local_new_part_relative_tmp_path_name + "/";
String local_part_path = global_ctx->data->relative_data_path;
String local_tmp_part_basename = local_tmp_prefix + global_ctx->future_part->name + (global_ctx->parent_part ? ".proj" : "");
String local_new_part_tmp_path = local_part_path + local_tmp_part_basename + "/";
if (ctx->disk->exists(local_new_part_tmp_path))
throw Exception("Directory " + fullPath(ctx->disk, local_new_part_tmp_path) + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS);
{
std::lock_guard lock(global_ctx->mutator->tmp_parts_lock);
global_ctx->mutator->tmp_parts.emplace(local_tmp_part_basename);
}
SCOPE_EXIT(
std::lock_guard lock(global_ctx->mutator->tmp_parts_lock);
global_ctx->mutator->tmp_parts.erase(local_tmp_part_basename);
);
global_ctx->all_column_names = global_ctx->metadata_snapshot->getColumns().getNamesOfPhysical();
global_ctx->storage_columns = global_ctx->metadata_snapshot->getColumns().getAllPhysical();
@ -141,7 +154,7 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
global_ctx->future_part->type,
global_ctx->future_part->part_info,
local_single_disk_volume,
local_new_part_relative_tmp_path_name,
local_tmp_part_basename,
global_ctx->parent_part);
global_ctx->new_data_part->uuid = global_ctx->future_part->uuid;
@ -560,6 +573,7 @@ bool MergeTask::MergeProjectionsStage::mergeMinMaxIndexAndPrepareProjections() c
global_ctx->new_data_part.get(),
".proj",
global_ctx->data,
global_ctx->mutator,
global_ctx->merges_blocker,
global_ctx->ttl_merges_blocker));
}

View File

@ -60,6 +60,7 @@ public:
const IMergeTreeDataPart * parent_part_,
String suffix_,
MergeTreeData * data_,
MergeTreeDataMergerMutator * mutator_,
ActionBlocker * merges_blocker_,
ActionBlocker * ttl_merges_blocker_)
{
@ -78,6 +79,7 @@ public:
global_ctx->deduplicate_by_columns = std::move(deduplicate_by_columns_);
global_ctx->parent_part = std::move(parent_part_);
global_ctx->data = std::move(data_);
global_ctx->mutator = std::move(mutator_);
global_ctx->merges_blocker = std::move(merges_blocker_);
global_ctx->ttl_merges_blocker = std::move(ttl_merges_blocker_);
@ -121,6 +123,7 @@ private:
std::unique_ptr<MergeListElement> projection_merge_list_element;
MergeListElement * merge_list_element_ptr{nullptr};
MergeTreeData * data{nullptr};
MergeTreeDataMergerMutator * mutator{nullptr};
ActionBlocker * merges_blocker{nullptr};
ActionBlocker * ttl_merges_blocker{nullptr};
StorageMetadataPtr metadata_snapshot{nullptr};

View File

@ -1343,7 +1343,7 @@ static bool isOldPartDirectory(const DiskPtr & disk, const String & directory_pa
}
void MergeTreeData::clearOldTemporaryDirectories(size_t custom_directories_lifetime_seconds)
void MergeTreeData::clearOldTemporaryDirectories(const MergeTreeDataMergerMutator & merger_mutator, size_t custom_directories_lifetime_seconds)
{
/// If the method is already called from another thread, then we don't need to do anything.
std::unique_lock lock(clear_old_temporary_directories_mutex, std::defer_lock);
@ -1359,35 +1359,44 @@ void MergeTreeData::clearOldTemporaryDirectories(size_t custom_directories_lifet
{
for (auto it = disk->iterateDirectory(path); it->isValid(); it->next())
{
if (startsWith(it->name(), "tmp_"))
const std::string & basename = it->name();
if (!startsWith(basename, "tmp_"))
{
try
continue;
}
const std::string & full_path = fullPath(disk, it->path());
if (merger_mutator.hasTemporaryPart(basename))
{
LOG_WARNING(log, "{} is an active destination for one of merge/mutation (consider increasing temporary_directories_lifetime setting)", full_path);
continue;
}
try
{
if (disk->isDirectory(it->path()) && isOldPartDirectory(disk, it->path(), deadline))
{
if (disk->isDirectory(it->path()) && isOldPartDirectory(disk, it->path(), deadline))
{
LOG_WARNING(log, "Removing temporary directory {}", fullPath(disk, it->path()));
disk->removeRecursive(it->path());
}
LOG_WARNING(log, "Removing temporary directory {}", full_path);
disk->removeRecursive(it->path());
}
/// see getModificationTime()
catch (const ErrnoException & e)
}
/// see getModificationTime()
catch (const ErrnoException & e)
{
if (e.getErrno() == ENOENT)
{
if (e.getErrno() == ENOENT)
{
/// If the file is already deleted, do nothing.
}
else
throw;
/// If the file is already deleted, do nothing.
}
catch (const fs::filesystem_error & e)
else
throw;
}
catch (const fs::filesystem_error & e)
{
if (e.code() == std::errc::no_such_file_or_directory)
{
if (e.code() == std::errc::no_such_file_or_directory)
{
/// If the file is already deleted, do nothing.
}
else
throw;
/// If the file is already deleted, do nothing.
}
else
throw;
}
}
}

View File

@ -39,6 +39,7 @@ namespace DB
class AlterCommands;
class MergeTreePartsMover;
class MergeTreeDataMergerMutator;
class MutationCommands;
class Context;
struct JobAndPool;
@ -536,7 +537,7 @@ public:
/// Delete all directories whose names begin with "tmp"
/// Must be called with locked lockForShare() because it's using relative_data_path.
void clearOldTemporaryDirectories(size_t custom_directories_lifetime_seconds);
void clearOldTemporaryDirectories(const MergeTreeDataMergerMutator & merger_mutator, size_t custom_directories_lifetime_seconds);
void clearEmptyParts();

View File

@ -443,6 +443,7 @@ MergeTaskPtr MergeTreeDataMergerMutator::mergePartsToTemporaryPart(
parent_part,
suffix,
&data,
this,
&merges_blocker,
&ttl_merges_blocker);
}
@ -773,4 +774,10 @@ ExecuteTTLType MergeTreeDataMergerMutator::shouldExecuteTTL(const StorageMetadat
}
bool MergeTreeDataMergerMutator::hasTemporaryPart(const std::string & basename) const
{
std::lock_guard lock(tmp_parts_lock);
return tmp_parts.contains(basename);
}
}

View File

@ -1,6 +1,7 @@
#pragma once
#include <atomic>
#include <mutex>
#include <functional>
#include <Common/ActionBlocker.h>
@ -136,6 +137,7 @@ private:
MergeTreeData::DataPartsVector selectAllPartsFromPartition(const String & partition_id);
friend class MutateTask;
friend class MergeTask;
/** Split mutation commands into two parts:
* First part should be executed by mutations interpreter.
@ -190,6 +192,26 @@ private:
ITTLMergeSelector::PartitionIdToTTLs next_recompress_ttl_merge_times_by_partition;
/// Performing TTL merges independently for each partition guarantees that
/// there is only a limited number of TTL merges and no partition stores data that is too stale
public:
/// Returns true if the passed part name is active
/// (i.e. it is the destination of one of the active merges/mutations).
///
/// NOTE: it accepts a basename (i.e. dirname), not a full path,
/// since the latter requires canonical form.
bool hasTemporaryPart(const std::string & basename) const;
private:
/// Set of active temporary paths that are used as destinations.
/// The list of such paths is required to avoid trying to remove them during cleanup.
///
/// NOTE: It is pretty short, so using STL is fine.
std::unordered_set<std::string> tmp_parts;
/// Lock for "tmp_parts".
///
/// NOTE: mutable is required so that hasTemporaryPart() can be marked const
mutable std::mutex tmp_parts_lock;
};
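
To illustrate the pattern introduced here outside of MergeTree internals, a self-contained sketch (names are illustrative, not the ClickHouse API): a merge registers the basename of its temporary directory for its lifetime, and the cleaner skips any registered name.

#include <mutex>
#include <string>
#include <unordered_set>

// Hypothetical minimal registry mirroring the tmp_parts/tmp_parts_lock usage above.
class TmpPartRegistry
{
public:
    void add(const std::string & basename)
    {
        std::lock_guard lock(mutex);
        parts.emplace(basename);
    }

    void remove(const std::string & basename)
    {
        std::lock_guard lock(mutex);
        parts.erase(basename);
    }

    /// The cleanup thread calls this before removing a "tmp_*" directory and skips it if true.
    bool contains(const std::string & basename) const
    {
        std::lock_guard lock(mutex);
        return parts.count(basename) > 0;
    }

private:
    /// mutable so that contains() can stay const, mirroring tmp_parts_lock above.
    mutable std::mutex mutex;
    std::unordered_set<std::string> parts;
};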

View File

@ -142,30 +142,6 @@ MergeTreeReadTaskPtr MergeTreeReadPool::getTask(const size_t min_marks_to_read,
prewhere_info && prewhere_info->remove_prewhere_column, per_part_should_reorder[part_idx], std::move(curr_task_size_predictor));
}
MarkRanges MergeTreeReadPool::getRestMarks(const IMergeTreeDataPart & part, const MarkRange & from) const
{
MarkRanges all_part_ranges;
/// Inefficient in presence of large number of data parts.
for (const auto & part_ranges : parts_ranges)
{
if (part_ranges.data_part.get() == &part)
{
all_part_ranges = part_ranges.ranges;
break;
}
}
if (all_part_ranges.empty())
throw Exception("Trying to read marks range [" + std::to_string(from.begin) + ", " + std::to_string(from.end) + "] from part '"
+ part.getFullPath() + "' which has no ranges in this query", ErrorCodes::LOGICAL_ERROR);
auto begin = std::lower_bound(all_part_ranges.begin(), all_part_ranges.end(), from, [] (const auto & f, const auto & s) { return f.begin < s.begin; });
if (begin == all_part_ranges.end())
begin = std::prev(all_part_ranges.end());
begin->begin = from.begin;
return MarkRanges(begin, all_part_ranges.end());
}
Block MergeTreeReadPool::getHeader() const
{
return metadata_snapshot->getSampleBlockForColumns(column_names, data.getVirtuals(), data.getStorageID());

View File

@ -85,9 +85,6 @@ public:
*/
void profileFeedback(const ReadBufferFromFileBase::ProfileInfo info);
/// This method tells which mark ranges we have to read if we start from @from mark range
MarkRanges getRestMarks(const IMergeTreeDataPart & part, const MarkRange & from) const;
Block getHeader() const;
private:

View File

@ -68,18 +68,16 @@ bool MergeTreeThreadSelectProcessor::getNewTask()
if (!reader)
{
auto rest_mark_ranges = pool->getRestMarks(*task->data_part, task->mark_ranges[0]);
if (use_uncompressed_cache)
owned_uncompressed_cache = storage.getContext()->getUncompressedCache();
owned_mark_cache = storage.getContext()->getMarkCache();
reader = task->data_part->getReader(task->columns, metadata_snapshot, rest_mark_ranges,
reader = task->data_part->getReader(task->columns, metadata_snapshot, task->mark_ranges,
owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings,
IMergeTreeReader::ValueSizeMap{}, profile_callback);
if (prewhere_info)
pre_reader = task->data_part->getReader(task->pre_columns, metadata_snapshot, rest_mark_ranges,
pre_reader = task->data_part->getReader(task->pre_columns, metadata_snapshot, task->mark_ranges,
owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings,
IMergeTreeReader::ValueSizeMap{}, profile_callback);
}
@ -88,14 +86,13 @@ bool MergeTreeThreadSelectProcessor::getNewTask()
/// otherwise we can reuse the readers; they will be "seeked" to the required mark anyway
if (part_name != last_readed_part_name)
{
auto rest_mark_ranges = pool->getRestMarks(*task->data_part, task->mark_ranges[0]);
/// retain avg_value_size_hints
reader = task->data_part->getReader(task->columns, metadata_snapshot, rest_mark_ranges,
reader = task->data_part->getReader(task->columns, metadata_snapshot, task->mark_ranges,
owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings,
reader->getAvgValueSizeHints(), profile_callback);
if (prewhere_info)
pre_reader = task->data_part->getReader(task->pre_columns, metadata_snapshot, rest_mark_ranges,
pre_reader = task->data_part->getReader(task->pre_columns, metadata_snapshot, task->mark_ranges,
owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings,
reader->getAvgValueSizeHints(), profile_callback);
}

View File

@ -62,7 +62,7 @@ void ReplicatedMergeTreeCleanupThread::iterate()
/// Both use relative_data_path which changes during rename, so we
/// do it under share lock
storage.clearOldWriteAheadLogs();
storage.clearOldTemporaryDirectories(storage.getSettings()->temporary_directories_lifetime.totalSeconds());
storage.clearOldTemporaryDirectories(storage.merger_mutator, storage.getSettings()->temporary_directories_lifetime.totalSeconds());
}
/// This is loose condition: no problem if we actually had lost leadership at this moment

View File

@ -105,7 +105,7 @@ void StorageMergeTree::startup()
/// Temporary directories contain incomplete results of merges (after forced restart)
/// and don't allow to reinitialize them, so delete each of them immediately
clearOldTemporaryDirectories(0);
clearOldTemporaryDirectories(merger_mutator, 0);
/// NOTE background task will also do the above cleanups periodically.
time_after_previous_cleanup_parts.restart();
@ -1063,7 +1063,7 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assign
assignee.scheduleCommonTask(ExecutableLambdaAdapter::create(
[this, share_lock] ()
{
clearOldTemporaryDirectories(getSettings()->temporary_directories_lifetime.totalSeconds());
clearOldTemporaryDirectories(merger_mutator, getSettings()->temporary_directories_lifetime.totalSeconds());
return true;
}, common_assignee_trigger, getStorageID()));
scheduled = true;

View File

@ -478,7 +478,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree(
}
/// Temporary directories contain uninitialized results of Merges or Fetches (after forced restart),
/// don't allow to reinitialize them, delete each of them immediately.
clearOldTemporaryDirectories(0);
clearOldTemporaryDirectories(merger_mutator, 0);
clearOldWriteAheadLogs();
}

View File

@ -109,7 +109,10 @@ def clickhouse_execute_json(base_args, query, timeout=30, settings=None):
data = clickhouse_execute_http(base_args, query, timeout, settings, 'JSONEachRow')
if not data:
return None
return json.loads(data)
rows = []
for row in data.strip().splitlines():
rows.append(json.loads(row))
return rows
class Terminated(KeyboardInterrupt):
@ -475,19 +478,19 @@ class TestCase:
if os.path.isfile(self.stdout_file):
description += ", result:\n\n"
description += '\n'.join(open(self.stdout_file).read().split('\n')[:100])
description += '\n'.join(open(self.stdout_file).read().splitlines()[:100])
description += '\n'
description += "\nstdout:\n{}\n".format(stdout)
return TestResult(self.name, TestStatus.FAIL, reason, total_time, description)
if stderr:
description += "\n{}\n".format('\n'.join(stderr.split('\n')[:100]))
description += "\n{}\n".format('\n'.join(stderr.splitlines()[:100]))
description += "\nstdout:\n{}\n".format(stdout)
return TestResult(self.name, TestStatus.FAIL, FailureReason.STDERR, total_time, description)
if 'Exception' in stdout:
description += "\n{}\n".format('\n'.join(stdout.split('\n')[:100]))
description += "\n{}\n".format('\n'.join(stdout.splitlines()[:100]))
return TestResult(self.name, TestStatus.FAIL, FailureReason.EXCEPTION, total_time, description)
if '@@SKIP@@' in stdout:
@ -1392,7 +1395,6 @@ if __name__ == '__main__':
http_port = os.getenv("CLICKHOUSE_PORT_HTTP")
if http_port is not None:
args.http_port = int(http_port)
args.client += f" --port={http_port}"
else:
args.http_port = 8123

View File

@ -0,0 +1,126 @@
<test>
<create_query>
CREATE TABLE simple_key_hashed_array_dictionary_source_table
(
id UInt64,
value_int UInt64,
value_string String,
value_decimal Decimal64(8),
value_string_nullable Nullable(String)
) ENGINE = Memory;
</create_query>
<create_query>
CREATE TABLE complex_key_hashed_array_dictionary_source_table
(
id UInt64,
id_key String,
value_int UInt64,
value_string String,
value_decimal Decimal64(8),
value_string_nullable Nullable(String)
) ENGINE = Memory;
</create_query>
<create_query>
CREATE DICTIONARY simple_key_hashed_array_dictionary
(
id UInt64,
value_int UInt64,
value_string String,
value_decimal Decimal64(8),
value_string_nullable Nullable(String)
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(DB 'default' TABLE 'simple_key_hashed_array_dictionary_source_table'))
LAYOUT(HASHED_ARRAY())
LIFETIME(MIN 0 MAX 1000);
</create_query>
<create_query>
CREATE DICTIONARY complex_key_hashed_array_dictionary
(
id UInt64,
id_key String,
value_int UInt64,
value_string String,
value_decimal Decimal64(8),
value_string_nullable Nullable(String)
)
PRIMARY KEY id, id_key
SOURCE(CLICKHOUSE(DB 'default' TABLE 'complex_key_hashed_array_dictionary_source_table'))
LAYOUT(COMPLEX_KEY_HASHED_ARRAY())
LIFETIME(MIN 0 MAX 1000);
</create_query>
<fill_query>
INSERT INTO simple_key_hashed_array_dictionary_source_table
SELECT number, number, toString(number), toDecimal64(number, 8), toString(number)
FROM system.numbers
LIMIT 5000000;
</fill_query>
<fill_query>
INSERT INTO complex_key_hashed_array_dictionary_source_table
SELECT number, toString(number), number, toString(number), toDecimal64(number, 8), toString(number)
FROM system.numbers
LIMIT 5000000;
</fill_query>
<substitutions>
<substitution>
<name>column_name</name>
<values>
<value>'value_int'</value>
<value>'value_string'</value>
<value>'value_decimal'</value>
<value>'value_string_nullable'</value>
</values>
</substitution>
<substitution>
<name>elements_count</name>
<values>
<value>5000000</value>
<value>7500000</value>
</values>
</substitution>
</substitutions>
<query>
WITH rand64() % toUInt64({elements_count}) as key
SELECT dictGet('default.simple_key_hashed_array_dictionary', {column_name}, key)
FROM system.numbers
LIMIT {elements_count}
FORMAT Null;
</query>
<query>
WITH rand64() % toUInt64({elements_count}) as key
SELECT dictHas('default.simple_key_hashed_array_dictionary', key)
FROM system.numbers
LIMIT {elements_count}
FORMAT Null;
</query>
<query>
WITH (rand64() % toUInt64({elements_count}), toString(rand64() % toUInt64({elements_count}))) as key
SELECT dictGet('default.complex_key_hashed_array_dictionary', {column_name}, key)
FROM system.numbers
LIMIT {elements_count}
FORMAT Null;
</query>
<query>
WITH (rand64() % toUInt64({elements_count}), toString(rand64() % toUInt64({elements_count}))) as key
SELECT dictHas('default.complex_key_hashed_array_dictionary', key)
FROM system.numbers
LIMIT {elements_count}
FORMAT Null;
</query>
<drop_query>DROP TABLE IF EXISTS simple_key_hashed_array_dictionary_source_table;</drop_query>
<drop_query>DROP TABLE IF EXISTS complex_key_hashed_array_dictionary_source_table;</drop_query>
<drop_query>DROP DICTIONARY IF EXISTS simple_key_hashed_array_dictionary;</drop_query>
<drop_query>DROP DICTIONARY IF EXISTS complex_key_hashed_array_dictionary;</drop_query>
</test>

View File

@ -0,0 +1,15 @@
execute: default
"foo"
1
execute: --stage fetch_columns
"dummy"
0
execute: --stage with_mergeable_state
"1"
1
execute: --stage with_mergeable_state_after_aggregation
"1"
1
execute: --stage complete
"foo"
1

View File

@ -0,0 +1,22 @@
#!/usr/bin/env bash
# Tags: no-fasttest
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
function execute_query()
{
if [ $# -eq 0 ]; then
echo "execute: default"
else
echo "execute: $*"
fi
${CLICKHOUSE_LOCAL} "$@" --format CSVWithNames -q "SELECT 1 AS foo"
}
execute_query # default -- complete
execute_query --stage fetch_columns
execute_query --stage with_mergeable_state
execute_query --stage with_mergeable_state_after_aggregation
execute_query --stage complete

View File

@ -0,0 +1,60 @@
0 0
1 1
2 2
3 3
4 40
5 50
6 60
7 70
8 800
9 900
10 1000
11 1100
12 12000
13 13000
14 14000
15 15000
16 160000
17 170000
18 180000
19 190000
0 0
1 1
2 2
3 3
4 40
5 50
6 60
7 70
8 80000
9 90000
10 100000
11 110000
12 120000
13 130000
14 140000
15 150000
16 160000
17 170000
18 180000
19 190000
0 0
1 1
2 2
3 3
4 40
5 50
6 60
7 70
8 800
9 900
10 1000
11 1100
12 12000
13 13000
14 14000
15 15000
16 160000
17 170000
18 180000
19 190000

View File

@ -0,0 +1,45 @@
-- https://github.com/ClickHouse/ClickHouse/issues/30231
SELECT *
FROM (
SELECT number,
multiIf(
CAST(number < 4, 'UInt8'), toString(number),
CAST(number < 8, 'LowCardinality(UInt8)'), toString(number * 10),
CAST(number < 12, 'Nullable(UInt8)'), toString(number * 100),
CAST(number < 16, 'LowCardinality(Nullable(UInt8))'), toString(number * 1000),
toString(number * 10000)) as m
FROM system.numbers
LIMIT 20
)
ORDER BY number
SETTINGS short_circuit_function_evaluation='enable';
SELECT *
FROM (
SELECT number,
multiIf(
CAST(number < 4, 'UInt8'), toString(number),
CAST(number < 8, 'LowCardinality(UInt8)'), toString(number * 10),
CAST(NULL, 'Nullable(UInt8)'), toString(number * 100),
CAST(NULL, 'LowCardinality(Nullable(UInt8))'), toString(number * 1000),
toString(number * 10000)) as m
FROM system.numbers
LIMIT 20
)
ORDER BY number
SETTINGS short_circuit_function_evaluation='enable';
SELECT *
FROM (
SELECT number,
multiIf(
CAST(number < 4, 'UInt8'), toString(number),
CAST(number < 8, 'LowCardinality(UInt8)'), toString(number * 10)::LowCardinality(String),
CAST(number < 12, 'Nullable(UInt8)'), toString(number * 100)::Nullable(String),
CAST(number < 16, 'LowCardinality(Nullable(UInt8))'), toString(number * 1000)::LowCardinality(Nullable(String)),
toString(number * 10000)) as m
FROM system.numbers
LIMIT 20
)
ORDER BY number
SETTINGS short_circuit_function_evaluation='enable';

View File

@ -0,0 +1,66 @@
Dictionary hashed_array_dictionary_simple_key_simple_attributes
dictGet existing value
value_0 value_second_0
value_1 value_second_1
value_2 value_second_2
dictGet with non existing value
value_0 value_second_0
value_1 value_second_1
value_2 value_second_2
value_first_default value_second_default
dictGetOrDefault existing value
value_0 value_second_0
value_1 value_second_1
value_2 value_second_2
dictGetOrDefault non existing value
value_0 value_second_0
value_1 value_second_1
value_2 value_second_2
default default
dictHas
1
1
1
0
select all values as input stream
0 value_0 value_second_0
1 value_1 value_second_1
2 value_2 value_second_2
Dictionary hashed_array_dictionary_simple_key_complex_attributes
dictGet existing value
value_0 value_second_0
value_1 \N
value_2 value_second_2
dictGet with non existing value
value_0 value_second_0
value_1 \N
value_2 value_second_2
value_first_default value_second_default
dictGetOrDefault existing value
value_0 value_second_0
value_1 \N
value_2 value_second_2
dictGetOrDefault non existing value
value_0 value_second_0
value_1 \N
value_2 value_second_2
default default
dictHas
1
1
1
0
select all values as input stream
0 value_0 value_second_0
1 value_1 \N
2 value_2 value_second_2
Dictionary hashed_array_dictionary_simple_key_hierarchy
dictGet
0
0
1
1
2
dictGetHierarchy
[1]
[4,2,1]

View File

@ -0,0 +1,125 @@
DROP TABLE IF EXISTS simple_key_simple_attributes_source_table;
CREATE TABLE simple_key_simple_attributes_source_table
(
id UInt64,
value_first String,
value_second String
)
ENGINE = TinyLog;
INSERT INTO simple_key_simple_attributes_source_table VALUES(0, 'value_0', 'value_second_0');
INSERT INTO simple_key_simple_attributes_source_table VALUES(1, 'value_1', 'value_second_1');
INSERT INTO simple_key_simple_attributes_source_table VALUES(2, 'value_2', 'value_second_2');
DROP DICTIONARY IF EXISTS hashed_array_dictionary_simple_key_simple_attributes;
CREATE DICTIONARY hashed_array_dictionary_simple_key_simple_attributes
(
id UInt64,
value_first String DEFAULT 'value_first_default',
value_second String DEFAULT 'value_second_default'
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(TABLE 'simple_key_simple_attributes_source_table'))
LAYOUT(HASHED_ARRAY())
LIFETIME(MIN 1 MAX 1000);
SELECT 'Dictionary hashed_array_dictionary_simple_key_simple_attributes';
SELECT 'dictGet existing value';
SELECT dictGet('hashed_array_dictionary_simple_key_simple_attributes', 'value_first', number) as value_first,
dictGet('hashed_array_dictionary_simple_key_simple_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 3;
SELECT 'dictGet with non existing value';
SELECT dictGet('hashed_array_dictionary_simple_key_simple_attributes', 'value_first', number) as value_first,
dictGet('hashed_array_dictionary_simple_key_simple_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 4;
SELECT 'dictGetOrDefault existing value';
SELECT dictGetOrDefault('hashed_array_dictionary_simple_key_simple_attributes', 'value_first', number, toString('default')) as value_first,
dictGetOrDefault('hashed_array_dictionary_simple_key_simple_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 3;
SELECT 'dictGetOrDefault non existing value';
SELECT dictGetOrDefault('hashed_array_dictionary_simple_key_simple_attributes', 'value_first', number, toString('default')) as value_first,
dictGetOrDefault('hashed_array_dictionary_simple_key_simple_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 4;
SELECT 'dictHas';
SELECT dictHas('hashed_array_dictionary_simple_key_simple_attributes', number) FROM system.numbers LIMIT 4;
SELECT 'select all values as input stream';
SELECT * FROM hashed_array_dictionary_simple_key_simple_attributes ORDER BY id;
DROP DICTIONARY hashed_array_dictionary_simple_key_simple_attributes;
DROP TABLE simple_key_simple_attributes_source_table;
DROP TABLE IF EXISTS simple_key_complex_attributes_source_table;
CREATE TABLE simple_key_complex_attributes_source_table
(
id UInt64,
value_first String,
value_second Nullable(String)
)
ENGINE = TinyLog;
INSERT INTO simple_key_complex_attributes_source_table VALUES(0, 'value_0', 'value_second_0');
INSERT INTO simple_key_complex_attributes_source_table VALUES(1, 'value_1', NULL);
INSERT INTO simple_key_complex_attributes_source_table VALUES(2, 'value_2', 'value_second_2');
DROP DICTIONARY IF EXISTS hashed_array_dictionary_simple_key_complex_attributes;
CREATE DICTIONARY hashed_array_dictionary_simple_key_complex_attributes
(
id UInt64,
value_first String DEFAULT 'value_first_default',
value_second Nullable(String) DEFAULT 'value_second_default'
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(TABLE 'simple_key_complex_attributes_source_table'))
LAYOUT(HASHED_ARRAY())
LIFETIME(MIN 1 MAX 1000);
SELECT 'Dictionary hashed_array_dictionary_simple_key_complex_attributes';
SELECT 'dictGet existing value';
SELECT dictGet('hashed_array_dictionary_simple_key_complex_attributes', 'value_first', number) as value_first,
dictGet('hashed_array_dictionary_simple_key_complex_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 3;
SELECT 'dictGet with non existing value';
SELECT dictGet('hashed_array_dictionary_simple_key_complex_attributes', 'value_first', number) as value_first,
dictGet('hashed_array_dictionary_simple_key_complex_attributes', 'value_second', number) as value_second FROM system.numbers LIMIT 4;
SELECT 'dictGetOrDefault existing value';
SELECT dictGetOrDefault('hashed_array_dictionary_simple_key_complex_attributes', 'value_first', number, toString('default')) as value_first,
dictGetOrDefault('hashed_array_dictionary_simple_key_complex_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 3;
SELECT 'dictGetOrDefault non existing value';
SELECT dictGetOrDefault('hashed_array_dictionary_simple_key_complex_attributes', 'value_first', number, toString('default')) as value_first,
dictGetOrDefault('hashed_array_dictionary_simple_key_complex_attributes', 'value_second', number, toString('default')) as value_second FROM system.numbers LIMIT 4;
SELECT 'dictHas';
SELECT dictHas('hashed_array_dictionary_simple_key_complex_attributes', number) FROM system.numbers LIMIT 4;
SELECT 'select all values as input stream';
SELECT * FROM hashed_array_dictionary_simple_key_complex_attributes ORDER BY id;
DROP DICTIONARY hashed_array_dictionary_simple_key_complex_attributes;
DROP TABLE simple_key_complex_attributes_source_table;
DROP TABLE IF EXISTS simple_key_hierarchy_table;
CREATE TABLE simple_key_hierarchy_table
(
id UInt64,
parent_id UInt64
) ENGINE = TinyLog();
INSERT INTO simple_key_hierarchy_table VALUES (1, 0);
INSERT INTO simple_key_hierarchy_table VALUES (2, 1);
INSERT INTO simple_key_hierarchy_table VALUES (3, 1);
INSERT INTO simple_key_hierarchy_table VALUES (4, 2);
DROP DICTIONARY IF EXISTS hashed_array_dictionary_simple_key_hierarchy;
CREATE DICTIONARY hashed_array_dictionary_simple_key_hierarchy
(
id UInt64,
parent_id UInt64 HIERARCHICAL
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_hierarchy_table'))
LAYOUT(HASHED_ARRAY())
LIFETIME(MIN 1 MAX 1000);
SELECT 'Dictionary hashed_array_dictionary_simple_key_hierarchy';
SELECT 'dictGet';
SELECT dictGet('hashed_array_dictionary_simple_key_hierarchy', 'parent_id', number) FROM system.numbers LIMIT 5;
SELECT 'dictGetHierarchy';
SELECT dictGetHierarchy('hashed_array_dictionary_simple_key_hierarchy', toUInt64(1));
SELECT dictGetHierarchy('hashed_array_dictionary_simple_key_hierarchy', toUInt64(4));
DROP DICTIONARY hashed_array_dictionary_simple_key_hierarchy;
DROP TABLE simple_key_hierarchy_table;

View File

@ -0,0 +1,56 @@
Dictionary hashed_array_dictionary_complex_key_simple_attributes
dictGet existing value
value_0 value_second_0
value_1 value_second_1
value_2 value_second_2
dictGet with non existing value
value_0 value_second_0
value_1 value_second_1
value_2 value_second_2
value_first_default value_second_default
dictGetOrDefault existing value
value_0 value_second_0
value_1 value_second_1
value_2 value_second_2
dictGetOrDefault non existing value
value_0 value_second_0
value_1 value_second_1
value_2 value_second_2
default default
dictHas
1
1
1
0
select all values as input stream
0 id_key_0 value_0 value_second_0
1 id_key_1 value_1 value_second_1
2 id_key_2 value_2 value_second_2
Dictionary hashed_array_dictionary_complex_key_complex_attributes
dictGet existing value
value_0 value_second_0
value_1 \N
value_2 value_second_2
dictGet with non existing value
value_0 value_second_0
value_1 \N
value_2 value_second_2
value_first_default value_second_default
dictGetOrDefault existing value
value_0 value_second_0
value_1 \N
value_2 value_second_2
dictGetOrDefault non existing value
value_0 value_second_0
value_1 \N
value_2 value_second_2
default default
dictHas
1
1
1
0
select all values as input stream
0 id_key_0 value_0 value_second_0
1 id_key_1 value_1 \N
2 id_key_2 value_2 value_second_2

View File

@ -0,0 +1,97 @@
DROP TABLE IF EXISTS complex_key_simple_attributes_source_table;
CREATE TABLE complex_key_simple_attributes_source_table
(
id UInt64,
id_key String,
value_first String,
value_second String
)
ENGINE = TinyLog;
INSERT INTO complex_key_simple_attributes_source_table VALUES(0, 'id_key_0', 'value_0', 'value_second_0');
INSERT INTO complex_key_simple_attributes_source_table VALUES(1, 'id_key_1', 'value_1', 'value_second_1');
INSERT INTO complex_key_simple_attributes_source_table VALUES(2, 'id_key_2', 'value_2', 'value_second_2');
DROP DICTIONARY IF EXISTS hashed_array_dictionary_complex_key_simple_attributes;
CREATE DICTIONARY hashed_array_dictionary_complex_key_simple_attributes
(
id UInt64,
id_key String,
value_first String DEFAULT 'value_first_default',
value_second String DEFAULT 'value_second_default'
)
PRIMARY KEY id, id_key
SOURCE(CLICKHOUSE(TABLE 'complex_key_simple_attributes_source_table'))
LIFETIME(MIN 1 MAX 1000)
LAYOUT(COMPLEX_KEY_HASHED_ARRAY());
SELECT 'Dictionary hashed_array_dictionary_complex_key_simple_attributes';
SELECT 'dictGet existing value';
SELECT dictGet('hashed_array_dictionary_complex_key_simple_attributes', 'value_first', (number, concat('id_key_', toString(number)))) as value_first,
dictGet('hashed_array_dictionary_complex_key_simple_attributes', 'value_second', (number, concat('id_key_', toString(number)))) as value_second FROM system.numbers LIMIT 3;
SELECT 'dictGet with non existing value';
SELECT dictGet('hashed_array_dictionary_complex_key_simple_attributes', 'value_first', (number, concat('id_key_', toString(number)))) as value_first,
dictGet('hashed_array_dictionary_complex_key_simple_attributes', 'value_second', (number, concat('id_key_', toString(number)))) as value_second FROM system.numbers LIMIT 4;
SELECT 'dictGetOrDefault existing value';
SELECT dictGetOrDefault('hashed_array_dictionary_complex_key_simple_attributes', 'value_first', (number, concat('id_key_', toString(number))), toString('default')) as value_first,
dictGetOrDefault('hashed_array_dictionary_complex_key_simple_attributes', 'value_second', (number, concat('id_key_', toString(number))), toString('default')) as value_second FROM system.numbers LIMIT 3;
SELECT 'dictGetOrDefault non existing value';
SELECT dictGetOrDefault('hashed_array_dictionary_complex_key_simple_attributes', 'value_first', (number, concat('id_key_', toString(number))), toString('default')) as value_first,
dictGetOrDefault('hashed_array_dictionary_complex_key_simple_attributes', 'value_second', (number, concat('id_key_', toString(number))), toString('default')) as value_second FROM system.numbers LIMIT 4;
SELECT 'dictHas';
SELECT dictHas('hashed_array_dictionary_complex_key_simple_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4;
SELECT 'select all values as input stream';
SELECT * FROM hashed_array_dictionary_complex_key_simple_attributes ORDER BY (id, id_key);
DROP DICTIONARY hashed_array_dictionary_complex_key_simple_attributes;
DROP TABLE complex_key_simple_attributes_source_table;
DROP TABLE IF EXISTS complex_key_complex_attributes_source_table;
CREATE TABLE complex_key_complex_attributes_source_table
(
id UInt64,
id_key String,
value_first String,
value_second Nullable(String)
)
ENGINE = TinyLog;
INSERT INTO complex_key_complex_attributes_source_table VALUES(0, 'id_key_0', 'value_0', 'value_second_0');
INSERT INTO complex_key_complex_attributes_source_table VALUES(1, 'id_key_1', 'value_1', NULL);
INSERT INTO complex_key_complex_attributes_source_table VALUES(2, 'id_key_2', 'value_2', 'value_second_2');
DROP DICTIONARY IF EXISTS hashed_array_dictionary_complex_key_complex_attributes;
CREATE DICTIONARY hashed_array_dictionary_complex_key_complex_attributes
(
id UInt64,
id_key String,
value_first String DEFAULT 'value_first_default',
value_second Nullable(String) DEFAULT 'value_second_default'
)
PRIMARY KEY id, id_key
SOURCE(CLICKHOUSE(TABLE 'complex_key_complex_attributes_source_table'))
LIFETIME(MIN 1 MAX 1000)
LAYOUT(COMPLEX_KEY_HASHED_ARRAY());
SELECT 'Dictionary hashed_array_dictionary_complex_key_complex_attributes';
SELECT 'dictGet existing value';
SELECT dictGet('hashed_array_dictionary_complex_key_complex_attributes', 'value_first', (number, concat('id_key_', toString(number)))) as value_first,
dictGet('hashed_array_dictionary_complex_key_complex_attributes', 'value_second', (number, concat('id_key_', toString(number)))) as value_second FROM system.numbers LIMIT 3;
SELECT 'dictGet with non existing value';
SELECT dictGet('hashed_array_dictionary_complex_key_complex_attributes', 'value_first', (number, concat('id_key_', toString(number)))) as value_first,
dictGet('hashed_array_dictionary_complex_key_complex_attributes', 'value_second', (number, concat('id_key_', toString(number)))) as value_second FROM system.numbers LIMIT 4;
SELECT 'dictGetOrDefault existing value';
SELECT dictGetOrDefault('hashed_array_dictionary_complex_key_complex_attributes', 'value_first', (number, concat('id_key_', toString(number))), toString('default')) as value_first,
dictGetOrDefault('hashed_array_dictionary_complex_key_complex_attributes', 'value_second', (number, concat('id_key_', toString(number))), toString('default')) as value_second FROM system.numbers LIMIT 3;
SELECT 'dictGetOrDefault non existing value';
SELECT dictGetOrDefault('hashed_array_dictionary_complex_key_complex_attributes', 'value_first', (number, concat('id_key_', toString(number))), toString('default')) as value_first,
dictGetOrDefault('hashed_array_dictionary_complex_key_complex_attributes', 'value_second', (number, concat('id_key_', toString(number))), toString('default')) as value_second FROM system.numbers LIMIT 4;
SELECT 'dictHas';
SELECT dictHas('hashed_array_dictionary_complex_key_complex_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4;
SELECT 'select all values as input stream';
SELECT * FROM hashed_array_dictionary_complex_key_complex_attributes ORDER BY (id, id_key);
DROP DICTIONARY hashed_array_dictionary_complex_key_complex_attributes;
DROP TABLE complex_key_complex_attributes_source_table;

View File

@ -0,0 +1,2 @@
468426149779992039
1

View File

@ -0,0 +1,7 @@
SELECT sum(cityHash64(*)) FROM test.hits SETTINGS max_threads=40;
-- We had a bug which led to additional compressed data being read. test.hits compressed size is about 1.2Gb, but we read more than 3Gb.
-- Small additional reads are still possible, so we compare with about 1.5Gb.
SYSTEM FLUSH LOGS;
SELECT ProfileEvents['ReadBufferFromFileDescriptorReadBytes'] < 1500000000 from system.query_log where query = 'SELECT sum(cityHash64(*)) FROM test.hits SETTINGS max_threads=40;' and current_database = currentDatabase() and type = 'QueryFinish';